annotate PsiCLASS-1.0.2/AddXS.cpp @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
1 #include <stdio.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
2 #include <stdint.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
3
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
4 #include <string.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
5 #include <stdlib.h>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
6 #include <vector>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
7 #include <map>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
8 #include <fstream>
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
9
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
10 char usage[] =
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
11 "./addXS ref.fa [OPTIONS] < in.sam > out.sam\n"
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
12 "From bam to bam: samtools view -h in.bam | ./addXS ref.fa | samtools view -bS - > out.bam\n" ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
13 char samLine[65537] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
14 char seq[65537] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
15
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
16 char nucToNum[26] = { 0, -1, 1, -1, -1, -1, 2,
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
17 -1, -1, -1, -1, -1, -1, 0, // Regard 'N' as 'A'
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
18 -1, -1, -1, -1, -1, 3,
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
19 -1, -1, -1, -1, -1, -1 } ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
20 char numToNuc[26] = {'A', 'C', 'G', 'T'} ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
21
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
22 struct _pair
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
23 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
24 int a, b ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
25 } ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
26
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
27 class BitSequence
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
28 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
29 private:
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
30 int len ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
31 //std::vector<uint32_t> sequence ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
32 uint32_t *sequence ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
33
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
34 public:
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
35 BitSequence() { len = 0 ; sequence = NULL ;}
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
36 BitSequence( int l )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
37 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
38 len = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
39 sequence = new uint32_t[ l / 16 + 1 ] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
40 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
41
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
42 ~BitSequence()
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
43 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
44 //if ( sequence != NULL )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
45 // delete[] sequence ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
46 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
47
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
48 void SetLength( int l )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
49 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
50 len = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
51 if ( sequence != NULL )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
52 delete[] sequence ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
53 sequence = new uint32_t[ l / 16 + 1 ] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
54 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
55
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
56 int GetLength()
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
57 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
58 return len ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
59 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
60
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
61 void Append( char c )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
62 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
63 if ( ( len & 15 ) == 0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
64 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
65 sequence[ len / 16 ] = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
66 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
67 ++len ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
68 //printf( "%d %c\n", len, c ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
69 Set( c, len - 1 ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
70 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
71
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
72 // pos is 0-based coordinate
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
73 // notice that the order within one 32 bit butcket is reversed
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
74 void Set( char c, int pos )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
75 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
76 if ( pos >= len )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
77 return ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
78
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
79 if ( c >= 'a' && c <= 'z' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
80 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
81 c = c - 'a' + 'A' ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
82 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
83 if ( c == 'N' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
84 c = 'A' ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
85
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
86 int ind = pos >> 4 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
87 int offset = pos & 15 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
88 int mask = ( (int)( nucToNum[c - 'A'] & 3 ) ) << ( 2 * offset ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
89 sequence[ind] = sequence[ind] | mask ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
90 //if ( c != 'A' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
91 // printf( "%d: %c %c %d %d : %d\n", pos, c, Get(pos), ind, offset, mask ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
92 //Print() ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
93 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
94
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
95 char Get( int pos )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
96 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
97 if ( pos >= len )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
98 return 'N' ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
99
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
100 int ind = pos >> 4 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
101 int offset = pos & 15 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
102 //printf( "%d: %d\n", pos, sequence[ind] ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
103 return numToNuc[ ( ( sequence[ind] >> ( 2 * offset ) ) & 3 ) ] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
104 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
105
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
106 void Print()
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
107 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
108 int i ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
109 for ( i = 0 ; i < len ; ++i )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
110 printf( "%c", Get( i ) ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
111 printf( "\n" ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
112 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
113
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
114 void Print( FILE *fp, int start, int end, bool rc )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
115 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
116 if ( !rc )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
117 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
118 for ( int i = start ; i <= end ; ++i )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
119 fprintf( fp, "%c", Get( i ) ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
120 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
121 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
122 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
123 for ( int i = end ; i >= start ; --i )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
124 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
125 char c = Get( i ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
126 if ( c == 'A' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
127 c = 'T' ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
128 else if ( c == 'C' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
129 c = 'G' ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
130 else if ( c == 'G' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
131 c = 'C' ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
132 else //if ( c == 'T' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
133 c = 'A' ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
134 fprintf( fp, "%c", c ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
135 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
136 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
137 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
138 } ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
139
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
140 void ReverseComplement( char *s, int l )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
141 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
142 int u, v ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
143 for ( u = 0, v = l - 1 ; u <= v ; ++u, --v )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
144 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
145 char tmp ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
146 tmp = s[v] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
147 s[v] = s[u] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
148 s[u] = tmp ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
149
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
150 s[u] = numToNuc[ 3 - nucToNum[ s[u] - 'A' ] ] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
151 s[v] = numToNuc[ 3 - nucToNum[ s[v] - 'A' ] ] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
152 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
153 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
154
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
155
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
156 int main( int argc, char *argv[] )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
157 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
158 int i, j, k ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
159
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
160 std::vector<BitSequence> genome ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
161 std::map<std::string, int> chrToChrId ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
162
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
163 char readid[200], chrom[50], mapq[10], cigar[1000], mateChrom[50] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
164 char buffer[1024] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
165 char pattern[1024] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
166 int start, mstart, flag, tlen ; // read start and mate read start
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
167 int seqLen ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
168 int chrCnt = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
169 int chrId ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
170 int width = 7 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
171 int score ; // 0-bit: multiple aligned, 1-bit can shift the intron, 2-bit can shift the alignment, 3-bit contain sequencing error
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
172
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
173 if ( argc < 2 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
174 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
175 printf( "%s", usage ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
176 return 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
177 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
178
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
179
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
180 std::ifstream fpRef ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
181 fpRef.open( argv[1] ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
182 std::string line ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
183
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
184 int motifStrand[1024] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
185
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
186 /*motifStrand[ 0b10110010 ] = 1 ; // GT/AG
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
187 motifStrand[ 0b01110001 ] = 0 ; // CT/AC
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
188 motifStrand[ 0b10010010 ] = 1 ;// GC/AG
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
189 motifStrand[ 0b01111001 ] = 0 ; // CT/GC
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
190 motifStrand[ 0b00110001 ] = 1 ; // AT/AC
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
191 motifStrand[ 0b10110011 ] = 0 ; // GT/AT*/
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
192 memset( motifStrand, -1, sizeof( motifStrand )) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
193 motifStrand[ 0xb2 ] = 1 ; // GT/AG
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
194 motifStrand[ 0x71 ] = 0 ; // CT/AC
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
195 motifStrand[ 0x92 ] = 1 ;// GC/AG
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
196 motifStrand[ 0x79 ] = 0 ; // CT/GC
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
197 motifStrand[ 0x31 ] = 1 ; // AT/AC
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
198 motifStrand[ 0xb3 ] = 0 ; // GT/AT
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
199
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
200 k = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
201 while ( getline( fpRef, line ) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
202 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
203 //printf( "%s\n", line.c_str() ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
204 int len = line.length() ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
205 if ( line[0] == '>' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
206 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
207 //char *s = strdup( line.c_str() + 1 ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
208 if ( chrCnt > 0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
209 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
210 genome[ chrCnt - 1 ].SetLength( k ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
211 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
212 for ( i = 1 ; i < len ; ++i )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
213 if ( line[i] == ' ' || line[i] == '\t' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
214 break ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
215 chrToChrId[ line.substr( 1, i - 1 ) ] = chrCnt ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
216 ++chrCnt ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
217
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
218 BitSequence bs ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
219 genome.push_back( bs ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
220
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
221 k = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
222 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
223 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
224 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
225 k += len ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
226 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
227 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
228 genome[ chrCnt - 1 ].SetLength( k ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
229 fpRef.close() ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
230
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
231 fpRef.open( argv[1] ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
232 while ( getline( fpRef, line ) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
233 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
234 //printf( "%s\n", line.c_str() ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
235 int len = line.length() ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
236 if ( line[0] == '>' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
237 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
238 //char *s = strdup( line.c_str() + 1 ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
239 for ( i = 1 ; i < len ; ++i )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
240 if ( line[i] == ' ' || line[i] == '\t' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
241 break ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
242 chrId = chrToChrId[ line.substr( 1, i - 1 ) ] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
243 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
244 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
245 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
246 BitSequence &bs = genome[chrId] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
247 for ( i = 0 ; i < len ; ++i )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
248 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
249 if ( ( line[i] >= 'A' && line[i] <= 'Z' ) ||
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
250 ( line[i] >= 'a' && line[i] <= 'z' ) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
251 bs.Append( line[i] ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
252 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
253 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
254 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
255 fpRef.close() ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
256
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
257
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
258
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
259 while ( fgets( samLine, sizeof( samLine ), stdin ) != NULL )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
260 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
261 if ( samLine[0] == '@' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
262 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
263 printf( "%s", samLine ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
264 continue ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
265 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
266
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
267 sscanf( samLine, "%s %d %s %d %s %s %s %d %d %s", readid, &flag, chrom, &start, mapq, cigar, mateChrom, &mstart, &tlen, seq ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
268 struct _pair segments[1000] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
269 struct _pair seqSegments[1000] ; // the segments with respect to the sequence.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
270 int segCnt = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
271 int seqSegCnt = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
272 int num = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
273 int len = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
274
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
275 score = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
276
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
277 for ( i = 0 ; cigar[i] ; ++i )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
278 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
279 if ( cigar[i] >= '0' && cigar[i] <= '9' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
280 num = num * 10 + cigar[i] - '0' ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
281 else if ( cigar[i] == 'I' || cigar[i] == 'S' || cigar[i] == 'H'
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
282 || cigar[i] == 'P' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
283 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
284 num = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
285 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
286 else if ( cigar[i] == 'N' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
287 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
288 segments[segCnt].a = start ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
289 segments[segCnt].b = start + len - 1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
290 ++segCnt ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
291
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
292 start = start + len + num ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
293 len = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
294 num = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
295 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
296 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
297 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
298 len += num ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
299 num = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
300 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
301 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
302
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
303
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
304 if ( len > 0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
305 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
306 segments[segCnt].a = start ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
307 segments[segCnt].b = start + len - 1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
308 ++segCnt ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
309 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
310
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
311 if ( segCnt <= 1 || strstr( samLine, "XS:A:" ) != NULL )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
312 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
313 printf( "%s", samLine ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
314 continue ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
315 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
316
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
317 std::string schr( chrom ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
318 int chrId = chrToChrId[schr] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
319 int strand = -1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
320
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
321 len = strlen( samLine ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
322 samLine[len - 1] = '\0' ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
323 int motif = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
324
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
325 BitSequence &chrSeq = genome[ chrId ] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
326
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
327 for ( i = 0 ; i < segCnt - 1 ; ++i )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
328 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
329 char m[4] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
330 m[0] = chrSeq.Get( segments[i].b + 1 - 1 ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
331 m[1] = chrSeq.Get( segments[i].b + 2 - 1 ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
332 m[2] = chrSeq.Get( segments[i + 1].a - 2 - 1 ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
333 m[3] = chrSeq.Get( segments[i + 1].a - 1 - 1 ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
334 motif = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
335 for ( j = 0 ; j < 4 ; ++j )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
336 motif = ( motif << 2 ) + ( nucToNum[ m[j] - 'A' ] & 3 );
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
337 strand = motifStrand[ motif ] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
338 if ( strand != -1 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
339 break ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
340 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
341 if ( strand == 1 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
342 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
343 printf( "%s\tXS:A:+\n", samLine ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
344 continue ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
345 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
346 else if ( strand == 0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
347 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
348 printf( "%s\tXS:A:-\n", samLine ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
349 continue ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
350 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
351
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
352 char *p ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
353 if ( ( p = strstr( samLine, "NH:i:" ) ) != NULL )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
354 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
355 p += 5 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
356 num = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
357 while ( *p >= '0' && *p <= '9' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
358 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
359 num = num * 10 + *p - '0' ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
360 ++p ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
361 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
362 if ( num > 1 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
363 score |= 1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
364 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
365
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
366 seqLen = strlen( seq ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
367 for ( i = 0 ; i < seqLen ; ++i )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
368 if ( seq[i] == 'N' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
369 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
370 score |= 1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
371 break ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
372 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
373
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
374 num = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
375 len = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
376 seqSegCnt = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
377 start = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
378 for ( i = 0 ; cigar[i] ; ++i )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
379 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
380 if ( cigar[i] >= '0' && cigar[i] <= '9' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
381 num = num * 10 + cigar[i] - '0' ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
382 else if ( cigar[i] == 'D' || cigar[i] == 'S' || cigar[i] == 'H'
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
383 || cigar[i] == 'P' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
384 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
385 num = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
386 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
387 else if ( cigar[i] == 'N' )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
388 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
389 seqSegments[seqSegCnt].a = start ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
390 seqSegments[seqSegCnt].b = start + len - 1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
391 ++seqSegCnt ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
392
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
393 start = start + len ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
394 len = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
395 num = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
396 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
397 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
398 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
399 len += num ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
400 num = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
401 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
402 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
403 if ( len > 0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
404 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
405 seqSegments[seqSegCnt].a = start ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
406 seqSegments[seqSegCnt].b = start + len - 1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
407 ++seqSegCnt ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
408 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
409
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
410 // Check whether we can shift the intron to a canonical splice site
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
411 for ( i = 0 ; i < segCnt - 1 ; ++i )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
412 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
413 for ( j = -width ; j < width ; ++j )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
414 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
415 if ( segments[i].b + j < segments[i].a || segments[i+1].a + j > segments[i+1].b )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
416 continue ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
417 // Look for the splice signal.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
418 char m[4] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
419 m[0] = chrSeq.Get( segments[i].b + j + 1 - 1 ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
420 m[1] = chrSeq.Get( segments[i].b + j + 2 - 1 ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
421 m[2] = chrSeq.Get( segments[i + 1].a + j - 2 - 1 ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
422 m[3] = chrSeq.Get( segments[i + 1].a + j - 1 - 1 ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
423 motif = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
424 for ( k = 0 ; k < 4 ; ++k )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
425 motif = ( motif << 2 ) + ( nucToNum[ m[k] - 'A' ] & 3 );
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
426 strand = motifStrand[ motif ] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
427 if ( strand == -1 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
428 continue ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
429 //printf( "%d %d: %d\n", i, j, motif ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
430
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
431 // We found a signal, then test whether we can shift the intron.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
432 // Extract the sequence from the reference genome.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
433 int l = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
434 if ( j < 0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
435 for ( k = segments[i + 1].a + j, l = 0 ; k <= segments[i + 1].a - 1 ; ++k, ++l )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
436 buffer[l] = chrSeq.Get( k - 1 ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
437 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
438 for ( k = segments[i].b + 1, l= 0 ; k <= segments[i].b + j ; ++k, ++l )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
439 buffer[l] = chrSeq.Get(k - 1) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
440 buffer[l] = '\0' ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
441
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
442 //printf( "%s\n", buffer ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
443
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
444 // Get the coordinate on the sequence.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
445 int tag ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
446 int useBegin = 0 ; // is the portion from the beginning part of a seqSegment?
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
447 int from, to ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
448 if ( j < 0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
449 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
450 tag = i ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
451 useBegin = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
452 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
453 else // j>0
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
454 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
455 tag = i + 1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
456 useBegin = 1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
457 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
458 //printf( "%d %d\n", tag, useBegin ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
459
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
460 if ( ( flag & 0x10 ) != 0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
461 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
462 //TODO: it seems star already fliped the sequence.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
463 // is it true for other aligners?
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
464
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
465 //tag = segCnt - 1 - tag ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
466 //useBegin = 1 - useBegin ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
467
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
468
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
469 //ReverseComplement( buffer, l ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
470 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
471
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
472 if ( useBegin )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
473 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
474 from = seqSegments[tag].a ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
475 to = seqSegments[tag].a + l - 1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
476 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
477 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
478 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
479 from = seqSegments[tag].b - l + 1 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
480 to = seqSegments[tag].b ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
481 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
482 //printf( "%d %d %d\n", tag, from, to ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
483
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
484 for ( k = 0 ; k < l ; ++k )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
485 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
486 if ( seq[from + k] != buffer[k] )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
487 break ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
488 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
489
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
490
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
491 if ( k >= l )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
492 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
493 // We can shift the intron
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
494 score |= 2 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
495 break ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
496 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
497 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
498 if ( j < width )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
499 break ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
500 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
501
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
502 // Check whether the sequence is low-complexity. To do this, we test whether we can shift the seqSegments and directly inspect the sequence.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
503 for ( i = 0 ; i < seqSegCnt ; ++i )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
504 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
505 int l ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
506 for ( k = 0, j = seqSegments[i].a - width ; j <= seqSegments[i].b + width ; ++j, ++k )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
507 buffer[k] = chrSeq.Get( j ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
508 buffer[k] = '0' ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
509
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
510 for ( l = 0, j = seqSegments[i].a ; j <= seqSegments[i].b ; ++j, ++l )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
511 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
512 pattern[l] = seq[j] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
513 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
514 pattern[l] = '\0' ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
515
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
516 // It seems the aligner already flipped the read in its output?
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
517 //if ( flag & 0x10 != 0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
518 // ReverseComplement( pattern, l ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
519
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
520
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
521 char *p = buffer ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
522 int cnt = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
523 while ( ( p = strstr( p, pattern ) ) != NULL )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
524 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
525 ++cnt ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
526 ++p ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
527 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
528 if ( cnt > 1 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
529 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
530 score |= 4 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
531 break ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
532 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
533
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
534 int count[5] = { 0, 0, 0, 0, 0 } ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
535 int max = 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
536 for ( j = 0 ; j < l ; ++j )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
537 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
538 ++count[ nucToNum[ pattern[j] - 'A' ] ] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
539 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
540
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
541 for ( j = 0 ; j < 5 ; ++j )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
542 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
543 if ( count[j] > max )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
544 max = count[j] ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
545 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
546 if ( max > 0.8 * l )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
547 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
548 score |= 4 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
549 break ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
550 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
551 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
552
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
553 if ( ( p = strstr( samLine, "NM:i:" ) ) != NULL || ( p = strstr( samLine, "nM:i:" ) ) != NULL )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
554 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
555 int nm = atoi( p + 5 ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
556 if ( nm > 0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
557 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
558 score |= 8 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
559 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
560 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
561 else
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
562 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
563 // TODO: check the nm by ourself
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
564 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
565
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
566 // Check whether the alignment is concordant.
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
567 if ( ( flag & 0x1 ) != 0 && ( flag & 0x4 ) == 0 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
568 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
569 start = segments[0].a ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
570 if ( strcmp( mateChrom, "=" )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
571 || ( flag & 0x10 ) == ( flag & 0x20 )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
572 || ( ( flag & 0x10 ) == 0 && ( mstart < start || mstart > start + 2000000 ) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
573 || ( ( flag & 0x10 ) != 0 && ( mstart > start || mstart < start - 2000000 ) ) )
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
574 {
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
575 score |= 16 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
576 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
577 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
578
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
579 printf( "%s\tXS:A:?\tYS:i:%d\n", samLine, score ) ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
580 }
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
581
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
582 return 0 ;
903fc43d6227 Uploaded
lsong10
parents:
diff changeset
583 }