comparison PsiCLASS-1.0.2/AddXS.cpp @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:903fc43d6227
1 #include <stdio.h>
2 #include <stdint.h>
3
4 #include <string.h>
5 #include <stdlib.h>
6 #include <vector>
7 #include <map>
8 #include <fstream>
9
10 char usage[] =
11 "./addXS ref.fa [OPTIONS] < in.sam > out.sam\n"
12 "From bam to bam: samtools view -h in.bam | ./addXS ref.fa | samtools view -bS - > out.bam\n" ;
13 char samLine[65537] ;
14 char seq[65537] ;
15
16 char nucToNum[26] = { 0, -1, 1, -1, -1, -1, 2,
17 -1, -1, -1, -1, -1, -1, 0, // Regard 'N' as 'A'
18 -1, -1, -1, -1, -1, 3,
19 -1, -1, -1, -1, -1, -1 } ;
20 char numToNuc[26] = {'A', 'C', 'G', 'T'} ;
21
22 struct _pair
23 {
24 int a, b ;
25 } ;
26
27 class BitSequence
28 {
29 private:
30 int len ;
31 //std::vector<uint32_t> sequence ;
32 uint32_t *sequence ;
33
34 public:
35 BitSequence() { len = 0 ; sequence = NULL ;}
36 BitSequence( int l )
37 {
38 len = 0 ;
39 sequence = new uint32_t[ l / 16 + 1 ] ;
40 }
41
42 ~BitSequence()
43 {
44 //if ( sequence != NULL )
45 // delete[] sequence ;
46 }
47
48 void SetLength( int l )
49 {
50 len = 0 ;
51 if ( sequence != NULL )
52 delete[] sequence ;
53 sequence = new uint32_t[ l / 16 + 1 ] ;
54 }
55
56 int GetLength()
57 {
58 return len ;
59 }
60
61 void Append( char c )
62 {
63 if ( ( len & 15 ) == 0 )
64 {
65 sequence[ len / 16 ] = 0 ;
66 }
67 ++len ;
68 //printf( "%d %c\n", len, c ) ;
69 Set( c, len - 1 ) ;
70 }
71
72 // pos is 0-based coordinate
73 // notice that the order within one 32 bit butcket is reversed
74 void Set( char c, int pos )
75 {
76 if ( pos >= len )
77 return ;
78
79 if ( c >= 'a' && c <= 'z' )
80 {
81 c = c - 'a' + 'A' ;
82 }
83 if ( c == 'N' )
84 c = 'A' ;
85
86 int ind = pos >> 4 ;
87 int offset = pos & 15 ;
88 int mask = ( (int)( nucToNum[c - 'A'] & 3 ) ) << ( 2 * offset ) ;
89 sequence[ind] = sequence[ind] | mask ;
90 //if ( c != 'A' )
91 // printf( "%d: %c %c %d %d : %d\n", pos, c, Get(pos), ind, offset, mask ) ;
92 //Print() ;
93 }
94
95 char Get( int pos )
96 {
97 if ( pos >= len )
98 return 'N' ;
99
100 int ind = pos >> 4 ;
101 int offset = pos & 15 ;
102 //printf( "%d: %d\n", pos, sequence[ind] ) ;
103 return numToNuc[ ( ( sequence[ind] >> ( 2 * offset ) ) & 3 ) ] ;
104 }
105
106 void Print()
107 {
108 int i ;
109 for ( i = 0 ; i < len ; ++i )
110 printf( "%c", Get( i ) ) ;
111 printf( "\n" ) ;
112 }
113
114 void Print( FILE *fp, int start, int end, bool rc )
115 {
116 if ( !rc )
117 {
118 for ( int i = start ; i <= end ; ++i )
119 fprintf( fp, "%c", Get( i ) ) ;
120 }
121 else
122 {
123 for ( int i = end ; i >= start ; --i )
124 {
125 char c = Get( i ) ;
126 if ( c == 'A' )
127 c = 'T' ;
128 else if ( c == 'C' )
129 c = 'G' ;
130 else if ( c == 'G' )
131 c = 'C' ;
132 else //if ( c == 'T' )
133 c = 'A' ;
134 fprintf( fp, "%c", c ) ;
135 }
136 }
137 }
138 } ;
139
140 void ReverseComplement( char *s, int l )
141 {
142 int u, v ;
143 for ( u = 0, v = l - 1 ; u <= v ; ++u, --v )
144 {
145 char tmp ;
146 tmp = s[v] ;
147 s[v] = s[u] ;
148 s[u] = tmp ;
149
150 s[u] = numToNuc[ 3 - nucToNum[ s[u] - 'A' ] ] ;
151 s[v] = numToNuc[ 3 - nucToNum[ s[v] - 'A' ] ] ;
152 }
153 }
154
155
156 int main( int argc, char *argv[] )
157 {
158 int i, j, k ;
159
160 std::vector<BitSequence> genome ;
161 std::map<std::string, int> chrToChrId ;
162
163 char readid[200], chrom[50], mapq[10], cigar[1000], mateChrom[50] ;
164 char buffer[1024] ;
165 char pattern[1024] ;
166 int start, mstart, flag, tlen ; // read start and mate read start
167 int seqLen ;
168 int chrCnt = 0 ;
169 int chrId ;
170 int width = 7 ;
171 int score ; // 0-bit: multiple aligned, 1-bit can shift the intron, 2-bit can shift the alignment, 3-bit contain sequencing error
172
173 if ( argc < 2 )
174 {
175 printf( "%s", usage ) ;
176 return 0 ;
177 }
178
179
180 std::ifstream fpRef ;
181 fpRef.open( argv[1] ) ;
182 std::string line ;
183
184 int motifStrand[1024] ;
185
186 /*motifStrand[ 0b10110010 ] = 1 ; // GT/AG
187 motifStrand[ 0b01110001 ] = 0 ; // CT/AC
188 motifStrand[ 0b10010010 ] = 1 ;// GC/AG
189 motifStrand[ 0b01111001 ] = 0 ; // CT/GC
190 motifStrand[ 0b00110001 ] = 1 ; // AT/AC
191 motifStrand[ 0b10110011 ] = 0 ; // GT/AT*/
192 memset( motifStrand, -1, sizeof( motifStrand )) ;
193 motifStrand[ 0xb2 ] = 1 ; // GT/AG
194 motifStrand[ 0x71 ] = 0 ; // CT/AC
195 motifStrand[ 0x92 ] = 1 ;// GC/AG
196 motifStrand[ 0x79 ] = 0 ; // CT/GC
197 motifStrand[ 0x31 ] = 1 ; // AT/AC
198 motifStrand[ 0xb3 ] = 0 ; // GT/AT
199
200 k = 0 ;
201 while ( getline( fpRef, line ) )
202 {
203 //printf( "%s\n", line.c_str() ) ;
204 int len = line.length() ;
205 if ( line[0] == '>' )
206 {
207 //char *s = strdup( line.c_str() + 1 ) ;
208 if ( chrCnt > 0 )
209 {
210 genome[ chrCnt - 1 ].SetLength( k ) ;
211 }
212 for ( i = 1 ; i < len ; ++i )
213 if ( line[i] == ' ' || line[i] == '\t' )
214 break ;
215 chrToChrId[ line.substr( 1, i - 1 ) ] = chrCnt ;
216 ++chrCnt ;
217
218 BitSequence bs ;
219 genome.push_back( bs ) ;
220
221 k = 0 ;
222 }
223 else
224 {
225 k += len ;
226 }
227 }
228 genome[ chrCnt - 1 ].SetLength( k ) ;
229 fpRef.close() ;
230
231 fpRef.open( argv[1] ) ;
232 while ( getline( fpRef, line ) )
233 {
234 //printf( "%s\n", line.c_str() ) ;
235 int len = line.length() ;
236 if ( line[0] == '>' )
237 {
238 //char *s = strdup( line.c_str() + 1 ) ;
239 for ( i = 1 ; i < len ; ++i )
240 if ( line[i] == ' ' || line[i] == '\t' )
241 break ;
242 chrId = chrToChrId[ line.substr( 1, i - 1 ) ] ;
243 }
244 else
245 {
246 BitSequence &bs = genome[chrId] ;
247 for ( i = 0 ; i < len ; ++i )
248 {
249 if ( ( line[i] >= 'A' && line[i] <= 'Z' ) ||
250 ( line[i] >= 'a' && line[i] <= 'z' ) )
251 bs.Append( line[i] ) ;
252 }
253 }
254 }
255 fpRef.close() ;
256
257
258
259 while ( fgets( samLine, sizeof( samLine ), stdin ) != NULL )
260 {
261 if ( samLine[0] == '@' )
262 {
263 printf( "%s", samLine ) ;
264 continue ;
265 }
266
267 sscanf( samLine, "%s %d %s %d %s %s %s %d %d %s", readid, &flag, chrom, &start, mapq, cigar, mateChrom, &mstart, &tlen, seq ) ;
268 struct _pair segments[1000] ;
269 struct _pair seqSegments[1000] ; // the segments with respect to the sequence.
270 int segCnt = 0 ;
271 int seqSegCnt = 0 ;
272 int num = 0 ;
273 int len = 0 ;
274
275 score = 0 ;
276
277 for ( i = 0 ; cigar[i] ; ++i )
278 {
279 if ( cigar[i] >= '0' && cigar[i] <= '9' )
280 num = num * 10 + cigar[i] - '0' ;
281 else if ( cigar[i] == 'I' || cigar[i] == 'S' || cigar[i] == 'H'
282 || cigar[i] == 'P' )
283 {
284 num = 0 ;
285 }
286 else if ( cigar[i] == 'N' )
287 {
288 segments[segCnt].a = start ;
289 segments[segCnt].b = start + len - 1 ;
290 ++segCnt ;
291
292 start = start + len + num ;
293 len = 0 ;
294 num = 0 ;
295 }
296 else
297 {
298 len += num ;
299 num = 0 ;
300 }
301 }
302
303
304 if ( len > 0 )
305 {
306 segments[segCnt].a = start ;
307 segments[segCnt].b = start + len - 1 ;
308 ++segCnt ;
309 }
310
311 if ( segCnt <= 1 || strstr( samLine, "XS:A:" ) != NULL )
312 {
313 printf( "%s", samLine ) ;
314 continue ;
315 }
316
317 std::string schr( chrom ) ;
318 int chrId = chrToChrId[schr] ;
319 int strand = -1 ;
320
321 len = strlen( samLine ) ;
322 samLine[len - 1] = '\0' ;
323 int motif = 0 ;
324
325 BitSequence &chrSeq = genome[ chrId ] ;
326
327 for ( i = 0 ; i < segCnt - 1 ; ++i )
328 {
329 char m[4] ;
330 m[0] = chrSeq.Get( segments[i].b + 1 - 1 ) ;
331 m[1] = chrSeq.Get( segments[i].b + 2 - 1 ) ;
332 m[2] = chrSeq.Get( segments[i + 1].a - 2 - 1 ) ;
333 m[3] = chrSeq.Get( segments[i + 1].a - 1 - 1 ) ;
334 motif = 0 ;
335 for ( j = 0 ; j < 4 ; ++j )
336 motif = ( motif << 2 ) + ( nucToNum[ m[j] - 'A' ] & 3 );
337 strand = motifStrand[ motif ] ;
338 if ( strand != -1 )
339 break ;
340 }
341 if ( strand == 1 )
342 {
343 printf( "%s\tXS:A:+\n", samLine ) ;
344 continue ;
345 }
346 else if ( strand == 0 )
347 {
348 printf( "%s\tXS:A:-\n", samLine ) ;
349 continue ;
350 }
351
352 char *p ;
353 if ( ( p = strstr( samLine, "NH:i:" ) ) != NULL )
354 {
355 p += 5 ;
356 num = 0 ;
357 while ( *p >= '0' && *p <= '9' )
358 {
359 num = num * 10 + *p - '0' ;
360 ++p ;
361 }
362 if ( num > 1 )
363 score |= 1 ;
364 }
365
366 seqLen = strlen( seq ) ;
367 for ( i = 0 ; i < seqLen ; ++i )
368 if ( seq[i] == 'N' )
369 {
370 score |= 1 ;
371 break ;
372 }
373
374 num = 0 ;
375 len = 0 ;
376 seqSegCnt = 0 ;
377 start = 0 ;
378 for ( i = 0 ; cigar[i] ; ++i )
379 {
380 if ( cigar[i] >= '0' && cigar[i] <= '9' )
381 num = num * 10 + cigar[i] - '0' ;
382 else if ( cigar[i] == 'D' || cigar[i] == 'S' || cigar[i] == 'H'
383 || cigar[i] == 'P' )
384 {
385 num = 0 ;
386 }
387 else if ( cigar[i] == 'N' )
388 {
389 seqSegments[seqSegCnt].a = start ;
390 seqSegments[seqSegCnt].b = start + len - 1 ;
391 ++seqSegCnt ;
392
393 start = start + len ;
394 len = 0 ;
395 num = 0 ;
396 }
397 else
398 {
399 len += num ;
400 num = 0 ;
401 }
402 }
403 if ( len > 0 )
404 {
405 seqSegments[seqSegCnt].a = start ;
406 seqSegments[seqSegCnt].b = start + len - 1 ;
407 ++seqSegCnt ;
408 }
409
410 // Check whether we can shift the intron to a canonical splice site
411 for ( i = 0 ; i < segCnt - 1 ; ++i )
412 {
413 for ( j = -width ; j < width ; ++j )
414 {
415 if ( segments[i].b + j < segments[i].a || segments[i+1].a + j > segments[i+1].b )
416 continue ;
417 // Look for the splice signal.
418 char m[4] ;
419 m[0] = chrSeq.Get( segments[i].b + j + 1 - 1 ) ;
420 m[1] = chrSeq.Get( segments[i].b + j + 2 - 1 ) ;
421 m[2] = chrSeq.Get( segments[i + 1].a + j - 2 - 1 ) ;
422 m[3] = chrSeq.Get( segments[i + 1].a + j - 1 - 1 ) ;
423 motif = 0 ;
424 for ( k = 0 ; k < 4 ; ++k )
425 motif = ( motif << 2 ) + ( nucToNum[ m[k] - 'A' ] & 3 );
426 strand = motifStrand[ motif ] ;
427 if ( strand == -1 )
428 continue ;
429 //printf( "%d %d: %d\n", i, j, motif ) ;
430
431 // We found a signal, then test whether we can shift the intron.
432 // Extract the sequence from the reference genome.
433 int l = 0 ;
434 if ( j < 0 )
435 for ( k = segments[i + 1].a + j, l = 0 ; k <= segments[i + 1].a - 1 ; ++k, ++l )
436 buffer[l] = chrSeq.Get( k - 1 ) ;
437 else
438 for ( k = segments[i].b + 1, l= 0 ; k <= segments[i].b + j ; ++k, ++l )
439 buffer[l] = chrSeq.Get(k - 1) ;
440 buffer[l] = '\0' ;
441
442 //printf( "%s\n", buffer ) ;
443
444 // Get the coordinate on the sequence.
445 int tag ;
446 int useBegin = 0 ; // is the portion from the beginning part of a seqSegment?
447 int from, to ;
448 if ( j < 0 )
449 {
450 tag = i ;
451 useBegin = 0 ;
452 }
453 else // j>0
454 {
455 tag = i + 1 ;
456 useBegin = 1 ;
457 }
458 //printf( "%d %d\n", tag, useBegin ) ;
459
460 if ( ( flag & 0x10 ) != 0 )
461 {
462 //TODO: it seems star already fliped the sequence.
463 // is it true for other aligners?
464
465 //tag = segCnt - 1 - tag ;
466 //useBegin = 1 - useBegin ;
467
468
469 //ReverseComplement( buffer, l ) ;
470 }
471
472 if ( useBegin )
473 {
474 from = seqSegments[tag].a ;
475 to = seqSegments[tag].a + l - 1 ;
476 }
477 else
478 {
479 from = seqSegments[tag].b - l + 1 ;
480 to = seqSegments[tag].b ;
481 }
482 //printf( "%d %d %d\n", tag, from, to ) ;
483
484 for ( k = 0 ; k < l ; ++k )
485 {
486 if ( seq[from + k] != buffer[k] )
487 break ;
488 }
489
490
491 if ( k >= l )
492 {
493 // We can shift the intron
494 score |= 2 ;
495 break ;
496 }
497 }
498 if ( j < width )
499 break ;
500 }
501
502 // Check whether the sequence is low-complexity. To do this, we test whether we can shift the seqSegments and directly inspect the sequence.
503 for ( i = 0 ; i < seqSegCnt ; ++i )
504 {
505 int l ;
506 for ( k = 0, j = seqSegments[i].a - width ; j <= seqSegments[i].b + width ; ++j, ++k )
507 buffer[k] = chrSeq.Get( j ) ;
508 buffer[k] = '0' ;
509
510 for ( l = 0, j = seqSegments[i].a ; j <= seqSegments[i].b ; ++j, ++l )
511 {
512 pattern[l] = seq[j] ;
513 }
514 pattern[l] = '\0' ;
515
516 // It seems the aligner already flipped the read in its output?
517 //if ( flag & 0x10 != 0 )
518 // ReverseComplement( pattern, l ) ;
519
520
521 char *p = buffer ;
522 int cnt = 0 ;
523 while ( ( p = strstr( p, pattern ) ) != NULL )
524 {
525 ++cnt ;
526 ++p ;
527 }
528 if ( cnt > 1 )
529 {
530 score |= 4 ;
531 break ;
532 }
533
534 int count[5] = { 0, 0, 0, 0, 0 } ;
535 int max = 0 ;
536 for ( j = 0 ; j < l ; ++j )
537 {
538 ++count[ nucToNum[ pattern[j] - 'A' ] ] ;
539 }
540
541 for ( j = 0 ; j < 5 ; ++j )
542 {
543 if ( count[j] > max )
544 max = count[j] ;
545 }
546 if ( max > 0.8 * l )
547 {
548 score |= 4 ;
549 break ;
550 }
551 }
552
553 if ( ( p = strstr( samLine, "NM:i:" ) ) != NULL || ( p = strstr( samLine, "nM:i:" ) ) != NULL )
554 {
555 int nm = atoi( p + 5 ) ;
556 if ( nm > 0 )
557 {
558 score |= 8 ;
559 }
560 }
561 else
562 {
563 // TODO: check the nm by ourself
564 }
565
566 // Check whether the alignment is concordant.
567 if ( ( flag & 0x1 ) != 0 && ( flag & 0x4 ) == 0 )
568 {
569 start = segments[0].a ;
570 if ( strcmp( mateChrom, "=" )
571 || ( flag & 0x10 ) == ( flag & 0x20 )
572 || ( ( flag & 0x10 ) == 0 && ( mstart < start || mstart > start + 2000000 ) )
573 || ( ( flag & 0x10 ) != 0 && ( mstart > start || mstart < start - 2000000 ) ) )
574 {
575 score |= 16 ;
576 }
577 }
578
579 printf( "%s\tXS:A:?\tYS:i:%d\n", samLine, score ) ;
580 }
581
582 return 0 ;
583 }