comparison PsiCLASS-1.0.2/FindJunction.cpp @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:903fc43d6227
1 /*
2 Find junctions from the SAM file generated by Tophat2.
3 The SAM file should be sorted.
4 The junction is the start and the end coordinates of the spliced region in this program. The output of the junction is a bit different.
5
6 Usage: ./a.out input [option] >output.
7 ./a.out -h for help.
8 */
9 #include <stdio.h>
10 #include <string.h>
11 #include <stdlib.h>
12
13 #include "sam.h"
14
15 #define LINE_SIZE 4097
16 #define QUEUE_SIZE 10001
17 #define HASH_MAX 1000003
18
19 struct _readTree
20 {
21 char id[256] ;
22 int leftAnchor, rightAnchor ;
23 int pos ;
24 //bool secondary ;
25 bool valid ;
26 int editDistance ;
27 int NH, cnt ; // if cnt < NH, then it has real secondary match for this splice junction
28 struct _readTree *left, *right ;
29
30 int flag ;// The flag from sam head.
31 } ;
32
33 // The structure of a junction
34 struct _junction
35 {
36 int start, end ;
37 int readCnt ; // The # of reads containing this junction
38 int secReadCnt ; // The # of reads whose secondary alignment has this junction.
39 char strand ; // On '+' or '-' strand
40 int leftAnchor, rightAnchor ; // The longest left and right anchor
41 int oppositeAnchor ; // The longest anchor of the shorter side.
42 int uniqEditDistance, secEditDistance ;
43 struct _readTree head ;
44 } ;
45
46 char line[LINE_SIZE] ;
47 char col[11][LINE_SIZE] ; // The option fields is not needed.
48 char strand ; // Extract XS field
49 char noncanonStrandInfo ;
50 //bool secondary ;
51 int NH ;
52 int editDistance ;
53 int mateStart ;
54 int filterYS ;
55 int samFlag ;
56
57 struct _junction junctionQueue[QUEUE_SIZE] ; // Expected only a few junctions in it for each read. This queue is sorted.
58
59 int qHead, qTail ;
60
61 bool flagPrintJunction ;
62 bool flagPrintAll ;
63 bool flagStrict ;
64 int junctionCnt ;
65 bool anchorBoth ;
66 bool validRead ;
67
68 int flank ;
69 struct _cigarSeg
70 {
71 int len ;
72 char type ;
73 } ;
74
75 struct _readTree *contradictedReads ;
76
77 char nucToNum[26] = { 0, 4, 1, 4, 4, 4, 2,
78 4, 4, 4, 4, 4, 4, 4,
79 4, 4, 4, 4, 4, 3,
80 4, 4, 4, 4, 4, 4 } ;
81
82 void PrintHelp()
83 {
84 printf(
85 "Prints reads from the SAM/BAM file that containing junctions.\n"
86 "Usage: ./a.out input [option]>output\n"
87 "Options:\n"
88 "\t-j xx [-B]: Output the junctions using 1-based coordinates. The format is \"reference id\" start end \"# of read\" strand.(They are sorted)\n and the xx is an integer means the maximum unqualified anchor length for a splice junction(default=8). If -B, the splice junction must be supported by a read whose both anchors are longer than xx.\n"
89 "\t-a: Output all the junctions, and use non-positive support number to indicate unqualified junctions.\n"
90 "\t-y: If the bits from YS field of bam matches the argument, we filter the alignment (default: 4).\n"
91 ) ;
92 }
93
94 void GetJunctionInfo( struct _junction &junc, struct _readTree *p )
95 {
96 if ( p == NULL )
97 return ;
98
99 if ( p->valid )
100 {
101 //if ( junc.start == 22381343 + 1 && junc.end == 22904987 - 1 )
102 // printf( "%s %d %d %d\n", p->id, p->leftAnchor, p->rightAnchor, p->flag ) ;
103
104
105
106 if ( p->cnt < p->NH )
107 {
108 junc.secEditDistance += p->editDistance ;
109 ++junc.secReadCnt ;
110 }
111 else
112 {
113 junc.uniqEditDistance += p->editDistance ;
114 ++junc.readCnt ;
115 }
116 int l = p->leftAnchor, r = p->rightAnchor ;
117
118 if ( !anchorBoth )
119 {
120 if ( l > junc.leftAnchor )
121 junc.leftAnchor = l ;
122 if ( r > junc.rightAnchor )
123 junc.rightAnchor = r ;
124
125 if ( l <= r && l > junc.oppositeAnchor )
126 junc.oppositeAnchor = l ;
127 else if ( r < l && r > junc.oppositeAnchor )
128 junc.oppositeAnchor = r ;
129 }
130 else
131 {
132 if ( l > flank && r > flank )
133 {
134 junc.leftAnchor = l ;
135 junc.rightAnchor = r ;
136 }
137
138 if ( l <= r && l > junc.oppositeAnchor )
139 junc.oppositeAnchor = l ;
140 else if ( r < l && r > junc.oppositeAnchor )
141 junc.oppositeAnchor = r ;
142 }
143 }
144 GetJunctionInfo( junc, p->left ) ;
145 GetJunctionInfo( junc, p->right ) ;
146 }
147
148 void PrintJunctionReads( struct _junction &junc, struct _readTree *p )
149 {
150 if ( p == NULL )
151 return ;
152
153 if ( p->valid )
154 printf( "%s\n", p->id ) ;
155 PrintJunctionReads( junc, p->left ) ;
156 PrintJunctionReads( junc, p->right ) ;
157 }
158
159 void PrintJunction( char *chrome, struct _junction &junc )
160 {
161 int sum ;
162 junc.leftAnchor = 0 ;
163 junc.rightAnchor = 0 ;
164 junc.oppositeAnchor = 0 ;
165 junc.readCnt = 0 ;
166 junc.secReadCnt = 0 ;
167 junc.uniqEditDistance = 0 ;
168 junc.secEditDistance = 0 ;
169 GetJunctionInfo( junc, &junc.head ) ;
170
171 sum = junc.readCnt + junc.secReadCnt ;
172
173 if ( junc.leftAnchor <= flank || junc.rightAnchor <= flank || ( junc.readCnt + junc.secReadCnt <= 0 ) )
174 {
175 if ( flagPrintAll )
176 {
177 sum = -sum ;
178 }
179 else
180 return ;
181 }
182
183 /*if ( junc.end - junc.start + 1 >= 200000 )
184 {
185 if ( junc.secReadCnt > 0 )
186 junc.secReadCnt = 1 ;
187 }*/
188
189 /*if ( junc.oppositeAnchor <= ( ( flank / 2 < 1 ) ? flank / 2 : 1 ) && ( junc.readCnt + junc.secReadCnt ) <= 10 )
190 {
191 if ( junc.readCnt > 0 )
192 junc.readCnt = 1 ;
193 if ( junc.secReadCnt > 0 )
194 junc.secReadCnt = 0 ;
195 }*/
196
197 printf( "%s %d %d %d %c %d %d %d %d\n", chrome, junc.start - 1, junc.end + 1, sum, junc.strand,
198 junc.readCnt, junc.secReadCnt, junc.uniqEditDistance, junc.secEditDistance ) ;
199 //PrintJunctionReads( junc, &junc.head ) ;
200 }
201
202 void ClearReadTree( struct _readTree *p )
203 {
204 if ( p == NULL )
205 return ;
206 ClearReadTree( p->left ) ;
207 ClearReadTree( p->right ) ;
208 free( p ) ;
209 }
210
211 // Insert to the read tree
212 bool InsertReadTree( struct _readTree *p, char *id, int l, int r )
213 {
214 int tmp = strcmp( p->id, id ) ;
215 if ( tmp == 0 )
216 {
217 p->cnt += 1 ;
218 return true ;
219 }
220 else if ( tmp < 0 )
221 {
222 if ( p->left )
223 return InsertReadTree( p->left, id, l, r ) ;
224 else
225 {
226 p->left = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ;
227 strcpy( p->left->id, id ) ;
228 p->left->leftAnchor = l ;
229 p->left->rightAnchor = r ;
230 //p->left->secondary = secondary ;
231 p->left->editDistance = editDistance ;
232 p->left->left = p->left->right = NULL ;
233 p->left->valid = validRead ;
234 p->left->cnt = 1 ;
235 p->left->NH = NH ;
236 p->left->flag = samFlag ;
237 return false ;
238 }
239 }
240 else
241 {
242 if ( p->right )
243 return InsertReadTree( p->right, id, l, r ) ;
244 else
245 {
246 p->right = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ;
247 strcpy( p->right->id, id ) ;
248 p->right->leftAnchor = l ;
249 p->right->rightAnchor = r ;
250 //p->right->secondary = secondary ;
251 p->right->editDistance = editDistance ;
252 p->right->left = p->right->right = NULL ;
253 p->right->valid = validRead ;
254 p->right->cnt = 1 ;
255 p->right->NH = NH ;
256 p->right->flag = samFlag ;
257 return false ;
258 }
259 }
260 }
261
262
263 bool SearchContradictedReads( struct _readTree *p, char *id, int pos )
264 {
265 if ( p == NULL )
266 return false ;
267 int tmp = strcmp( p->id, id ) ;
268 if ( tmp == 0 && p->pos == pos )
269 return true ;
270 else if ( tmp <= 0 )
271 return SearchContradictedReads( p->left, id, pos ) ;
272 else
273 return SearchContradictedReads( p->right, id, pos ) ;
274 }
275
276 bool InsertContradictedReads( struct _readTree *p, char *id, int pos )
277 {
278 if ( p == NULL )
279 {
280 contradictedReads = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ;
281 strcpy( contradictedReads->id, id ) ;
282 contradictedReads->pos = pos ;
283 contradictedReads->left = contradictedReads->right = NULL ;
284 return false ;
285 }
286 int tmp = strcmp( p->id, id ) ;
287 if ( tmp == 0 && p->pos == pos )
288 return true ;
289 else if ( tmp <= 0 )
290 {
291 if ( p->left )
292 return InsertContradictedReads( p->left, id, pos ) ;
293 else
294 {
295 p->left = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ;
296 strcpy( p->left->id, id ) ;
297 p->left->pos = pos ;
298 p->left->left = p->left->right = NULL ;
299 return false ;
300 }
301 }
302 else
303 {
304 if ( p->right )
305 return InsertContradictedReads( p->right, id, pos ) ;
306 else
307 {
308 p->right = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ;
309 strcpy( p->right->id, id ) ;
310 p->right->pos = pos ;
311 p->right->left = p->right->right = NULL ;
312 return false ;
313 }
314 }
315 }
316
317 struct _readTree *GetReadTreeNode( struct _readTree *p, char *id )
318 {
319 if ( p == NULL )
320 return NULL ;
321 int tmp = strcmp( p->id, id ) ;
322 if ( tmp == 0 )
323 return p ;
324 else if ( tmp < 0 )
325 return GetReadTreeNode( p->left, id ) ;
326 else
327 return GetReadTreeNode( p->right, id ) ;
328 }
329
330 // Insert the new junction into the queue,
331 // and make sure the queue is sorted.
332 // Assume each junction range is only on one strand.
333 // l, r is the left and right anchor from a read
334 void InsertQueue( int start, int end, int l, int r )
335 {
336 int i, j ;
337 i = qTail ;
338
339
340 while ( i != qHead )
341 {
342 j = i - 1 ;
343 if ( j < 0 )
344 j = QUEUE_SIZE - 1 ;
345
346 if ( junctionQueue[j].start < start
347 || ( junctionQueue[j].start == start && junctionQueue[j].end < end ) )
348 break ;
349
350 junctionQueue[i] = junctionQueue[j] ;
351 --i ;
352
353 if ( i < 0 )
354 i = QUEUE_SIZE - 1 ;
355 }
356
357 junctionQueue[i].start = start ;
358 junctionQueue[i].end = end ;
359 /*if ( !secondary )
360 {
361 junctionQueue[i].readCnt = 1 ;
362 junctionQueue[i].secReadCnt = 0 ;
363 }
364 else
365 {
366 junctionQueue[i].readCnt = 0 ;
367 junctionQueue[i].secReadCnt = 1 ;
368 }*/
369 junctionQueue[i].strand = strand ;
370 junctionQueue[i].leftAnchor = l ;
371 junctionQueue[i].rightAnchor = r ;
372
373 strcpy( junctionQueue[i].head.id, col[0] ) ;
374 junctionQueue[i].head.valid = validRead ;
375 junctionQueue[i].head.leftAnchor = l ;
376 junctionQueue[i].head.rightAnchor = r ;
377 junctionQueue[i].head.left = junctionQueue[i].head.right = NULL ;
378 //junctionQueue[i].head.secondary = secondary ;
379 junctionQueue[i].head.NH = NH ;
380 junctionQueue[i].head.cnt = 1 ;
381 junctionQueue[i].head.editDistance = editDistance ;
382
383 ++qTail ;
384 if ( qTail >= QUEUE_SIZE )
385 qTail = 0 ;
386 }
387
388 // Search the queue, and remove the heads if its end(start) is smaller than prune.(Because it is impossible to have that junction again)
389 // Add the junction to the queue, if it is not in it.
390 // Return true, if it finds the junction. Otherwise, return false.
391 bool SearchQueue( int start, int end, int prune, int l, int r )
392 {
393 int i ;
394
395 // Test whether this read might be a false alignment.
396 i = qHead ;
397 while ( i != qTail )
398 {
399 struct _readTree *rt ;
400 if ( ( junctionQueue[i].start == start && junctionQueue[i].end < end ) ||
401 ( junctionQueue[i].start > start && junctionQueue[i].end == end ) )
402 {
403 // This alignment is false ;
404 rt = GetReadTreeNode( &junctionQueue[i].head, col[0] ) ;
405 // the commented out logic because it is handled by contradicted reads
406 if ( rt != NULL ) //&& ( rt->flag & 0x40 ) != ( samFlag & 0x40 ) )
407 {
408 if ( rt->leftAnchor <= flank || rt->rightAnchor <= flank )//|| rt->secondary )
409 {
410 if ( l > flank && r > flank )//&& !secondary )
411 rt->valid = false ;
412 else
413 validRead = false ;
414 }
415 else
416 {
417 // Ignore this read
418 //return true ;
419 validRead = false ;
420 }
421 }
422 }
423 else if ( ( junctionQueue[i].start == start && junctionQueue[i].end > end ) ||
424 ( junctionQueue[i].start < start && junctionQueue[i].end == end ) )
425 {
426 // This other alignment is false ;
427 rt = GetReadTreeNode( &junctionQueue[i].head, col[0] ) ;
428 //if ( rt != NULL )
429 if ( rt != NULL ) //&& ( rt->flag & 0x40 ) != ( samFlag & 0x40 ) )
430 {
431 if ( l <= flank || r <= flank )//|| secondary )
432 {
433 if ( rt->leftAnchor > flank && rt->rightAnchor > flank )//&& !rt->secondary )
434 validRead = false ;
435 else
436 rt->valid = false ;
437 }
438 else
439 rt->valid = false ;
440 }
441 }
442 ++i ;
443 if ( i >= QUEUE_SIZE )
444 i = 0 ;
445 }
446
447 //if ( start == 8077108 && end == 8078439 )
448 // exit( 1 ) ;
449 i = qHead ;
450
451 while ( i != qTail )
452 {
453 if ( junctionQueue[i].start == start &&
454 junctionQueue[i].end == end )
455 {
456 if (junctionQueue[i].strand == '?' && junctionQueue[i].strand != strand )
457 {
458 junctionQueue[i].strand = strand ;
459 }
460 InsertReadTree( &junctionQueue[i].head, col[0], l, r ) ;
461 return true ;
462 }
463
464 if ( junctionQueue[i].end < prune && i == qHead )
465 {
466 // pop
467 PrintJunction( col[2], junctionQueue[i] ) ;
468 ClearReadTree( junctionQueue[i].head.left ) ;
469 ClearReadTree( junctionQueue[i].head.right ) ;
470 ++qHead ;
471 if ( qHead >= QUEUE_SIZE )
472 qHead = 0 ;
473 }
474 ++i ;
475 if ( i >= QUEUE_SIZE )
476 i = 0 ;
477 }
478
479 InsertQueue( start, end, l, r ) ;
480 return false ;
481 }
482
483 // Compute the junctions based on the CIGAR
484 bool CompareJunctions( int startLocation, char *cigar )
485 {
486 int currentLocation = startLocation ; // Current location on the reference genome
487 int i, j ;
488 int num ;
489 int newJuncCnt = 0 ; // The # of junctions in the read, and the # of new junctions among them.
490
491 struct _cigarSeg cigarSeg[1000] ; // A segment of the cigar.
492 int ccnt = 0 ; // cigarSeg cnt
493
494 j = 0 ;
495 num = 0 ;
496 validRead = true ;
497
498 for ( i = 0 ; cigar[i] ; ++i )
499 {
500 if ( cigar[i] >= '0' && cigar[i] <= '9' )
501 {
502 num = num * 10 + cigar[i] - '0' ;
503 }
504 else
505 {
506 cigarSeg[ccnt].len = num ;
507 cigarSeg[ccnt].type = cigar[i] ;
508 ++ccnt ;
509 num = 0 ;
510 }
511 }
512
513 // Filter low complex alignment.
514 // Only applies this to alignments does not have a strand information.
515 if ( strand == '?' )
516 {
517 if ( noncanonStrandInfo != -1 )
518 {
519 if ( ( noncanonStrandInfo & filterYS ) != 0 )
520 validRead = false ;
521 }
522 else
523 {
524 int softStart = -1 ;
525 int softEnd = 0 ;
526 if ( cigarSeg[0].type == 'S' )
527 softStart = cigarSeg[0].len ;
528 if ( cigarSeg[ ccnt - 1 ].type == 'S' )
529 softEnd = cigarSeg[ ccnt - 1 ].len ;
530 int readLen = strlen( col[9] ) ;
531 int count[5] = { 0, 0, 0, 0, 0 } ;
532
533 int pos = 0 ;
534 for ( i = 0 ; i < ccnt ; ++i )
535 {
536 switch ( cigarSeg[i].type )
537 {
538 case 'S':
539 pos += cigarSeg[i].len ;
540 case 'M':
541 case 'I':
542 {
543 for ( j = 0 ; j < cigarSeg[i].len ; ++j )
544 ++count[ nucToNum[ col[9][pos + j] - 'A' ] ] ;
545 pos += j ;
546 } break ;
547 case 'N':
548 {
549 int max = 0 ;
550 int sum = 0 ;
551 for ( j = 0 ; j < 5 ; ++j )
552 {
553 if ( count[j] > max )
554 max = count[j] ;
555 sum += count[j] ;
556 }
557 if ( max > 0.8 * sum )
558 validRead = false ;
559 count[0] = count[1] = count[2] = count[3] = count[4] = 0 ;
560 } break ;
561 case 'H':
562 case 'P':
563 case 'D':
564 default: break ;
565 }
566 }
567
568 int max = 0 ;
569 int sum = 0 ;
570 for ( j = 0 ; j < 5 ; ++j )
571 {
572 if ( count[j] > max )
573 max = count[j] ;
574 sum += count[j] ;
575 }
576 if ( max > 0.8 * sum )
577 validRead = false ;
578 /* count[0] = count[1] = count[2] = count[3] = count[4] = 0 ;
579
580
581 for ( i = softStart + 1 ; i < readLen - softEnd ; ++i )
582 {
583 switch ( col[9][i] )
584 {
585 case 'A': ++count[0] ; break ;
586 case 'C': ++count[1] ; break ;
587 case 'G': ++count[2] ; break ;
588 case 'T': ++count[3] ; break ;
589 default: ++count[4] ;
590 }
591 }
592 int max = 0 ;
593 for ( j = 0 ; j < 5 ; ++j )
594 if ( count[j] > max )
595 max = count[j] ;
596 if ( max > 0.6 * ( readLen - softEnd - softStart - 1 ) )
597 validRead = false ;*/
598 }
599 }
600
601 // Test whether contradict with mate pair
602 if ( col[6][0] == '=' )
603 {
604 currentLocation = startLocation ;
605 for ( i = 0 ; i < ccnt ; ++i )
606 {
607 if ( cigarSeg[i].type == 'I' || cigarSeg[i].type == 'S' || cigarSeg[i].type == 'H'
608 || cigarSeg[i].type == 'P' )
609 continue ;
610 else if ( cigarSeg[i].type == 'N' )
611 {
612 if ( mateStart >= currentLocation && mateStart <= currentLocation + cigarSeg[i].len - 1 )
613 {
614 /*if ( cigarSeg[i].len == 91486 )
615 {
616 printf( "%s %d %d\n", currentLocation, mateStart ) ;
617 exit( 1 ) ;
618 }*/
619 // ignore this read
620 //return false ;
621 InsertContradictedReads( contradictedReads, col[0], mateStart ) ;
622 validRead = false ;
623 break ;
624 }
625 }
626 currentLocation += cigarSeg[i].len ;
627 }
628
629 i = qHead ;
630 // Search if it falls in the splices junction created by its mate.
631 while ( i != qTail )
632 {
633 if ( mateStart < junctionQueue[i].start && junctionQueue[i].start <= startLocation
634 && startLocation <= junctionQueue[i].end
635 && SearchContradictedReads( contradictedReads, col[0], startLocation ) )
636 {
637 validRead = false ;
638 break ;
639 }
640 ++i ;
641 if ( i >= QUEUE_SIZE )
642 i = 0 ;
643 }
644 }
645
646 currentLocation = startLocation ;
647 for ( i = 0 ; i < ccnt ; ++i )
648 {
649 if ( cigarSeg[i].type == 'I' || cigarSeg[i].type == 'S' || cigarSeg[i].type == 'H'
650 || cigarSeg[i].type == 'P' )
651 continue ;
652 else if ( cigarSeg[i].type == 'N' )
653 {
654 int left, right ;
655 int tmp ;
656 tmp = i ;
657 while ( i > 0 && ( cigarSeg[i - 1].type == 'I' || cigarSeg[i - 1].type == 'D' ) )
658 --i ;
659 if ( i > 0 && cigarSeg[i - 1].type == 'M' )
660 {
661 left = cigarSeg[i - 1].len ;
662 if ( i >= 2 && cigarSeg[i - 2].type == 'N' && left <= flank )
663 {
664 left = flank + 1 ;
665 }
666 }
667 else
668 left = 0 ;
669
670 i = tmp ;
671 while ( i < ccnt && ( cigarSeg[i + 1].type == 'I' || cigarSeg[i + 1].type == 'D' ) )
672 ++i ;
673 if ( i < ccnt && cigarSeg[i + 1].type == 'M' )
674 {
675 right = cigarSeg[i + 1].len ;
676 if ( i + 2 < ccnt && cigarSeg[i + 2].type == 'N' && right <= flank )
677 {
678 right = flank + 1 ;
679 }
680 }
681 else
682 right = 0 ;
683 i = tmp ;
684 if ( !SearchQueue( currentLocation, currentLocation + cigarSeg[i].len - 1, startLocation, left, right ) )
685 {
686 ++newJuncCnt ;
687 }
688 }
689 currentLocation += cigarSeg[i].len ;
690 }
691 /*else if ( cigar[i] == 'I' )
692 {
693 num = 0 ;
694 }
695 else if ( cigar[i] == 'N' )
696 {
697 if ( !SearchQueue( currentLocation, currentLocation + num - 1, startLocation ) )
698 {
699 ++newJuncCnt ;
700 //if ( flagPrintJunction )
701 // printf( "%s %d %d\n", col[2], currentLocation - 2, currentLocation + len - 1 ) ;
702 }
703 currentLocation += num ;
704 num = 0 ;
705 }
706 else // Other operations, like M, D,...,
707 {
708 currentLocation += num ;
709 num = 0 ;
710 }*/
711
712 junctionCnt += newJuncCnt ;
713
714 if ( newJuncCnt )
715 return true ;
716 else
717 return false ;
718 }
719
720 void cigar2string( bam1_core_t *c, uint32_t *in_cigar, char *out_cigar )
721 {
722 int k, op, l ;
723 char opcode ;
724
725 *out_cigar = '\0' ;
726 for ( k = 0 ; k < c->n_cigar ; ++k )
727 {
728 op = in_cigar[k] & BAM_CIGAR_MASK ;
729 l = in_cigar[k] >> BAM_CIGAR_SHIFT ;
730 switch (op)
731 {
732 case BAM_CMATCH: opcode = 'M' ; break ;
733 case BAM_CINS: opcode = 'I' ; break ;
734 case BAM_CDEL: opcode = 'D' ; break ;
735 case BAM_CREF_SKIP: opcode = 'N' ; break ;
736 case BAM_CSOFT_CLIP: opcode = 'S' ; break ;
737 case BAM_CHARD_CLIP: opcode = 'H' ; break ;
738 case BAM_CPAD: opcode = 'P' ; break ;
739 }
740 sprintf( out_cigar + strlen( out_cigar ), "%d%c", l, opcode ) ;
741 }
742 }
743
744
745
746 int main( int argc, char *argv[] )
747 {
748 FILE *fp ;
749 samfile_t *fpsam ;
750 bam1_t *b = NULL ;
751 bool useSam = true ;
752
753 int i, len ;
754 int startLocation ;
755 bool flagRemove = false ;
756 char prevChrome[103] ;
757
758 anchorBoth = false ;
759
760 flagPrintJunction = false ;
761 flagPrintAll = false ;
762 flagStrict = false ;
763 junctionCnt = 0 ;
764 bool hasMateReadIdSuffix = false ;
765
766 strcpy( prevChrome, "" ) ;
767 flagRemove = true ;
768 flagPrintJunction = true ;
769 flank = 8 ;
770 filterYS = 4 ;
771
772 contradictedReads = NULL ;
773
774 // processing the argument list
775 for ( i = 1 ; i < argc ; ++i )
776 {
777 if ( !strcmp( argv[i], "-h" ) )
778 {
779 PrintHelp() ;
780 return 0 ;
781 }
782 else if ( !strcmp( argv[i], "-r" ) )
783 {
784 flagRemove = true ;
785 }
786 else if ( !strcmp( argv[i], "-j" ) )
787 {
788 strcpy( prevChrome, "" ) ;
789 flagRemove = true ;
790 flagPrintJunction = true ;
791 flank = atoi( argv[i + 1] ) ;
792 if ( i + 2 < argc && !strcmp( argv[i+2], "-B" ) )
793 {
794 anchorBoth = true ;
795 ++i ;
796 }
797 ++i ;
798 }
799 else if ( !strcmp( argv[i], "-a" ) )
800 {
801 flagPrintAll = true ;
802 }
803 else if ( !strcmp( argv[i], "--strict" ) )
804 {
805 flagStrict = true ;
806 }
807 else if ( !strcmp( argv[i], "-y" ) )
808 {
809 filterYS = atoi( argv[i + 1] ) ;
810 ++i ;
811 }
812 else if ( !strcmp( argv[i], "--hasMateIdSuffix" ) )
813 {
814 hasMateReadIdSuffix = true ;
815 ++i ;
816 }
817 else if ( i > 1 )
818 {
819 printf( "Unknown option %s\n", argv[i] ) ;
820 exit( 1 ) ;
821 }
822 }
823 if ( argc == 1 )
824 {
825 PrintHelp() ;
826 return 0 ;
827 }
828
829 len = strlen( argv[1] ) ;
830 if ( argv[1][len-3] == 'b' || argv[1][len-3] == 'B' )
831 {
832 if ( !( fpsam = samopen( argv[1], "rb", 0 ) ) )
833 return 0 ;
834
835 if ( !fpsam->header )
836 {
837 //samclose( fpsam ) ;
838 //fpsam = samopen( argv[1], "r", 0 ) ;
839 //if ( !fpsam->header )
840 //{
841 useSam = false ;
842 fp = fopen( argv[1], "r" ) ;
843 //}
844 }
845 }
846 else
847 {
848 useSam = false ;
849 fp = NULL ;
850 if ( !strcmp( argv[1], "-" ) )
851 fp = stdin ;
852 else
853 fp = fopen( argv[1], "r" ) ;
854 if ( fp == NULL )
855 {
856 printf( "Could not open file %s\n", argv[1] ) ;
857 return 0 ;
858 }
859 }
860
861 while ( 1 )
862 {
863 int flag = 0 ;
864 if ( useSam )
865 {
866 if ( b )
867 bam_destroy1( b ) ;
868 b = bam_init1() ;
869 if ( samread( fpsam, b ) <= 0 )
870 break ;
871 if ( b->core.tid >= 0 )
872 strcpy( col[2], fpsam->header->target_name[b->core.tid] ) ;
873 else
874 strcpy( col[2], "-1" ) ;
875 cigar2string( &(b->core), bam1_cigar( b ), col[5] ) ;
876 strcpy( col[0], bam1_qname( b ) ) ;
877 flag = b->core.flag ;
878 if ( bam_aux_get( b, "NH" ) )
879 {
880 /*if ( bam_aux2i( bam_aux_get(b, "NH" ) ) >= 2 )
881 {
882 secondary = true ;
883 }
884 else
885 secondary = false ;*/
886
887 NH = bam_aux2i( bam_aux_get( b, "NH" ) ) ;
888 }
889 else
890 {
891 //secondary = false ;
892 NH = 1 ;
893 }
894
895 if ( bam_aux_get( b, "NM" ) )
896 {
897 editDistance = bam_aux2i( bam_aux_get( b, "NM" ) ) ;
898 }
899 else if ( bam_aux_get( b, "nM" ) )
900 {
901 editDistance = bam_aux2i( bam_aux_get( b, "nM" ) ) ;
902 }
903 else
904 editDistance = 0 ;
905
906 mateStart = b->core.mpos + 1 ;
907 if ( b->core.mtid == b->core.tid )
908 col[6][0] = '=' ;
909 else
910 col[6][0] = '*' ;
911
912 if ( b->core.l_qseq < 20 )
913 continue ;
914
915 for ( i = 0 ; i < b->core.l_qseq ; ++i )
916 {
917 int bit = bam1_seqi( bam1_seq( b ), i ) ;
918 switch ( bit )
919 {
920 case 1: col[9][i] = 'A' ; break ;
921 case 2: col[9][i] = 'C' ; break ;
922 case 4: col[9][i] = 'G' ; break ;
923 case 8: col[9][i] = 'T' ; break ;
924 case 15: col[9][i] = 'N' ; break ;
925 default: col[9][i] = 'A' ; break ;
926 }
927 }
928 col[9][i] = '\0' ;
929
930 /*if ( flag & 0x100 )
931 secondary = true ;
932 else
933 secondary = false ;*/
934 }
935 else
936 {
937 char *p ;
938 if ( !fgets( line, LINE_SIZE, fp ) )
939 break ;
940 if ( line[0] == '\0' || line[0] == '@' )
941 continue ;
942 sscanf( line, "%s%s%s%s%s%s%s%s%s%s%s", col, col + 1, col + 2, col + 3, col + 4,
943 col + 5, col + 6, col + 7, col + 8, col + 9, col + 10 ) ;
944
945 flag = atoi( col[1] ) ;
946 if ( p = strstr( line, "NH" ) )
947 {
948 int k = 0 ;
949 p += 5 ;
950 while ( *p != ' ' && *p != '\t' && *p != '\0')
951 {
952 k = k * 10 + *p - '0' ;
953 ++p ;
954 }
955 /*if ( k >= 2 )
956 {
957 secondary = true ;
958 }
959 else
960 secondary = false ;*/
961 NH = k ;
962 }
963 else
964 {
965 //secondary = false ;
966 NH = 1 ;
967 }
968 if ( ( p = strstr( line, "NM" ) ) || ( p = strstr( line, "nM" ) ) )
969 {
970 int k = 0 ;
971 p += 5 ;
972 while ( *p != ' ' && *p != '\t' && *p != '\0')
973 {
974 k = k * 10 + *p - '0' ;
975 ++p ;
976 }
977 editDistance = k ;
978 }
979 else
980 editDistance = 0 ;
981
982 mateStart = atoi( col[7] ) ;
983 /*if ( flag & 0x100 )
984 secondary = true ;
985 else
986 secondary = false ;*/
987
988 }
989 samFlag = flag ;
990 for ( i = 0 ; col[5][i] ; ++i )
991 if ( col[5][i] == 'N' )
992 break ;
993
994 if ( !col[5][i] )
995 continue ;
996
997 // remove .1, .2 or /1, /2 suffix
998 if ( hasMateReadIdSuffix )
999 {
1000 char *s = col[0] ;
1001 int len = strlen( s ) ;
1002 if ( len >= 2 && ( s[len - 1] == '1' || s[len - 1] == '2' )
1003 && ( s[len - 2] == '.' || s[len - 2] == '/' ) )
1004 {
1005 s[len - 2] = '\0' ;
1006 }
1007 }
1008
1009 if ( useSam )
1010 {
1011 if ( bam_aux_get( b, "XS" ) )
1012 {
1013 strand = bam_aux2A( bam_aux_get( b, "XS" ) ) ;
1014 if ( bam_aux_get( b, "YS" ) )
1015 {
1016 noncanonStrandInfo = bam_aux2i( bam_aux_get( b, "YS" ) ) ;
1017 }
1018 else
1019 {
1020 noncanonStrandInfo = -1 ;
1021 }
1022 }
1023 else
1024 {
1025 strand = '?' ;
1026 noncanonStrandInfo = -1 ;
1027 }
1028 startLocation = b->core.pos + 1 ;
1029 }
1030 else
1031 {
1032 if ( strstr( line, "XS:A:-" ) ) // on negative strand
1033 strand = '-' ;
1034 else if ( strstr( line, "XS:A:+" ) )
1035 strand = '+' ;
1036 else
1037 {
1038 strand = '?' ;
1039 char *p = strstr( line, "YS:i:" ) ;
1040 if ( p != NULL )
1041 {
1042 p += 5 ;
1043 noncanonStrandInfo = atoi( p ) ;
1044 }
1045 else
1046 noncanonStrandInfo = -1 ;
1047 }
1048 startLocation = atoi( col[3] ) ;
1049 }
1050
1051 // Found the junctions from the read.
1052 if ( strcmp( prevChrome, col[2] ) )
1053 {
1054 if ( flagPrintJunction )
1055 {
1056 // Print the remaining elements in the queue
1057 i = qHead ;
1058 while ( i != qTail )
1059 {
1060 PrintJunction( prevChrome, junctionQueue[i] ) ;
1061 ++i ;
1062 if ( i >= QUEUE_SIZE )
1063 i = 0 ;
1064 }
1065 }
1066
1067 // new chromosome
1068 ClearReadTree( contradictedReads ) ;
1069 contradictedReads = NULL ;
1070 qHead = qTail = 0 ;
1071 strcpy( prevChrome, col[2] ) ;
1072 }
1073
1074 if ( flagRemove )
1075 {
1076 if ( CompareJunctions( startLocation, col[5] ) )
1077 {
1078 // Test whether this read has new junctions
1079 //++junctionCnt ;
1080 if ( !flagPrintJunction )
1081 printf( "%s", line ) ;
1082 }
1083 }
1084 else
1085 {
1086 ++junctionCnt ;
1087 printf( "%s", line ) ;
1088 }
1089 //printf( "hi2 %s\n", col[0] ) ;
1090 }
1091
1092 if ( flagPrintJunction )
1093 {
1094 // Print the remaining elements in the queue
1095 i = qHead ;
1096 while ( i != qTail )
1097 {
1098 PrintJunction( prevChrome, junctionQueue[i] ) ;
1099 ++i ;
1100 if ( i >= QUEUE_SIZE )
1101 i = 0 ;
1102 }
1103 }
1104
1105 //fprintf( stderr, "The number of junctions: %d\n", junctionCnt ) ;
1106 return 0 ;
1107 }
1108