Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/FindJunction.cpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 /* | |
2 Find junctions from the SAM file generated by Tophat2. | |
3 The SAM file should be sorted. | |
4 The junction is the start and the end coordinates of the spliced region in this program. The output of the junction is a bit different. | |
5 | |
6 Usage: ./a.out input [option] >output. | |
7 ./a.out -h for help. | |
8 */ | |
9 #include <stdio.h> | |
10 #include <string.h> | |
11 #include <stdlib.h> | |
12 | |
13 #include "sam.h" | |
14 | |
15 #define LINE_SIZE 4097 | |
16 #define QUEUE_SIZE 10001 | |
17 #define HASH_MAX 1000003 | |
18 | |
19 struct _readTree | |
20 { | |
21 char id[256] ; | |
22 int leftAnchor, rightAnchor ; | |
23 int pos ; | |
24 //bool secondary ; | |
25 bool valid ; | |
26 int editDistance ; | |
27 int NH, cnt ; // if cnt < NH, then it has real secondary match for this splice junction | |
28 struct _readTree *left, *right ; | |
29 | |
30 int flag ;// The flag from sam head. | |
31 } ; | |
32 | |
33 // The structure of a junction | |
34 struct _junction | |
35 { | |
36 int start, end ; | |
37 int readCnt ; // The # of reads containing this junction | |
38 int secReadCnt ; // The # of reads whose secondary alignment has this junction. | |
39 char strand ; // On '+' or '-' strand | |
40 int leftAnchor, rightAnchor ; // The longest left and right anchor | |
41 int oppositeAnchor ; // The longest anchor of the shorter side. | |
42 int uniqEditDistance, secEditDistance ; | |
43 struct _readTree head ; | |
44 } ; | |
45 | |
46 char line[LINE_SIZE] ; | |
47 char col[11][LINE_SIZE] ; // The option fields is not needed. | |
48 char strand ; // Extract XS field | |
49 char noncanonStrandInfo ; | |
50 //bool secondary ; | |
51 int NH ; | |
52 int editDistance ; | |
53 int mateStart ; | |
54 int filterYS ; | |
55 int samFlag ; | |
56 | |
57 struct _junction junctionQueue[QUEUE_SIZE] ; // Expected only a few junctions in it for each read. This queue is sorted. | |
58 | |
59 int qHead, qTail ; | |
60 | |
61 bool flagPrintJunction ; | |
62 bool flagPrintAll ; | |
63 bool flagStrict ; | |
64 int junctionCnt ; | |
65 bool anchorBoth ; | |
66 bool validRead ; | |
67 | |
68 int flank ; | |
69 struct _cigarSeg | |
70 { | |
71 int len ; | |
72 char type ; | |
73 } ; | |
74 | |
75 struct _readTree *contradictedReads ; | |
76 | |
77 char nucToNum[26] = { 0, 4, 1, 4, 4, 4, 2, | |
78 4, 4, 4, 4, 4, 4, 4, | |
79 4, 4, 4, 4, 4, 3, | |
80 4, 4, 4, 4, 4, 4 } ; | |
81 | |
82 void PrintHelp() | |
83 { | |
84 printf( | |
85 "Prints reads from the SAM/BAM file that containing junctions.\n" | |
86 "Usage: ./a.out input [option]>output\n" | |
87 "Options:\n" | |
88 "\t-j xx [-B]: Output the junctions using 1-based coordinates. The format is \"reference id\" start end \"# of read\" strand.(They are sorted)\n and the xx is an integer means the maximum unqualified anchor length for a splice junction(default=8). If -B, the splice junction must be supported by a read whose both anchors are longer than xx.\n" | |
89 "\t-a: Output all the junctions, and use non-positive support number to indicate unqualified junctions.\n" | |
90 "\t-y: If the bits from YS field of bam matches the argument, we filter the alignment (default: 4).\n" | |
91 ) ; | |
92 } | |
93 | |
94 void GetJunctionInfo( struct _junction &junc, struct _readTree *p ) | |
95 { | |
96 if ( p == NULL ) | |
97 return ; | |
98 | |
99 if ( p->valid ) | |
100 { | |
101 //if ( junc.start == 22381343 + 1 && junc.end == 22904987 - 1 ) | |
102 // printf( "%s %d %d %d\n", p->id, p->leftAnchor, p->rightAnchor, p->flag ) ; | |
103 | |
104 | |
105 | |
106 if ( p->cnt < p->NH ) | |
107 { | |
108 junc.secEditDistance += p->editDistance ; | |
109 ++junc.secReadCnt ; | |
110 } | |
111 else | |
112 { | |
113 junc.uniqEditDistance += p->editDistance ; | |
114 ++junc.readCnt ; | |
115 } | |
116 int l = p->leftAnchor, r = p->rightAnchor ; | |
117 | |
118 if ( !anchorBoth ) | |
119 { | |
120 if ( l > junc.leftAnchor ) | |
121 junc.leftAnchor = l ; | |
122 if ( r > junc.rightAnchor ) | |
123 junc.rightAnchor = r ; | |
124 | |
125 if ( l <= r && l > junc.oppositeAnchor ) | |
126 junc.oppositeAnchor = l ; | |
127 else if ( r < l && r > junc.oppositeAnchor ) | |
128 junc.oppositeAnchor = r ; | |
129 } | |
130 else | |
131 { | |
132 if ( l > flank && r > flank ) | |
133 { | |
134 junc.leftAnchor = l ; | |
135 junc.rightAnchor = r ; | |
136 } | |
137 | |
138 if ( l <= r && l > junc.oppositeAnchor ) | |
139 junc.oppositeAnchor = l ; | |
140 else if ( r < l && r > junc.oppositeAnchor ) | |
141 junc.oppositeAnchor = r ; | |
142 } | |
143 } | |
144 GetJunctionInfo( junc, p->left ) ; | |
145 GetJunctionInfo( junc, p->right ) ; | |
146 } | |
147 | |
148 void PrintJunctionReads( struct _junction &junc, struct _readTree *p ) | |
149 { | |
150 if ( p == NULL ) | |
151 return ; | |
152 | |
153 if ( p->valid ) | |
154 printf( "%s\n", p->id ) ; | |
155 PrintJunctionReads( junc, p->left ) ; | |
156 PrintJunctionReads( junc, p->right ) ; | |
157 } | |
158 | |
159 void PrintJunction( char *chrome, struct _junction &junc ) | |
160 { | |
161 int sum ; | |
162 junc.leftAnchor = 0 ; | |
163 junc.rightAnchor = 0 ; | |
164 junc.oppositeAnchor = 0 ; | |
165 junc.readCnt = 0 ; | |
166 junc.secReadCnt = 0 ; | |
167 junc.uniqEditDistance = 0 ; | |
168 junc.secEditDistance = 0 ; | |
169 GetJunctionInfo( junc, &junc.head ) ; | |
170 | |
171 sum = junc.readCnt + junc.secReadCnt ; | |
172 | |
173 if ( junc.leftAnchor <= flank || junc.rightAnchor <= flank || ( junc.readCnt + junc.secReadCnt <= 0 ) ) | |
174 { | |
175 if ( flagPrintAll ) | |
176 { | |
177 sum = -sum ; | |
178 } | |
179 else | |
180 return ; | |
181 } | |
182 | |
183 /*if ( junc.end - junc.start + 1 >= 200000 ) | |
184 { | |
185 if ( junc.secReadCnt > 0 ) | |
186 junc.secReadCnt = 1 ; | |
187 }*/ | |
188 | |
189 /*if ( junc.oppositeAnchor <= ( ( flank / 2 < 1 ) ? flank / 2 : 1 ) && ( junc.readCnt + junc.secReadCnt ) <= 10 ) | |
190 { | |
191 if ( junc.readCnt > 0 ) | |
192 junc.readCnt = 1 ; | |
193 if ( junc.secReadCnt > 0 ) | |
194 junc.secReadCnt = 0 ; | |
195 }*/ | |
196 | |
197 printf( "%s %d %d %d %c %d %d %d %d\n", chrome, junc.start - 1, junc.end + 1, sum, junc.strand, | |
198 junc.readCnt, junc.secReadCnt, junc.uniqEditDistance, junc.secEditDistance ) ; | |
199 //PrintJunctionReads( junc, &junc.head ) ; | |
200 } | |
201 | |
202 void ClearReadTree( struct _readTree *p ) | |
203 { | |
204 if ( p == NULL ) | |
205 return ; | |
206 ClearReadTree( p->left ) ; | |
207 ClearReadTree( p->right ) ; | |
208 free( p ) ; | |
209 } | |
210 | |
211 // Insert to the read tree | |
212 bool InsertReadTree( struct _readTree *p, char *id, int l, int r ) | |
213 { | |
214 int tmp = strcmp( p->id, id ) ; | |
215 if ( tmp == 0 ) | |
216 { | |
217 p->cnt += 1 ; | |
218 return true ; | |
219 } | |
220 else if ( tmp < 0 ) | |
221 { | |
222 if ( p->left ) | |
223 return InsertReadTree( p->left, id, l, r ) ; | |
224 else | |
225 { | |
226 p->left = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; | |
227 strcpy( p->left->id, id ) ; | |
228 p->left->leftAnchor = l ; | |
229 p->left->rightAnchor = r ; | |
230 //p->left->secondary = secondary ; | |
231 p->left->editDistance = editDistance ; | |
232 p->left->left = p->left->right = NULL ; | |
233 p->left->valid = validRead ; | |
234 p->left->cnt = 1 ; | |
235 p->left->NH = NH ; | |
236 p->left->flag = samFlag ; | |
237 return false ; | |
238 } | |
239 } | |
240 else | |
241 { | |
242 if ( p->right ) | |
243 return InsertReadTree( p->right, id, l, r ) ; | |
244 else | |
245 { | |
246 p->right = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; | |
247 strcpy( p->right->id, id ) ; | |
248 p->right->leftAnchor = l ; | |
249 p->right->rightAnchor = r ; | |
250 //p->right->secondary = secondary ; | |
251 p->right->editDistance = editDistance ; | |
252 p->right->left = p->right->right = NULL ; | |
253 p->right->valid = validRead ; | |
254 p->right->cnt = 1 ; | |
255 p->right->NH = NH ; | |
256 p->right->flag = samFlag ; | |
257 return false ; | |
258 } | |
259 } | |
260 } | |
261 | |
262 | |
263 bool SearchContradictedReads( struct _readTree *p, char *id, int pos ) | |
264 { | |
265 if ( p == NULL ) | |
266 return false ; | |
267 int tmp = strcmp( p->id, id ) ; | |
268 if ( tmp == 0 && p->pos == pos ) | |
269 return true ; | |
270 else if ( tmp <= 0 ) | |
271 return SearchContradictedReads( p->left, id, pos ) ; | |
272 else | |
273 return SearchContradictedReads( p->right, id, pos ) ; | |
274 } | |
275 | |
276 bool InsertContradictedReads( struct _readTree *p, char *id, int pos ) | |
277 { | |
278 if ( p == NULL ) | |
279 { | |
280 contradictedReads = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; | |
281 strcpy( contradictedReads->id, id ) ; | |
282 contradictedReads->pos = pos ; | |
283 contradictedReads->left = contradictedReads->right = NULL ; | |
284 return false ; | |
285 } | |
286 int tmp = strcmp( p->id, id ) ; | |
287 if ( tmp == 0 && p->pos == pos ) | |
288 return true ; | |
289 else if ( tmp <= 0 ) | |
290 { | |
291 if ( p->left ) | |
292 return InsertContradictedReads( p->left, id, pos ) ; | |
293 else | |
294 { | |
295 p->left = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; | |
296 strcpy( p->left->id, id ) ; | |
297 p->left->pos = pos ; | |
298 p->left->left = p->left->right = NULL ; | |
299 return false ; | |
300 } | |
301 } | |
302 else | |
303 { | |
304 if ( p->right ) | |
305 return InsertContradictedReads( p->right, id, pos ) ; | |
306 else | |
307 { | |
308 p->right = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; | |
309 strcpy( p->right->id, id ) ; | |
310 p->right->pos = pos ; | |
311 p->right->left = p->right->right = NULL ; | |
312 return false ; | |
313 } | |
314 } | |
315 } | |
316 | |
317 struct _readTree *GetReadTreeNode( struct _readTree *p, char *id ) | |
318 { | |
319 if ( p == NULL ) | |
320 return NULL ; | |
321 int tmp = strcmp( p->id, id ) ; | |
322 if ( tmp == 0 ) | |
323 return p ; | |
324 else if ( tmp < 0 ) | |
325 return GetReadTreeNode( p->left, id ) ; | |
326 else | |
327 return GetReadTreeNode( p->right, id ) ; | |
328 } | |
329 | |
330 // Insert the new junction into the queue, | |
331 // and make sure the queue is sorted. | |
332 // Assume each junction range is only on one strand. | |
333 // l, r is the left and right anchor from a read | |
334 void InsertQueue( int start, int end, int l, int r ) | |
335 { | |
336 int i, j ; | |
337 i = qTail ; | |
338 | |
339 | |
340 while ( i != qHead ) | |
341 { | |
342 j = i - 1 ; | |
343 if ( j < 0 ) | |
344 j = QUEUE_SIZE - 1 ; | |
345 | |
346 if ( junctionQueue[j].start < start | |
347 || ( junctionQueue[j].start == start && junctionQueue[j].end < end ) ) | |
348 break ; | |
349 | |
350 junctionQueue[i] = junctionQueue[j] ; | |
351 --i ; | |
352 | |
353 if ( i < 0 ) | |
354 i = QUEUE_SIZE - 1 ; | |
355 } | |
356 | |
357 junctionQueue[i].start = start ; | |
358 junctionQueue[i].end = end ; | |
359 /*if ( !secondary ) | |
360 { | |
361 junctionQueue[i].readCnt = 1 ; | |
362 junctionQueue[i].secReadCnt = 0 ; | |
363 } | |
364 else | |
365 { | |
366 junctionQueue[i].readCnt = 0 ; | |
367 junctionQueue[i].secReadCnt = 1 ; | |
368 }*/ | |
369 junctionQueue[i].strand = strand ; | |
370 junctionQueue[i].leftAnchor = l ; | |
371 junctionQueue[i].rightAnchor = r ; | |
372 | |
373 strcpy( junctionQueue[i].head.id, col[0] ) ; | |
374 junctionQueue[i].head.valid = validRead ; | |
375 junctionQueue[i].head.leftAnchor = l ; | |
376 junctionQueue[i].head.rightAnchor = r ; | |
377 junctionQueue[i].head.left = junctionQueue[i].head.right = NULL ; | |
378 //junctionQueue[i].head.secondary = secondary ; | |
379 junctionQueue[i].head.NH = NH ; | |
380 junctionQueue[i].head.cnt = 1 ; | |
381 junctionQueue[i].head.editDistance = editDistance ; | |
382 | |
383 ++qTail ; | |
384 if ( qTail >= QUEUE_SIZE ) | |
385 qTail = 0 ; | |
386 } | |
387 | |
388 // Search the queue, and remove the heads if its end(start) is smaller than prune.(Because it is impossible to have that junction again) | |
389 // Add the junction to the queue, if it is not in it. | |
390 // Return true, if it finds the junction. Otherwise, return false. | |
391 bool SearchQueue( int start, int end, int prune, int l, int r ) | |
392 { | |
393 int i ; | |
394 | |
395 // Test whether this read might be a false alignment. | |
396 i = qHead ; | |
397 while ( i != qTail ) | |
398 { | |
399 struct _readTree *rt ; | |
400 if ( ( junctionQueue[i].start == start && junctionQueue[i].end < end ) || | |
401 ( junctionQueue[i].start > start && junctionQueue[i].end == end ) ) | |
402 { | |
403 // This alignment is false ; | |
404 rt = GetReadTreeNode( &junctionQueue[i].head, col[0] ) ; | |
405 // the commented out logic because it is handled by contradicted reads | |
406 if ( rt != NULL ) //&& ( rt->flag & 0x40 ) != ( samFlag & 0x40 ) ) | |
407 { | |
408 if ( rt->leftAnchor <= flank || rt->rightAnchor <= flank )//|| rt->secondary ) | |
409 { | |
410 if ( l > flank && r > flank )//&& !secondary ) | |
411 rt->valid = false ; | |
412 else | |
413 validRead = false ; | |
414 } | |
415 else | |
416 { | |
417 // Ignore this read | |
418 //return true ; | |
419 validRead = false ; | |
420 } | |
421 } | |
422 } | |
423 else if ( ( junctionQueue[i].start == start && junctionQueue[i].end > end ) || | |
424 ( junctionQueue[i].start < start && junctionQueue[i].end == end ) ) | |
425 { | |
426 // This other alignment is false ; | |
427 rt = GetReadTreeNode( &junctionQueue[i].head, col[0] ) ; | |
428 //if ( rt != NULL ) | |
429 if ( rt != NULL ) //&& ( rt->flag & 0x40 ) != ( samFlag & 0x40 ) ) | |
430 { | |
431 if ( l <= flank || r <= flank )//|| secondary ) | |
432 { | |
433 if ( rt->leftAnchor > flank && rt->rightAnchor > flank )//&& !rt->secondary ) | |
434 validRead = false ; | |
435 else | |
436 rt->valid = false ; | |
437 } | |
438 else | |
439 rt->valid = false ; | |
440 } | |
441 } | |
442 ++i ; | |
443 if ( i >= QUEUE_SIZE ) | |
444 i = 0 ; | |
445 } | |
446 | |
447 //if ( start == 8077108 && end == 8078439 ) | |
448 // exit( 1 ) ; | |
449 i = qHead ; | |
450 | |
451 while ( i != qTail ) | |
452 { | |
453 if ( junctionQueue[i].start == start && | |
454 junctionQueue[i].end == end ) | |
455 { | |
456 if (junctionQueue[i].strand == '?' && junctionQueue[i].strand != strand ) | |
457 { | |
458 junctionQueue[i].strand = strand ; | |
459 } | |
460 InsertReadTree( &junctionQueue[i].head, col[0], l, r ) ; | |
461 return true ; | |
462 } | |
463 | |
464 if ( junctionQueue[i].end < prune && i == qHead ) | |
465 { | |
466 // pop | |
467 PrintJunction( col[2], junctionQueue[i] ) ; | |
468 ClearReadTree( junctionQueue[i].head.left ) ; | |
469 ClearReadTree( junctionQueue[i].head.right ) ; | |
470 ++qHead ; | |
471 if ( qHead >= QUEUE_SIZE ) | |
472 qHead = 0 ; | |
473 } | |
474 ++i ; | |
475 if ( i >= QUEUE_SIZE ) | |
476 i = 0 ; | |
477 } | |
478 | |
479 InsertQueue( start, end, l, r ) ; | |
480 return false ; | |
481 } | |
482 | |
483 // Compute the junctions based on the CIGAR | |
484 bool CompareJunctions( int startLocation, char *cigar ) | |
485 { | |
486 int currentLocation = startLocation ; // Current location on the reference genome | |
487 int i, j ; | |
488 int num ; | |
489 int newJuncCnt = 0 ; // The # of junctions in the read, and the # of new junctions among them. | |
490 | |
491 struct _cigarSeg cigarSeg[1000] ; // A segment of the cigar. | |
492 int ccnt = 0 ; // cigarSeg cnt | |
493 | |
494 j = 0 ; | |
495 num = 0 ; | |
496 validRead = true ; | |
497 | |
498 for ( i = 0 ; cigar[i] ; ++i ) | |
499 { | |
500 if ( cigar[i] >= '0' && cigar[i] <= '9' ) | |
501 { | |
502 num = num * 10 + cigar[i] - '0' ; | |
503 } | |
504 else | |
505 { | |
506 cigarSeg[ccnt].len = num ; | |
507 cigarSeg[ccnt].type = cigar[i] ; | |
508 ++ccnt ; | |
509 num = 0 ; | |
510 } | |
511 } | |
512 | |
513 // Filter low complex alignment. | |
514 // Only applies this to alignments does not have a strand information. | |
515 if ( strand == '?' ) | |
516 { | |
517 if ( noncanonStrandInfo != -1 ) | |
518 { | |
519 if ( ( noncanonStrandInfo & filterYS ) != 0 ) | |
520 validRead = false ; | |
521 } | |
522 else | |
523 { | |
524 int softStart = -1 ; | |
525 int softEnd = 0 ; | |
526 if ( cigarSeg[0].type == 'S' ) | |
527 softStart = cigarSeg[0].len ; | |
528 if ( cigarSeg[ ccnt - 1 ].type == 'S' ) | |
529 softEnd = cigarSeg[ ccnt - 1 ].len ; | |
530 int readLen = strlen( col[9] ) ; | |
531 int count[5] = { 0, 0, 0, 0, 0 } ; | |
532 | |
533 int pos = 0 ; | |
534 for ( i = 0 ; i < ccnt ; ++i ) | |
535 { | |
536 switch ( cigarSeg[i].type ) | |
537 { | |
538 case 'S': | |
539 pos += cigarSeg[i].len ; | |
540 case 'M': | |
541 case 'I': | |
542 { | |
543 for ( j = 0 ; j < cigarSeg[i].len ; ++j ) | |
544 ++count[ nucToNum[ col[9][pos + j] - 'A' ] ] ; | |
545 pos += j ; | |
546 } break ; | |
547 case 'N': | |
548 { | |
549 int max = 0 ; | |
550 int sum = 0 ; | |
551 for ( j = 0 ; j < 5 ; ++j ) | |
552 { | |
553 if ( count[j] > max ) | |
554 max = count[j] ; | |
555 sum += count[j] ; | |
556 } | |
557 if ( max > 0.8 * sum ) | |
558 validRead = false ; | |
559 count[0] = count[1] = count[2] = count[3] = count[4] = 0 ; | |
560 } break ; | |
561 case 'H': | |
562 case 'P': | |
563 case 'D': | |
564 default: break ; | |
565 } | |
566 } | |
567 | |
568 int max = 0 ; | |
569 int sum = 0 ; | |
570 for ( j = 0 ; j < 5 ; ++j ) | |
571 { | |
572 if ( count[j] > max ) | |
573 max = count[j] ; | |
574 sum += count[j] ; | |
575 } | |
576 if ( max > 0.8 * sum ) | |
577 validRead = false ; | |
578 /* count[0] = count[1] = count[2] = count[3] = count[4] = 0 ; | |
579 | |
580 | |
581 for ( i = softStart + 1 ; i < readLen - softEnd ; ++i ) | |
582 { | |
583 switch ( col[9][i] ) | |
584 { | |
585 case 'A': ++count[0] ; break ; | |
586 case 'C': ++count[1] ; break ; | |
587 case 'G': ++count[2] ; break ; | |
588 case 'T': ++count[3] ; break ; | |
589 default: ++count[4] ; | |
590 } | |
591 } | |
592 int max = 0 ; | |
593 for ( j = 0 ; j < 5 ; ++j ) | |
594 if ( count[j] > max ) | |
595 max = count[j] ; | |
596 if ( max > 0.6 * ( readLen - softEnd - softStart - 1 ) ) | |
597 validRead = false ;*/ | |
598 } | |
599 } | |
600 | |
601 // Test whether contradict with mate pair | |
602 if ( col[6][0] == '=' ) | |
603 { | |
604 currentLocation = startLocation ; | |
605 for ( i = 0 ; i < ccnt ; ++i ) | |
606 { | |
607 if ( cigarSeg[i].type == 'I' || cigarSeg[i].type == 'S' || cigarSeg[i].type == 'H' | |
608 || cigarSeg[i].type == 'P' ) | |
609 continue ; | |
610 else if ( cigarSeg[i].type == 'N' ) | |
611 { | |
612 if ( mateStart >= currentLocation && mateStart <= currentLocation + cigarSeg[i].len - 1 ) | |
613 { | |
614 /*if ( cigarSeg[i].len == 91486 ) | |
615 { | |
616 printf( "%s %d %d\n", currentLocation, mateStart ) ; | |
617 exit( 1 ) ; | |
618 }*/ | |
619 // ignore this read | |
620 //return false ; | |
621 InsertContradictedReads( contradictedReads, col[0], mateStart ) ; | |
622 validRead = false ; | |
623 break ; | |
624 } | |
625 } | |
626 currentLocation += cigarSeg[i].len ; | |
627 } | |
628 | |
629 i = qHead ; | |
630 // Search if it falls in the splices junction created by its mate. | |
631 while ( i != qTail ) | |
632 { | |
633 if ( mateStart < junctionQueue[i].start && junctionQueue[i].start <= startLocation | |
634 && startLocation <= junctionQueue[i].end | |
635 && SearchContradictedReads( contradictedReads, col[0], startLocation ) ) | |
636 { | |
637 validRead = false ; | |
638 break ; | |
639 } | |
640 ++i ; | |
641 if ( i >= QUEUE_SIZE ) | |
642 i = 0 ; | |
643 } | |
644 } | |
645 | |
646 currentLocation = startLocation ; | |
647 for ( i = 0 ; i < ccnt ; ++i ) | |
648 { | |
649 if ( cigarSeg[i].type == 'I' || cigarSeg[i].type == 'S' || cigarSeg[i].type == 'H' | |
650 || cigarSeg[i].type == 'P' ) | |
651 continue ; | |
652 else if ( cigarSeg[i].type == 'N' ) | |
653 { | |
654 int left, right ; | |
655 int tmp ; | |
656 tmp = i ; | |
657 while ( i > 0 && ( cigarSeg[i - 1].type == 'I' || cigarSeg[i - 1].type == 'D' ) ) | |
658 --i ; | |
659 if ( i > 0 && cigarSeg[i - 1].type == 'M' ) | |
660 { | |
661 left = cigarSeg[i - 1].len ; | |
662 if ( i >= 2 && cigarSeg[i - 2].type == 'N' && left <= flank ) | |
663 { | |
664 left = flank + 1 ; | |
665 } | |
666 } | |
667 else | |
668 left = 0 ; | |
669 | |
670 i = tmp ; | |
671 while ( i < ccnt && ( cigarSeg[i + 1].type == 'I' || cigarSeg[i + 1].type == 'D' ) ) | |
672 ++i ; | |
673 if ( i < ccnt && cigarSeg[i + 1].type == 'M' ) | |
674 { | |
675 right = cigarSeg[i + 1].len ; | |
676 if ( i + 2 < ccnt && cigarSeg[i + 2].type == 'N' && right <= flank ) | |
677 { | |
678 right = flank + 1 ; | |
679 } | |
680 } | |
681 else | |
682 right = 0 ; | |
683 i = tmp ; | |
684 if ( !SearchQueue( currentLocation, currentLocation + cigarSeg[i].len - 1, startLocation, left, right ) ) | |
685 { | |
686 ++newJuncCnt ; | |
687 } | |
688 } | |
689 currentLocation += cigarSeg[i].len ; | |
690 } | |
691 /*else if ( cigar[i] == 'I' ) | |
692 { | |
693 num = 0 ; | |
694 } | |
695 else if ( cigar[i] == 'N' ) | |
696 { | |
697 if ( !SearchQueue( currentLocation, currentLocation + num - 1, startLocation ) ) | |
698 { | |
699 ++newJuncCnt ; | |
700 //if ( flagPrintJunction ) | |
701 // printf( "%s %d %d\n", col[2], currentLocation - 2, currentLocation + len - 1 ) ; | |
702 } | |
703 currentLocation += num ; | |
704 num = 0 ; | |
705 } | |
706 else // Other operations, like M, D,..., | |
707 { | |
708 currentLocation += num ; | |
709 num = 0 ; | |
710 }*/ | |
711 | |
712 junctionCnt += newJuncCnt ; | |
713 | |
714 if ( newJuncCnt ) | |
715 return true ; | |
716 else | |
717 return false ; | |
718 } | |
719 | |
720 void cigar2string( bam1_core_t *c, uint32_t *in_cigar, char *out_cigar ) | |
721 { | |
722 int k, op, l ; | |
723 char opcode ; | |
724 | |
725 *out_cigar = '\0' ; | |
726 for ( k = 0 ; k < c->n_cigar ; ++k ) | |
727 { | |
728 op = in_cigar[k] & BAM_CIGAR_MASK ; | |
729 l = in_cigar[k] >> BAM_CIGAR_SHIFT ; | |
730 switch (op) | |
731 { | |
732 case BAM_CMATCH: opcode = 'M' ; break ; | |
733 case BAM_CINS: opcode = 'I' ; break ; | |
734 case BAM_CDEL: opcode = 'D' ; break ; | |
735 case BAM_CREF_SKIP: opcode = 'N' ; break ; | |
736 case BAM_CSOFT_CLIP: opcode = 'S' ; break ; | |
737 case BAM_CHARD_CLIP: opcode = 'H' ; break ; | |
738 case BAM_CPAD: opcode = 'P' ; break ; | |
739 } | |
740 sprintf( out_cigar + strlen( out_cigar ), "%d%c", l, opcode ) ; | |
741 } | |
742 } | |
743 | |
744 | |
745 | |
746 int main( int argc, char *argv[] ) | |
747 { | |
748 FILE *fp ; | |
749 samfile_t *fpsam ; | |
750 bam1_t *b = NULL ; | |
751 bool useSam = true ; | |
752 | |
753 int i, len ; | |
754 int startLocation ; | |
755 bool flagRemove = false ; | |
756 char prevChrome[103] ; | |
757 | |
758 anchorBoth = false ; | |
759 | |
760 flagPrintJunction = false ; | |
761 flagPrintAll = false ; | |
762 flagStrict = false ; | |
763 junctionCnt = 0 ; | |
764 bool hasMateReadIdSuffix = false ; | |
765 | |
766 strcpy( prevChrome, "" ) ; | |
767 flagRemove = true ; | |
768 flagPrintJunction = true ; | |
769 flank = 8 ; | |
770 filterYS = 4 ; | |
771 | |
772 contradictedReads = NULL ; | |
773 | |
774 // processing the argument list | |
775 for ( i = 1 ; i < argc ; ++i ) | |
776 { | |
777 if ( !strcmp( argv[i], "-h" ) ) | |
778 { | |
779 PrintHelp() ; | |
780 return 0 ; | |
781 } | |
782 else if ( !strcmp( argv[i], "-r" ) ) | |
783 { | |
784 flagRemove = true ; | |
785 } | |
786 else if ( !strcmp( argv[i], "-j" ) ) | |
787 { | |
788 strcpy( prevChrome, "" ) ; | |
789 flagRemove = true ; | |
790 flagPrintJunction = true ; | |
791 flank = atoi( argv[i + 1] ) ; | |
792 if ( i + 2 < argc && !strcmp( argv[i+2], "-B" ) ) | |
793 { | |
794 anchorBoth = true ; | |
795 ++i ; | |
796 } | |
797 ++i ; | |
798 } | |
799 else if ( !strcmp( argv[i], "-a" ) ) | |
800 { | |
801 flagPrintAll = true ; | |
802 } | |
803 else if ( !strcmp( argv[i], "--strict" ) ) | |
804 { | |
805 flagStrict = true ; | |
806 } | |
807 else if ( !strcmp( argv[i], "-y" ) ) | |
808 { | |
809 filterYS = atoi( argv[i + 1] ) ; | |
810 ++i ; | |
811 } | |
812 else if ( !strcmp( argv[i], "--hasMateIdSuffix" ) ) | |
813 { | |
814 hasMateReadIdSuffix = true ; | |
815 ++i ; | |
816 } | |
817 else if ( i > 1 ) | |
818 { | |
819 printf( "Unknown option %s\n", argv[i] ) ; | |
820 exit( 1 ) ; | |
821 } | |
822 } | |
823 if ( argc == 1 ) | |
824 { | |
825 PrintHelp() ; | |
826 return 0 ; | |
827 } | |
828 | |
829 len = strlen( argv[1] ) ; | |
830 if ( argv[1][len-3] == 'b' || argv[1][len-3] == 'B' ) | |
831 { | |
832 if ( !( fpsam = samopen( argv[1], "rb", 0 ) ) ) | |
833 return 0 ; | |
834 | |
835 if ( !fpsam->header ) | |
836 { | |
837 //samclose( fpsam ) ; | |
838 //fpsam = samopen( argv[1], "r", 0 ) ; | |
839 //if ( !fpsam->header ) | |
840 //{ | |
841 useSam = false ; | |
842 fp = fopen( argv[1], "r" ) ; | |
843 //} | |
844 } | |
845 } | |
846 else | |
847 { | |
848 useSam = false ; | |
849 fp = NULL ; | |
850 if ( !strcmp( argv[1], "-" ) ) | |
851 fp = stdin ; | |
852 else | |
853 fp = fopen( argv[1], "r" ) ; | |
854 if ( fp == NULL ) | |
855 { | |
856 printf( "Could not open file %s\n", argv[1] ) ; | |
857 return 0 ; | |
858 } | |
859 } | |
860 | |
861 while ( 1 ) | |
862 { | |
863 int flag = 0 ; | |
864 if ( useSam ) | |
865 { | |
866 if ( b ) | |
867 bam_destroy1( b ) ; | |
868 b = bam_init1() ; | |
869 if ( samread( fpsam, b ) <= 0 ) | |
870 break ; | |
871 if ( b->core.tid >= 0 ) | |
872 strcpy( col[2], fpsam->header->target_name[b->core.tid] ) ; | |
873 else | |
874 strcpy( col[2], "-1" ) ; | |
875 cigar2string( &(b->core), bam1_cigar( b ), col[5] ) ; | |
876 strcpy( col[0], bam1_qname( b ) ) ; | |
877 flag = b->core.flag ; | |
878 if ( bam_aux_get( b, "NH" ) ) | |
879 { | |
880 /*if ( bam_aux2i( bam_aux_get(b, "NH" ) ) >= 2 ) | |
881 { | |
882 secondary = true ; | |
883 } | |
884 else | |
885 secondary = false ;*/ | |
886 | |
887 NH = bam_aux2i( bam_aux_get( b, "NH" ) ) ; | |
888 } | |
889 else | |
890 { | |
891 //secondary = false ; | |
892 NH = 1 ; | |
893 } | |
894 | |
895 if ( bam_aux_get( b, "NM" ) ) | |
896 { | |
897 editDistance = bam_aux2i( bam_aux_get( b, "NM" ) ) ; | |
898 } | |
899 else if ( bam_aux_get( b, "nM" ) ) | |
900 { | |
901 editDistance = bam_aux2i( bam_aux_get( b, "nM" ) ) ; | |
902 } | |
903 else | |
904 editDistance = 0 ; | |
905 | |
906 mateStart = b->core.mpos + 1 ; | |
907 if ( b->core.mtid == b->core.tid ) | |
908 col[6][0] = '=' ; | |
909 else | |
910 col[6][0] = '*' ; | |
911 | |
912 if ( b->core.l_qseq < 20 ) | |
913 continue ; | |
914 | |
915 for ( i = 0 ; i < b->core.l_qseq ; ++i ) | |
916 { | |
917 int bit = bam1_seqi( bam1_seq( b ), i ) ; | |
918 switch ( bit ) | |
919 { | |
920 case 1: col[9][i] = 'A' ; break ; | |
921 case 2: col[9][i] = 'C' ; break ; | |
922 case 4: col[9][i] = 'G' ; break ; | |
923 case 8: col[9][i] = 'T' ; break ; | |
924 case 15: col[9][i] = 'N' ; break ; | |
925 default: col[9][i] = 'A' ; break ; | |
926 } | |
927 } | |
928 col[9][i] = '\0' ; | |
929 | |
930 /*if ( flag & 0x100 ) | |
931 secondary = true ; | |
932 else | |
933 secondary = false ;*/ | |
934 } | |
935 else | |
936 { | |
937 char *p ; | |
938 if ( !fgets( line, LINE_SIZE, fp ) ) | |
939 break ; | |
940 if ( line[0] == '\0' || line[0] == '@' ) | |
941 continue ; | |
942 sscanf( line, "%s%s%s%s%s%s%s%s%s%s%s", col, col + 1, col + 2, col + 3, col + 4, | |
943 col + 5, col + 6, col + 7, col + 8, col + 9, col + 10 ) ; | |
944 | |
945 flag = atoi( col[1] ) ; | |
946 if ( p = strstr( line, "NH" ) ) | |
947 { | |
948 int k = 0 ; | |
949 p += 5 ; | |
950 while ( *p != ' ' && *p != '\t' && *p != '\0') | |
951 { | |
952 k = k * 10 + *p - '0' ; | |
953 ++p ; | |
954 } | |
955 /*if ( k >= 2 ) | |
956 { | |
957 secondary = true ; | |
958 } | |
959 else | |
960 secondary = false ;*/ | |
961 NH = k ; | |
962 } | |
963 else | |
964 { | |
965 //secondary = false ; | |
966 NH = 1 ; | |
967 } | |
968 if ( ( p = strstr( line, "NM" ) ) || ( p = strstr( line, "nM" ) ) ) | |
969 { | |
970 int k = 0 ; | |
971 p += 5 ; | |
972 while ( *p != ' ' && *p != '\t' && *p != '\0') | |
973 { | |
974 k = k * 10 + *p - '0' ; | |
975 ++p ; | |
976 } | |
977 editDistance = k ; | |
978 } | |
979 else | |
980 editDistance = 0 ; | |
981 | |
982 mateStart = atoi( col[7] ) ; | |
983 /*if ( flag & 0x100 ) | |
984 secondary = true ; | |
985 else | |
986 secondary = false ;*/ | |
987 | |
988 } | |
989 samFlag = flag ; | |
990 for ( i = 0 ; col[5][i] ; ++i ) | |
991 if ( col[5][i] == 'N' ) | |
992 break ; | |
993 | |
994 if ( !col[5][i] ) | |
995 continue ; | |
996 | |
997 // remove .1, .2 or /1, /2 suffix | |
998 if ( hasMateReadIdSuffix ) | |
999 { | |
1000 char *s = col[0] ; | |
1001 int len = strlen( s ) ; | |
1002 if ( len >= 2 && ( s[len - 1] == '1' || s[len - 1] == '2' ) | |
1003 && ( s[len - 2] == '.' || s[len - 2] == '/' ) ) | |
1004 { | |
1005 s[len - 2] = '\0' ; | |
1006 } | |
1007 } | |
1008 | |
1009 if ( useSam ) | |
1010 { | |
1011 if ( bam_aux_get( b, "XS" ) ) | |
1012 { | |
1013 strand = bam_aux2A( bam_aux_get( b, "XS" ) ) ; | |
1014 if ( bam_aux_get( b, "YS" ) ) | |
1015 { | |
1016 noncanonStrandInfo = bam_aux2i( bam_aux_get( b, "YS" ) ) ; | |
1017 } | |
1018 else | |
1019 { | |
1020 noncanonStrandInfo = -1 ; | |
1021 } | |
1022 } | |
1023 else | |
1024 { | |
1025 strand = '?' ; | |
1026 noncanonStrandInfo = -1 ; | |
1027 } | |
1028 startLocation = b->core.pos + 1 ; | |
1029 } | |
1030 else | |
1031 { | |
1032 if ( strstr( line, "XS:A:-" ) ) // on negative strand | |
1033 strand = '-' ; | |
1034 else if ( strstr( line, "XS:A:+" ) ) | |
1035 strand = '+' ; | |
1036 else | |
1037 { | |
1038 strand = '?' ; | |
1039 char *p = strstr( line, "YS:i:" ) ; | |
1040 if ( p != NULL ) | |
1041 { | |
1042 p += 5 ; | |
1043 noncanonStrandInfo = atoi( p ) ; | |
1044 } | |
1045 else | |
1046 noncanonStrandInfo = -1 ; | |
1047 } | |
1048 startLocation = atoi( col[3] ) ; | |
1049 } | |
1050 | |
1051 // Found the junctions from the read. | |
1052 if ( strcmp( prevChrome, col[2] ) ) | |
1053 { | |
1054 if ( flagPrintJunction ) | |
1055 { | |
1056 // Print the remaining elements in the queue | |
1057 i = qHead ; | |
1058 while ( i != qTail ) | |
1059 { | |
1060 PrintJunction( prevChrome, junctionQueue[i] ) ; | |
1061 ++i ; | |
1062 if ( i >= QUEUE_SIZE ) | |
1063 i = 0 ; | |
1064 } | |
1065 } | |
1066 | |
1067 // new chromosome | |
1068 ClearReadTree( contradictedReads ) ; | |
1069 contradictedReads = NULL ; | |
1070 qHead = qTail = 0 ; | |
1071 strcpy( prevChrome, col[2] ) ; | |
1072 } | |
1073 | |
1074 if ( flagRemove ) | |
1075 { | |
1076 if ( CompareJunctions( startLocation, col[5] ) ) | |
1077 { | |
1078 // Test whether this read has new junctions | |
1079 //++junctionCnt ; | |
1080 if ( !flagPrintJunction ) | |
1081 printf( "%s", line ) ; | |
1082 } | |
1083 } | |
1084 else | |
1085 { | |
1086 ++junctionCnt ; | |
1087 printf( "%s", line ) ; | |
1088 } | |
1089 //printf( "hi2 %s\n", col[0] ) ; | |
1090 } | |
1091 | |
1092 if ( flagPrintJunction ) | |
1093 { | |
1094 // Print the remaining elements in the queue | |
1095 i = qHead ; | |
1096 while ( i != qTail ) | |
1097 { | |
1098 PrintJunction( prevChrome, junctionQueue[i] ) ; | |
1099 ++i ; | |
1100 if ( i >= QUEUE_SIZE ) | |
1101 i = 0 ; | |
1102 } | |
1103 } | |
1104 | |
1105 //fprintf( stderr, "The number of junctions: %d\n", junctionCnt ) ; | |
1106 return 0 ; | |
1107 } | |
1108 |