Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/FindJunction.cpp @ 0:903fc43d6227 draft default tip
Uploaded
| author | lsong10 |
|---|---|
| date | Fri, 26 Mar 2021 16:52:45 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:903fc43d6227 |
|---|---|
| 1 /* | |
| 2 Find junctions from the SAM file generated by Tophat2. | |
| 3 The SAM file should be sorted. | |
| 4 The junction is the start and the end coordinates of the spliced region in this program. The output of the junction is a bit different. | |
| 5 | |
| 6 Usage: ./a.out input [option] >output. | |
| 7 ./a.out -h for help. | |
| 8 */ | |
| 9 #include <stdio.h> | |
| 10 #include <string.h> | |
| 11 #include <stdlib.h> | |
| 12 | |
| 13 #include "sam.h" | |
| 14 | |
| 15 #define LINE_SIZE 4097 | |
| 16 #define QUEUE_SIZE 10001 | |
| 17 #define HASH_MAX 1000003 | |
| 18 | |
| 19 struct _readTree | |
| 20 { | |
| 21 char id[256] ; | |
| 22 int leftAnchor, rightAnchor ; | |
| 23 int pos ; | |
| 24 //bool secondary ; | |
| 25 bool valid ; | |
| 26 int editDistance ; | |
| 27 int NH, cnt ; // if cnt < NH, then it has real secondary match for this splice junction | |
| 28 struct _readTree *left, *right ; | |
| 29 | |
| 30 int flag ;// The flag from sam head. | |
| 31 } ; | |
| 32 | |
| 33 // The structure of a junction | |
| 34 struct _junction | |
| 35 { | |
| 36 int start, end ; | |
| 37 int readCnt ; // The # of reads containing this junction | |
| 38 int secReadCnt ; // The # of reads whose secondary alignment has this junction. | |
| 39 char strand ; // On '+' or '-' strand | |
| 40 int leftAnchor, rightAnchor ; // The longest left and right anchor | |
| 41 int oppositeAnchor ; // The longest anchor of the shorter side. | |
| 42 int uniqEditDistance, secEditDistance ; | |
| 43 struct _readTree head ; | |
| 44 } ; | |
| 45 | |
| 46 char line[LINE_SIZE] ; | |
| 47 char col[11][LINE_SIZE] ; // The option fields is not needed. | |
| 48 char strand ; // Extract XS field | |
| 49 char noncanonStrandInfo ; | |
| 50 //bool secondary ; | |
| 51 int NH ; | |
| 52 int editDistance ; | |
| 53 int mateStart ; | |
| 54 int filterYS ; | |
| 55 int samFlag ; | |
| 56 | |
| 57 struct _junction junctionQueue[QUEUE_SIZE] ; // Expected only a few junctions in it for each read. This queue is sorted. | |
| 58 | |
| 59 int qHead, qTail ; | |
| 60 | |
| 61 bool flagPrintJunction ; | |
| 62 bool flagPrintAll ; | |
| 63 bool flagStrict ; | |
| 64 int junctionCnt ; | |
| 65 bool anchorBoth ; | |
| 66 bool validRead ; | |
| 67 | |
| 68 int flank ; | |
| 69 struct _cigarSeg | |
| 70 { | |
| 71 int len ; | |
| 72 char type ; | |
| 73 } ; | |
| 74 | |
| 75 struct _readTree *contradictedReads ; | |
| 76 | |
| 77 char nucToNum[26] = { 0, 4, 1, 4, 4, 4, 2, | |
| 78 4, 4, 4, 4, 4, 4, 4, | |
| 79 4, 4, 4, 4, 4, 3, | |
| 80 4, 4, 4, 4, 4, 4 } ; | |
| 81 | |
| 82 void PrintHelp() | |
| 83 { | |
| 84 printf( | |
| 85 "Prints reads from the SAM/BAM file that containing junctions.\n" | |
| 86 "Usage: ./a.out input [option]>output\n" | |
| 87 "Options:\n" | |
| 88 "\t-j xx [-B]: Output the junctions using 1-based coordinates. The format is \"reference id\" start end \"# of read\" strand.(They are sorted)\n and the xx is an integer means the maximum unqualified anchor length for a splice junction(default=8). If -B, the splice junction must be supported by a read whose both anchors are longer than xx.\n" | |
| 89 "\t-a: Output all the junctions, and use non-positive support number to indicate unqualified junctions.\n" | |
| 90 "\t-y: If the bits from YS field of bam matches the argument, we filter the alignment (default: 4).\n" | |
| 91 ) ; | |
| 92 } | |
| 93 | |
| 94 void GetJunctionInfo( struct _junction &junc, struct _readTree *p ) | |
| 95 { | |
| 96 if ( p == NULL ) | |
| 97 return ; | |
| 98 | |
| 99 if ( p->valid ) | |
| 100 { | |
| 101 //if ( junc.start == 22381343 + 1 && junc.end == 22904987 - 1 ) | |
| 102 // printf( "%s %d %d %d\n", p->id, p->leftAnchor, p->rightAnchor, p->flag ) ; | |
| 103 | |
| 104 | |
| 105 | |
| 106 if ( p->cnt < p->NH ) | |
| 107 { | |
| 108 junc.secEditDistance += p->editDistance ; | |
| 109 ++junc.secReadCnt ; | |
| 110 } | |
| 111 else | |
| 112 { | |
| 113 junc.uniqEditDistance += p->editDistance ; | |
| 114 ++junc.readCnt ; | |
| 115 } | |
| 116 int l = p->leftAnchor, r = p->rightAnchor ; | |
| 117 | |
| 118 if ( !anchorBoth ) | |
| 119 { | |
| 120 if ( l > junc.leftAnchor ) | |
| 121 junc.leftAnchor = l ; | |
| 122 if ( r > junc.rightAnchor ) | |
| 123 junc.rightAnchor = r ; | |
| 124 | |
| 125 if ( l <= r && l > junc.oppositeAnchor ) | |
| 126 junc.oppositeAnchor = l ; | |
| 127 else if ( r < l && r > junc.oppositeAnchor ) | |
| 128 junc.oppositeAnchor = r ; | |
| 129 } | |
| 130 else | |
| 131 { | |
| 132 if ( l > flank && r > flank ) | |
| 133 { | |
| 134 junc.leftAnchor = l ; | |
| 135 junc.rightAnchor = r ; | |
| 136 } | |
| 137 | |
| 138 if ( l <= r && l > junc.oppositeAnchor ) | |
| 139 junc.oppositeAnchor = l ; | |
| 140 else if ( r < l && r > junc.oppositeAnchor ) | |
| 141 junc.oppositeAnchor = r ; | |
| 142 } | |
| 143 } | |
| 144 GetJunctionInfo( junc, p->left ) ; | |
| 145 GetJunctionInfo( junc, p->right ) ; | |
| 146 } | |
| 147 | |
| 148 void PrintJunctionReads( struct _junction &junc, struct _readTree *p ) | |
| 149 { | |
| 150 if ( p == NULL ) | |
| 151 return ; | |
| 152 | |
| 153 if ( p->valid ) | |
| 154 printf( "%s\n", p->id ) ; | |
| 155 PrintJunctionReads( junc, p->left ) ; | |
| 156 PrintJunctionReads( junc, p->right ) ; | |
| 157 } | |
| 158 | |
| 159 void PrintJunction( char *chrome, struct _junction &junc ) | |
| 160 { | |
| 161 int sum ; | |
| 162 junc.leftAnchor = 0 ; | |
| 163 junc.rightAnchor = 0 ; | |
| 164 junc.oppositeAnchor = 0 ; | |
| 165 junc.readCnt = 0 ; | |
| 166 junc.secReadCnt = 0 ; | |
| 167 junc.uniqEditDistance = 0 ; | |
| 168 junc.secEditDistance = 0 ; | |
| 169 GetJunctionInfo( junc, &junc.head ) ; | |
| 170 | |
| 171 sum = junc.readCnt + junc.secReadCnt ; | |
| 172 | |
| 173 if ( junc.leftAnchor <= flank || junc.rightAnchor <= flank || ( junc.readCnt + junc.secReadCnt <= 0 ) ) | |
| 174 { | |
| 175 if ( flagPrintAll ) | |
| 176 { | |
| 177 sum = -sum ; | |
| 178 } | |
| 179 else | |
| 180 return ; | |
| 181 } | |
| 182 | |
| 183 /*if ( junc.end - junc.start + 1 >= 200000 ) | |
| 184 { | |
| 185 if ( junc.secReadCnt > 0 ) | |
| 186 junc.secReadCnt = 1 ; | |
| 187 }*/ | |
| 188 | |
| 189 /*if ( junc.oppositeAnchor <= ( ( flank / 2 < 1 ) ? flank / 2 : 1 ) && ( junc.readCnt + junc.secReadCnt ) <= 10 ) | |
| 190 { | |
| 191 if ( junc.readCnt > 0 ) | |
| 192 junc.readCnt = 1 ; | |
| 193 if ( junc.secReadCnt > 0 ) | |
| 194 junc.secReadCnt = 0 ; | |
| 195 }*/ | |
| 196 | |
| 197 printf( "%s %d %d %d %c %d %d %d %d\n", chrome, junc.start - 1, junc.end + 1, sum, junc.strand, | |
| 198 junc.readCnt, junc.secReadCnt, junc.uniqEditDistance, junc.secEditDistance ) ; | |
| 199 //PrintJunctionReads( junc, &junc.head ) ; | |
| 200 } | |
| 201 | |
| 202 void ClearReadTree( struct _readTree *p ) | |
| 203 { | |
| 204 if ( p == NULL ) | |
| 205 return ; | |
| 206 ClearReadTree( p->left ) ; | |
| 207 ClearReadTree( p->right ) ; | |
| 208 free( p ) ; | |
| 209 } | |
| 210 | |
| 211 // Insert to the read tree | |
| 212 bool InsertReadTree( struct _readTree *p, char *id, int l, int r ) | |
| 213 { | |
| 214 int tmp = strcmp( p->id, id ) ; | |
| 215 if ( tmp == 0 ) | |
| 216 { | |
| 217 p->cnt += 1 ; | |
| 218 return true ; | |
| 219 } | |
| 220 else if ( tmp < 0 ) | |
| 221 { | |
| 222 if ( p->left ) | |
| 223 return InsertReadTree( p->left, id, l, r ) ; | |
| 224 else | |
| 225 { | |
| 226 p->left = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; | |
| 227 strcpy( p->left->id, id ) ; | |
| 228 p->left->leftAnchor = l ; | |
| 229 p->left->rightAnchor = r ; | |
| 230 //p->left->secondary = secondary ; | |
| 231 p->left->editDistance = editDistance ; | |
| 232 p->left->left = p->left->right = NULL ; | |
| 233 p->left->valid = validRead ; | |
| 234 p->left->cnt = 1 ; | |
| 235 p->left->NH = NH ; | |
| 236 p->left->flag = samFlag ; | |
| 237 return false ; | |
| 238 } | |
| 239 } | |
| 240 else | |
| 241 { | |
| 242 if ( p->right ) | |
| 243 return InsertReadTree( p->right, id, l, r ) ; | |
| 244 else | |
| 245 { | |
| 246 p->right = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; | |
| 247 strcpy( p->right->id, id ) ; | |
| 248 p->right->leftAnchor = l ; | |
| 249 p->right->rightAnchor = r ; | |
| 250 //p->right->secondary = secondary ; | |
| 251 p->right->editDistance = editDistance ; | |
| 252 p->right->left = p->right->right = NULL ; | |
| 253 p->right->valid = validRead ; | |
| 254 p->right->cnt = 1 ; | |
| 255 p->right->NH = NH ; | |
| 256 p->right->flag = samFlag ; | |
| 257 return false ; | |
| 258 } | |
| 259 } | |
| 260 } | |
| 261 | |
| 262 | |
| 263 bool SearchContradictedReads( struct _readTree *p, char *id, int pos ) | |
| 264 { | |
| 265 if ( p == NULL ) | |
| 266 return false ; | |
| 267 int tmp = strcmp( p->id, id ) ; | |
| 268 if ( tmp == 0 && p->pos == pos ) | |
| 269 return true ; | |
| 270 else if ( tmp <= 0 ) | |
| 271 return SearchContradictedReads( p->left, id, pos ) ; | |
| 272 else | |
| 273 return SearchContradictedReads( p->right, id, pos ) ; | |
| 274 } | |
| 275 | |
| 276 bool InsertContradictedReads( struct _readTree *p, char *id, int pos ) | |
| 277 { | |
| 278 if ( p == NULL ) | |
| 279 { | |
| 280 contradictedReads = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; | |
| 281 strcpy( contradictedReads->id, id ) ; | |
| 282 contradictedReads->pos = pos ; | |
| 283 contradictedReads->left = contradictedReads->right = NULL ; | |
| 284 return false ; | |
| 285 } | |
| 286 int tmp = strcmp( p->id, id ) ; | |
| 287 if ( tmp == 0 && p->pos == pos ) | |
| 288 return true ; | |
| 289 else if ( tmp <= 0 ) | |
| 290 { | |
| 291 if ( p->left ) | |
| 292 return InsertContradictedReads( p->left, id, pos ) ; | |
| 293 else | |
| 294 { | |
| 295 p->left = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; | |
| 296 strcpy( p->left->id, id ) ; | |
| 297 p->left->pos = pos ; | |
| 298 p->left->left = p->left->right = NULL ; | |
| 299 return false ; | |
| 300 } | |
| 301 } | |
| 302 else | |
| 303 { | |
| 304 if ( p->right ) | |
| 305 return InsertContradictedReads( p->right, id, pos ) ; | |
| 306 else | |
| 307 { | |
| 308 p->right = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; | |
| 309 strcpy( p->right->id, id ) ; | |
| 310 p->right->pos = pos ; | |
| 311 p->right->left = p->right->right = NULL ; | |
| 312 return false ; | |
| 313 } | |
| 314 } | |
| 315 } | |
| 316 | |
| 317 struct _readTree *GetReadTreeNode( struct _readTree *p, char *id ) | |
| 318 { | |
| 319 if ( p == NULL ) | |
| 320 return NULL ; | |
| 321 int tmp = strcmp( p->id, id ) ; | |
| 322 if ( tmp == 0 ) | |
| 323 return p ; | |
| 324 else if ( tmp < 0 ) | |
| 325 return GetReadTreeNode( p->left, id ) ; | |
| 326 else | |
| 327 return GetReadTreeNode( p->right, id ) ; | |
| 328 } | |
| 329 | |
| 330 // Insert the new junction into the queue, | |
| 331 // and make sure the queue is sorted. | |
| 332 // Assume each junction range is only on one strand. | |
| 333 // l, r is the left and right anchor from a read | |
| 334 void InsertQueue( int start, int end, int l, int r ) | |
| 335 { | |
| 336 int i, j ; | |
| 337 i = qTail ; | |
| 338 | |
| 339 | |
| 340 while ( i != qHead ) | |
| 341 { | |
| 342 j = i - 1 ; | |
| 343 if ( j < 0 ) | |
| 344 j = QUEUE_SIZE - 1 ; | |
| 345 | |
| 346 if ( junctionQueue[j].start < start | |
| 347 || ( junctionQueue[j].start == start && junctionQueue[j].end < end ) ) | |
| 348 break ; | |
| 349 | |
| 350 junctionQueue[i] = junctionQueue[j] ; | |
| 351 --i ; | |
| 352 | |
| 353 if ( i < 0 ) | |
| 354 i = QUEUE_SIZE - 1 ; | |
| 355 } | |
| 356 | |
| 357 junctionQueue[i].start = start ; | |
| 358 junctionQueue[i].end = end ; | |
| 359 /*if ( !secondary ) | |
| 360 { | |
| 361 junctionQueue[i].readCnt = 1 ; | |
| 362 junctionQueue[i].secReadCnt = 0 ; | |
| 363 } | |
| 364 else | |
| 365 { | |
| 366 junctionQueue[i].readCnt = 0 ; | |
| 367 junctionQueue[i].secReadCnt = 1 ; | |
| 368 }*/ | |
| 369 junctionQueue[i].strand = strand ; | |
| 370 junctionQueue[i].leftAnchor = l ; | |
| 371 junctionQueue[i].rightAnchor = r ; | |
| 372 | |
| 373 strcpy( junctionQueue[i].head.id, col[0] ) ; | |
| 374 junctionQueue[i].head.valid = validRead ; | |
| 375 junctionQueue[i].head.leftAnchor = l ; | |
| 376 junctionQueue[i].head.rightAnchor = r ; | |
| 377 junctionQueue[i].head.left = junctionQueue[i].head.right = NULL ; | |
| 378 //junctionQueue[i].head.secondary = secondary ; | |
| 379 junctionQueue[i].head.NH = NH ; | |
| 380 junctionQueue[i].head.cnt = 1 ; | |
| 381 junctionQueue[i].head.editDistance = editDistance ; | |
| 382 | |
| 383 ++qTail ; | |
| 384 if ( qTail >= QUEUE_SIZE ) | |
| 385 qTail = 0 ; | |
| 386 } | |
| 387 | |
| 388 // Search the queue, and remove the heads if its end(start) is smaller than prune.(Because it is impossible to have that junction again) | |
| 389 // Add the junction to the queue, if it is not in it. | |
| 390 // Return true, if it finds the junction. Otherwise, return false. | |
| 391 bool SearchQueue( int start, int end, int prune, int l, int r ) | |
| 392 { | |
| 393 int i ; | |
| 394 | |
| 395 // Test whether this read might be a false alignment. | |
| 396 i = qHead ; | |
| 397 while ( i != qTail ) | |
| 398 { | |
| 399 struct _readTree *rt ; | |
| 400 if ( ( junctionQueue[i].start == start && junctionQueue[i].end < end ) || | |
| 401 ( junctionQueue[i].start > start && junctionQueue[i].end == end ) ) | |
| 402 { | |
| 403 // This alignment is false ; | |
| 404 rt = GetReadTreeNode( &junctionQueue[i].head, col[0] ) ; | |
| 405 // the commented out logic because it is handled by contradicted reads | |
| 406 if ( rt != NULL ) //&& ( rt->flag & 0x40 ) != ( samFlag & 0x40 ) ) | |
| 407 { | |
| 408 if ( rt->leftAnchor <= flank || rt->rightAnchor <= flank )//|| rt->secondary ) | |
| 409 { | |
| 410 if ( l > flank && r > flank )//&& !secondary ) | |
| 411 rt->valid = false ; | |
| 412 else | |
| 413 validRead = false ; | |
| 414 } | |
| 415 else | |
| 416 { | |
| 417 // Ignore this read | |
| 418 //return true ; | |
| 419 validRead = false ; | |
| 420 } | |
| 421 } | |
| 422 } | |
| 423 else if ( ( junctionQueue[i].start == start && junctionQueue[i].end > end ) || | |
| 424 ( junctionQueue[i].start < start && junctionQueue[i].end == end ) ) | |
| 425 { | |
| 426 // This other alignment is false ; | |
| 427 rt = GetReadTreeNode( &junctionQueue[i].head, col[0] ) ; | |
| 428 //if ( rt != NULL ) | |
| 429 if ( rt != NULL ) //&& ( rt->flag & 0x40 ) != ( samFlag & 0x40 ) ) | |
| 430 { | |
| 431 if ( l <= flank || r <= flank )//|| secondary ) | |
| 432 { | |
| 433 if ( rt->leftAnchor > flank && rt->rightAnchor > flank )//&& !rt->secondary ) | |
| 434 validRead = false ; | |
| 435 else | |
| 436 rt->valid = false ; | |
| 437 } | |
| 438 else | |
| 439 rt->valid = false ; | |
| 440 } | |
| 441 } | |
| 442 ++i ; | |
| 443 if ( i >= QUEUE_SIZE ) | |
| 444 i = 0 ; | |
| 445 } | |
| 446 | |
| 447 //if ( start == 8077108 && end == 8078439 ) | |
| 448 // exit( 1 ) ; | |
| 449 i = qHead ; | |
| 450 | |
| 451 while ( i != qTail ) | |
| 452 { | |
| 453 if ( junctionQueue[i].start == start && | |
| 454 junctionQueue[i].end == end ) | |
| 455 { | |
| 456 if (junctionQueue[i].strand == '?' && junctionQueue[i].strand != strand ) | |
| 457 { | |
| 458 junctionQueue[i].strand = strand ; | |
| 459 } | |
| 460 InsertReadTree( &junctionQueue[i].head, col[0], l, r ) ; | |
| 461 return true ; | |
| 462 } | |
| 463 | |
| 464 if ( junctionQueue[i].end < prune && i == qHead ) | |
| 465 { | |
| 466 // pop | |
| 467 PrintJunction( col[2], junctionQueue[i] ) ; | |
| 468 ClearReadTree( junctionQueue[i].head.left ) ; | |
| 469 ClearReadTree( junctionQueue[i].head.right ) ; | |
| 470 ++qHead ; | |
| 471 if ( qHead >= QUEUE_SIZE ) | |
| 472 qHead = 0 ; | |
| 473 } | |
| 474 ++i ; | |
| 475 if ( i >= QUEUE_SIZE ) | |
| 476 i = 0 ; | |
| 477 } | |
| 478 | |
| 479 InsertQueue( start, end, l, r ) ; | |
| 480 return false ; | |
| 481 } | |
| 482 | |
| 483 // Compute the junctions based on the CIGAR | |
| 484 bool CompareJunctions( int startLocation, char *cigar ) | |
| 485 { | |
| 486 int currentLocation = startLocation ; // Current location on the reference genome | |
| 487 int i, j ; | |
| 488 int num ; | |
| 489 int newJuncCnt = 0 ; // The # of junctions in the read, and the # of new junctions among them. | |
| 490 | |
| 491 struct _cigarSeg cigarSeg[1000] ; // A segment of the cigar. | |
| 492 int ccnt = 0 ; // cigarSeg cnt | |
| 493 | |
| 494 j = 0 ; | |
| 495 num = 0 ; | |
| 496 validRead = true ; | |
| 497 | |
| 498 for ( i = 0 ; cigar[i] ; ++i ) | |
| 499 { | |
| 500 if ( cigar[i] >= '0' && cigar[i] <= '9' ) | |
| 501 { | |
| 502 num = num * 10 + cigar[i] - '0' ; | |
| 503 } | |
| 504 else | |
| 505 { | |
| 506 cigarSeg[ccnt].len = num ; | |
| 507 cigarSeg[ccnt].type = cigar[i] ; | |
| 508 ++ccnt ; | |
| 509 num = 0 ; | |
| 510 } | |
| 511 } | |
| 512 | |
| 513 // Filter low complex alignment. | |
| 514 // Only applies this to alignments does not have a strand information. | |
| 515 if ( strand == '?' ) | |
| 516 { | |
| 517 if ( noncanonStrandInfo != -1 ) | |
| 518 { | |
| 519 if ( ( noncanonStrandInfo & filterYS ) != 0 ) | |
| 520 validRead = false ; | |
| 521 } | |
| 522 else | |
| 523 { | |
| 524 int softStart = -1 ; | |
| 525 int softEnd = 0 ; | |
| 526 if ( cigarSeg[0].type == 'S' ) | |
| 527 softStart = cigarSeg[0].len ; | |
| 528 if ( cigarSeg[ ccnt - 1 ].type == 'S' ) | |
| 529 softEnd = cigarSeg[ ccnt - 1 ].len ; | |
| 530 int readLen = strlen( col[9] ) ; | |
| 531 int count[5] = { 0, 0, 0, 0, 0 } ; | |
| 532 | |
| 533 int pos = 0 ; | |
| 534 for ( i = 0 ; i < ccnt ; ++i ) | |
| 535 { | |
| 536 switch ( cigarSeg[i].type ) | |
| 537 { | |
| 538 case 'S': | |
| 539 pos += cigarSeg[i].len ; | |
| 540 case 'M': | |
| 541 case 'I': | |
| 542 { | |
| 543 for ( j = 0 ; j < cigarSeg[i].len ; ++j ) | |
| 544 ++count[ nucToNum[ col[9][pos + j] - 'A' ] ] ; | |
| 545 pos += j ; | |
| 546 } break ; | |
| 547 case 'N': | |
| 548 { | |
| 549 int max = 0 ; | |
| 550 int sum = 0 ; | |
| 551 for ( j = 0 ; j < 5 ; ++j ) | |
| 552 { | |
| 553 if ( count[j] > max ) | |
| 554 max = count[j] ; | |
| 555 sum += count[j] ; | |
| 556 } | |
| 557 if ( max > 0.8 * sum ) | |
| 558 validRead = false ; | |
| 559 count[0] = count[1] = count[2] = count[3] = count[4] = 0 ; | |
| 560 } break ; | |
| 561 case 'H': | |
| 562 case 'P': | |
| 563 case 'D': | |
| 564 default: break ; | |
| 565 } | |
| 566 } | |
| 567 | |
| 568 int max = 0 ; | |
| 569 int sum = 0 ; | |
| 570 for ( j = 0 ; j < 5 ; ++j ) | |
| 571 { | |
| 572 if ( count[j] > max ) | |
| 573 max = count[j] ; | |
| 574 sum += count[j] ; | |
| 575 } | |
| 576 if ( max > 0.8 * sum ) | |
| 577 validRead = false ; | |
| 578 /* count[0] = count[1] = count[2] = count[3] = count[4] = 0 ; | |
| 579 | |
| 580 | |
| 581 for ( i = softStart + 1 ; i < readLen - softEnd ; ++i ) | |
| 582 { | |
| 583 switch ( col[9][i] ) | |
| 584 { | |
| 585 case 'A': ++count[0] ; break ; | |
| 586 case 'C': ++count[1] ; break ; | |
| 587 case 'G': ++count[2] ; break ; | |
| 588 case 'T': ++count[3] ; break ; | |
| 589 default: ++count[4] ; | |
| 590 } | |
| 591 } | |
| 592 int max = 0 ; | |
| 593 for ( j = 0 ; j < 5 ; ++j ) | |
| 594 if ( count[j] > max ) | |
| 595 max = count[j] ; | |
| 596 if ( max > 0.6 * ( readLen - softEnd - softStart - 1 ) ) | |
| 597 validRead = false ;*/ | |
| 598 } | |
| 599 } | |
| 600 | |
| 601 // Test whether contradict with mate pair | |
| 602 if ( col[6][0] == '=' ) | |
| 603 { | |
| 604 currentLocation = startLocation ; | |
| 605 for ( i = 0 ; i < ccnt ; ++i ) | |
| 606 { | |
| 607 if ( cigarSeg[i].type == 'I' || cigarSeg[i].type == 'S' || cigarSeg[i].type == 'H' | |
| 608 || cigarSeg[i].type == 'P' ) | |
| 609 continue ; | |
| 610 else if ( cigarSeg[i].type == 'N' ) | |
| 611 { | |
| 612 if ( mateStart >= currentLocation && mateStart <= currentLocation + cigarSeg[i].len - 1 ) | |
| 613 { | |
| 614 /*if ( cigarSeg[i].len == 91486 ) | |
| 615 { | |
| 616 printf( "%s %d %d\n", currentLocation, mateStart ) ; | |
| 617 exit( 1 ) ; | |
| 618 }*/ | |
| 619 // ignore this read | |
| 620 //return false ; | |
| 621 InsertContradictedReads( contradictedReads, col[0], mateStart ) ; | |
| 622 validRead = false ; | |
| 623 break ; | |
| 624 } | |
| 625 } | |
| 626 currentLocation += cigarSeg[i].len ; | |
| 627 } | |
| 628 | |
| 629 i = qHead ; | |
| 630 // Search if it falls in the splices junction created by its mate. | |
| 631 while ( i != qTail ) | |
| 632 { | |
| 633 if ( mateStart < junctionQueue[i].start && junctionQueue[i].start <= startLocation | |
| 634 && startLocation <= junctionQueue[i].end | |
| 635 && SearchContradictedReads( contradictedReads, col[0], startLocation ) ) | |
| 636 { | |
| 637 validRead = false ; | |
| 638 break ; | |
| 639 } | |
| 640 ++i ; | |
| 641 if ( i >= QUEUE_SIZE ) | |
| 642 i = 0 ; | |
| 643 } | |
| 644 } | |
| 645 | |
| 646 currentLocation = startLocation ; | |
| 647 for ( i = 0 ; i < ccnt ; ++i ) | |
| 648 { | |
| 649 if ( cigarSeg[i].type == 'I' || cigarSeg[i].type == 'S' || cigarSeg[i].type == 'H' | |
| 650 || cigarSeg[i].type == 'P' ) | |
| 651 continue ; | |
| 652 else if ( cigarSeg[i].type == 'N' ) | |
| 653 { | |
| 654 int left, right ; | |
| 655 int tmp ; | |
| 656 tmp = i ; | |
| 657 while ( i > 0 && ( cigarSeg[i - 1].type == 'I' || cigarSeg[i - 1].type == 'D' ) ) | |
| 658 --i ; | |
| 659 if ( i > 0 && cigarSeg[i - 1].type == 'M' ) | |
| 660 { | |
| 661 left = cigarSeg[i - 1].len ; | |
| 662 if ( i >= 2 && cigarSeg[i - 2].type == 'N' && left <= flank ) | |
| 663 { | |
| 664 left = flank + 1 ; | |
| 665 } | |
| 666 } | |
| 667 else | |
| 668 left = 0 ; | |
| 669 | |
| 670 i = tmp ; | |
| 671 while ( i < ccnt && ( cigarSeg[i + 1].type == 'I' || cigarSeg[i + 1].type == 'D' ) ) | |
| 672 ++i ; | |
| 673 if ( i < ccnt && cigarSeg[i + 1].type == 'M' ) | |
| 674 { | |
| 675 right = cigarSeg[i + 1].len ; | |
| 676 if ( i + 2 < ccnt && cigarSeg[i + 2].type == 'N' && right <= flank ) | |
| 677 { | |
| 678 right = flank + 1 ; | |
| 679 } | |
| 680 } | |
| 681 else | |
| 682 right = 0 ; | |
| 683 i = tmp ; | |
| 684 if ( !SearchQueue( currentLocation, currentLocation + cigarSeg[i].len - 1, startLocation, left, right ) ) | |
| 685 { | |
| 686 ++newJuncCnt ; | |
| 687 } | |
| 688 } | |
| 689 currentLocation += cigarSeg[i].len ; | |
| 690 } | |
| 691 /*else if ( cigar[i] == 'I' ) | |
| 692 { | |
| 693 num = 0 ; | |
| 694 } | |
| 695 else if ( cigar[i] == 'N' ) | |
| 696 { | |
| 697 if ( !SearchQueue( currentLocation, currentLocation + num - 1, startLocation ) ) | |
| 698 { | |
| 699 ++newJuncCnt ; | |
| 700 //if ( flagPrintJunction ) | |
| 701 // printf( "%s %d %d\n", col[2], currentLocation - 2, currentLocation + len - 1 ) ; | |
| 702 } | |
| 703 currentLocation += num ; | |
| 704 num = 0 ; | |
| 705 } | |
| 706 else // Other operations, like M, D,..., | |
| 707 { | |
| 708 currentLocation += num ; | |
| 709 num = 0 ; | |
| 710 }*/ | |
| 711 | |
| 712 junctionCnt += newJuncCnt ; | |
| 713 | |
| 714 if ( newJuncCnt ) | |
| 715 return true ; | |
| 716 else | |
| 717 return false ; | |
| 718 } | |
| 719 | |
| 720 void cigar2string( bam1_core_t *c, uint32_t *in_cigar, char *out_cigar ) | |
| 721 { | |
| 722 int k, op, l ; | |
| 723 char opcode ; | |
| 724 | |
| 725 *out_cigar = '\0' ; | |
| 726 for ( k = 0 ; k < c->n_cigar ; ++k ) | |
| 727 { | |
| 728 op = in_cigar[k] & BAM_CIGAR_MASK ; | |
| 729 l = in_cigar[k] >> BAM_CIGAR_SHIFT ; | |
| 730 switch (op) | |
| 731 { | |
| 732 case BAM_CMATCH: opcode = 'M' ; break ; | |
| 733 case BAM_CINS: opcode = 'I' ; break ; | |
| 734 case BAM_CDEL: opcode = 'D' ; break ; | |
| 735 case BAM_CREF_SKIP: opcode = 'N' ; break ; | |
| 736 case BAM_CSOFT_CLIP: opcode = 'S' ; break ; | |
| 737 case BAM_CHARD_CLIP: opcode = 'H' ; break ; | |
| 738 case BAM_CPAD: opcode = 'P' ; break ; | |
| 739 } | |
| 740 sprintf( out_cigar + strlen( out_cigar ), "%d%c", l, opcode ) ; | |
| 741 } | |
| 742 } | |
| 743 | |
| 744 | |
| 745 | |
| 746 int main( int argc, char *argv[] ) | |
| 747 { | |
| 748 FILE *fp ; | |
| 749 samfile_t *fpsam ; | |
| 750 bam1_t *b = NULL ; | |
| 751 bool useSam = true ; | |
| 752 | |
| 753 int i, len ; | |
| 754 int startLocation ; | |
| 755 bool flagRemove = false ; | |
| 756 char prevChrome[103] ; | |
| 757 | |
| 758 anchorBoth = false ; | |
| 759 | |
| 760 flagPrintJunction = false ; | |
| 761 flagPrintAll = false ; | |
| 762 flagStrict = false ; | |
| 763 junctionCnt = 0 ; | |
| 764 bool hasMateReadIdSuffix = false ; | |
| 765 | |
| 766 strcpy( prevChrome, "" ) ; | |
| 767 flagRemove = true ; | |
| 768 flagPrintJunction = true ; | |
| 769 flank = 8 ; | |
| 770 filterYS = 4 ; | |
| 771 | |
| 772 contradictedReads = NULL ; | |
| 773 | |
| 774 // processing the argument list | |
| 775 for ( i = 1 ; i < argc ; ++i ) | |
| 776 { | |
| 777 if ( !strcmp( argv[i], "-h" ) ) | |
| 778 { | |
| 779 PrintHelp() ; | |
| 780 return 0 ; | |
| 781 } | |
| 782 else if ( !strcmp( argv[i], "-r" ) ) | |
| 783 { | |
| 784 flagRemove = true ; | |
| 785 } | |
| 786 else if ( !strcmp( argv[i], "-j" ) ) | |
| 787 { | |
| 788 strcpy( prevChrome, "" ) ; | |
| 789 flagRemove = true ; | |
| 790 flagPrintJunction = true ; | |
| 791 flank = atoi( argv[i + 1] ) ; | |
| 792 if ( i + 2 < argc && !strcmp( argv[i+2], "-B" ) ) | |
| 793 { | |
| 794 anchorBoth = true ; | |
| 795 ++i ; | |
| 796 } | |
| 797 ++i ; | |
| 798 } | |
| 799 else if ( !strcmp( argv[i], "-a" ) ) | |
| 800 { | |
| 801 flagPrintAll = true ; | |
| 802 } | |
| 803 else if ( !strcmp( argv[i], "--strict" ) ) | |
| 804 { | |
| 805 flagStrict = true ; | |
| 806 } | |
| 807 else if ( !strcmp( argv[i], "-y" ) ) | |
| 808 { | |
| 809 filterYS = atoi( argv[i + 1] ) ; | |
| 810 ++i ; | |
| 811 } | |
| 812 else if ( !strcmp( argv[i], "--hasMateIdSuffix" ) ) | |
| 813 { | |
| 814 hasMateReadIdSuffix = true ; | |
| 815 ++i ; | |
| 816 } | |
| 817 else if ( i > 1 ) | |
| 818 { | |
| 819 printf( "Unknown option %s\n", argv[i] ) ; | |
| 820 exit( 1 ) ; | |
| 821 } | |
| 822 } | |
| 823 if ( argc == 1 ) | |
| 824 { | |
| 825 PrintHelp() ; | |
| 826 return 0 ; | |
| 827 } | |
| 828 | |
| 829 len = strlen( argv[1] ) ; | |
| 830 if ( argv[1][len-3] == 'b' || argv[1][len-3] == 'B' ) | |
| 831 { | |
| 832 if ( !( fpsam = samopen( argv[1], "rb", 0 ) ) ) | |
| 833 return 0 ; | |
| 834 | |
| 835 if ( !fpsam->header ) | |
| 836 { | |
| 837 //samclose( fpsam ) ; | |
| 838 //fpsam = samopen( argv[1], "r", 0 ) ; | |
| 839 //if ( !fpsam->header ) | |
| 840 //{ | |
| 841 useSam = false ; | |
| 842 fp = fopen( argv[1], "r" ) ; | |
| 843 //} | |
| 844 } | |
| 845 } | |
| 846 else | |
| 847 { | |
| 848 useSam = false ; | |
| 849 fp = NULL ; | |
| 850 if ( !strcmp( argv[1], "-" ) ) | |
| 851 fp = stdin ; | |
| 852 else | |
| 853 fp = fopen( argv[1], "r" ) ; | |
| 854 if ( fp == NULL ) | |
| 855 { | |
| 856 printf( "Could not open file %s\n", argv[1] ) ; | |
| 857 return 0 ; | |
| 858 } | |
| 859 } | |
| 860 | |
| 861 while ( 1 ) | |
| 862 { | |
| 863 int flag = 0 ; | |
| 864 if ( useSam ) | |
| 865 { | |
| 866 if ( b ) | |
| 867 bam_destroy1( b ) ; | |
| 868 b = bam_init1() ; | |
| 869 if ( samread( fpsam, b ) <= 0 ) | |
| 870 break ; | |
| 871 if ( b->core.tid >= 0 ) | |
| 872 strcpy( col[2], fpsam->header->target_name[b->core.tid] ) ; | |
| 873 else | |
| 874 strcpy( col[2], "-1" ) ; | |
| 875 cigar2string( &(b->core), bam1_cigar( b ), col[5] ) ; | |
| 876 strcpy( col[0], bam1_qname( b ) ) ; | |
| 877 flag = b->core.flag ; | |
| 878 if ( bam_aux_get( b, "NH" ) ) | |
| 879 { | |
| 880 /*if ( bam_aux2i( bam_aux_get(b, "NH" ) ) >= 2 ) | |
| 881 { | |
| 882 secondary = true ; | |
| 883 } | |
| 884 else | |
| 885 secondary = false ;*/ | |
| 886 | |
| 887 NH = bam_aux2i( bam_aux_get( b, "NH" ) ) ; | |
| 888 } | |
| 889 else | |
| 890 { | |
| 891 //secondary = false ; | |
| 892 NH = 1 ; | |
| 893 } | |
| 894 | |
| 895 if ( bam_aux_get( b, "NM" ) ) | |
| 896 { | |
| 897 editDistance = bam_aux2i( bam_aux_get( b, "NM" ) ) ; | |
| 898 } | |
| 899 else if ( bam_aux_get( b, "nM" ) ) | |
| 900 { | |
| 901 editDistance = bam_aux2i( bam_aux_get( b, "nM" ) ) ; | |
| 902 } | |
| 903 else | |
| 904 editDistance = 0 ; | |
| 905 | |
| 906 mateStart = b->core.mpos + 1 ; | |
| 907 if ( b->core.mtid == b->core.tid ) | |
| 908 col[6][0] = '=' ; | |
| 909 else | |
| 910 col[6][0] = '*' ; | |
| 911 | |
| 912 if ( b->core.l_qseq < 20 ) | |
| 913 continue ; | |
| 914 | |
| 915 for ( i = 0 ; i < b->core.l_qseq ; ++i ) | |
| 916 { | |
| 917 int bit = bam1_seqi( bam1_seq( b ), i ) ; | |
| 918 switch ( bit ) | |
| 919 { | |
| 920 case 1: col[9][i] = 'A' ; break ; | |
| 921 case 2: col[9][i] = 'C' ; break ; | |
| 922 case 4: col[9][i] = 'G' ; break ; | |
| 923 case 8: col[9][i] = 'T' ; break ; | |
| 924 case 15: col[9][i] = 'N' ; break ; | |
| 925 default: col[9][i] = 'A' ; break ; | |
| 926 } | |
| 927 } | |
| 928 col[9][i] = '\0' ; | |
| 929 | |
| 930 /*if ( flag & 0x100 ) | |
| 931 secondary = true ; | |
| 932 else | |
| 933 secondary = false ;*/ | |
| 934 } | |
| 935 else | |
| 936 { | |
| 937 char *p ; | |
| 938 if ( !fgets( line, LINE_SIZE, fp ) ) | |
| 939 break ; | |
| 940 if ( line[0] == '\0' || line[0] == '@' ) | |
| 941 continue ; | |
| 942 sscanf( line, "%s%s%s%s%s%s%s%s%s%s%s", col, col + 1, col + 2, col + 3, col + 4, | |
| 943 col + 5, col + 6, col + 7, col + 8, col + 9, col + 10 ) ; | |
| 944 | |
| 945 flag = atoi( col[1] ) ; | |
| 946 if ( p = strstr( line, "NH" ) ) | |
| 947 { | |
| 948 int k = 0 ; | |
| 949 p += 5 ; | |
| 950 while ( *p != ' ' && *p != '\t' && *p != '\0') | |
| 951 { | |
| 952 k = k * 10 + *p - '0' ; | |
| 953 ++p ; | |
| 954 } | |
| 955 /*if ( k >= 2 ) | |
| 956 { | |
| 957 secondary = true ; | |
| 958 } | |
| 959 else | |
| 960 secondary = false ;*/ | |
| 961 NH = k ; | |
| 962 } | |
| 963 else | |
| 964 { | |
| 965 //secondary = false ; | |
| 966 NH = 1 ; | |
| 967 } | |
| 968 if ( ( p = strstr( line, "NM" ) ) || ( p = strstr( line, "nM" ) ) ) | |
| 969 { | |
| 970 int k = 0 ; | |
| 971 p += 5 ; | |
| 972 while ( *p != ' ' && *p != '\t' && *p != '\0') | |
| 973 { | |
| 974 k = k * 10 + *p - '0' ; | |
| 975 ++p ; | |
| 976 } | |
| 977 editDistance = k ; | |
| 978 } | |
| 979 else | |
| 980 editDistance = 0 ; | |
| 981 | |
| 982 mateStart = atoi( col[7] ) ; | |
| 983 /*if ( flag & 0x100 ) | |
| 984 secondary = true ; | |
| 985 else | |
| 986 secondary = false ;*/ | |
| 987 | |
| 988 } | |
| 989 samFlag = flag ; | |
| 990 for ( i = 0 ; col[5][i] ; ++i ) | |
| 991 if ( col[5][i] == 'N' ) | |
| 992 break ; | |
| 993 | |
| 994 if ( !col[5][i] ) | |
| 995 continue ; | |
| 996 | |
| 997 // remove .1, .2 or /1, /2 suffix | |
| 998 if ( hasMateReadIdSuffix ) | |
| 999 { | |
| 1000 char *s = col[0] ; | |
| 1001 int len = strlen( s ) ; | |
| 1002 if ( len >= 2 && ( s[len - 1] == '1' || s[len - 1] == '2' ) | |
| 1003 && ( s[len - 2] == '.' || s[len - 2] == '/' ) ) | |
| 1004 { | |
| 1005 s[len - 2] = '\0' ; | |
| 1006 } | |
| 1007 } | |
| 1008 | |
| 1009 if ( useSam ) | |
| 1010 { | |
| 1011 if ( bam_aux_get( b, "XS" ) ) | |
| 1012 { | |
| 1013 strand = bam_aux2A( bam_aux_get( b, "XS" ) ) ; | |
| 1014 if ( bam_aux_get( b, "YS" ) ) | |
| 1015 { | |
| 1016 noncanonStrandInfo = bam_aux2i( bam_aux_get( b, "YS" ) ) ; | |
| 1017 } | |
| 1018 else | |
| 1019 { | |
| 1020 noncanonStrandInfo = -1 ; | |
| 1021 } | |
| 1022 } | |
| 1023 else | |
| 1024 { | |
| 1025 strand = '?' ; | |
| 1026 noncanonStrandInfo = -1 ; | |
| 1027 } | |
| 1028 startLocation = b->core.pos + 1 ; | |
| 1029 } | |
| 1030 else | |
| 1031 { | |
| 1032 if ( strstr( line, "XS:A:-" ) ) // on negative strand | |
| 1033 strand = '-' ; | |
| 1034 else if ( strstr( line, "XS:A:+" ) ) | |
| 1035 strand = '+' ; | |
| 1036 else | |
| 1037 { | |
| 1038 strand = '?' ; | |
| 1039 char *p = strstr( line, "YS:i:" ) ; | |
| 1040 if ( p != NULL ) | |
| 1041 { | |
| 1042 p += 5 ; | |
| 1043 noncanonStrandInfo = atoi( p ) ; | |
| 1044 } | |
| 1045 else | |
| 1046 noncanonStrandInfo = -1 ; | |
| 1047 } | |
| 1048 startLocation = atoi( col[3] ) ; | |
| 1049 } | |
| 1050 | |
| 1051 // Found the junctions from the read. | |
| 1052 if ( strcmp( prevChrome, col[2] ) ) | |
| 1053 { | |
| 1054 if ( flagPrintJunction ) | |
| 1055 { | |
| 1056 // Print the remaining elements in the queue | |
| 1057 i = qHead ; | |
| 1058 while ( i != qTail ) | |
| 1059 { | |
| 1060 PrintJunction( prevChrome, junctionQueue[i] ) ; | |
| 1061 ++i ; | |
| 1062 if ( i >= QUEUE_SIZE ) | |
| 1063 i = 0 ; | |
| 1064 } | |
| 1065 } | |
| 1066 | |
| 1067 // new chromosome | |
| 1068 ClearReadTree( contradictedReads ) ; | |
| 1069 contradictedReads = NULL ; | |
| 1070 qHead = qTail = 0 ; | |
| 1071 strcpy( prevChrome, col[2] ) ; | |
| 1072 } | |
| 1073 | |
| 1074 if ( flagRemove ) | |
| 1075 { | |
| 1076 if ( CompareJunctions( startLocation, col[5] ) ) | |
| 1077 { | |
| 1078 // Test whether this read has new junctions | |
| 1079 //++junctionCnt ; | |
| 1080 if ( !flagPrintJunction ) | |
| 1081 printf( "%s", line ) ; | |
| 1082 } | |
| 1083 } | |
| 1084 else | |
| 1085 { | |
| 1086 ++junctionCnt ; | |
| 1087 printf( "%s", line ) ; | |
| 1088 } | |
| 1089 //printf( "hi2 %s\n", col[0] ) ; | |
| 1090 } | |
| 1091 | |
| 1092 if ( flagPrintJunction ) | |
| 1093 { | |
| 1094 // Print the remaining elements in the queue | |
| 1095 i = qHead ; | |
| 1096 while ( i != qTail ) | |
| 1097 { | |
| 1098 PrintJunction( prevChrome, junctionQueue[i] ) ; | |
| 1099 ++i ; | |
| 1100 if ( i >= QUEUE_SIZE ) | |
| 1101 i = 0 ; | |
| 1102 } | |
| 1103 } | |
| 1104 | |
| 1105 //fprintf( stderr, "The number of junctions: %d\n", junctionCnt ) ; | |
| 1106 return 0 ; | |
| 1107 } | |
| 1108 |
