Mercurial > repos > lsong10 > psiclass
view PsiCLASS-1.0.2/FindJunction.cpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
line wrap: on
line source
/* Find junctions from the SAM file generated by Tophat2. The SAM file should be sorted. The junction is the start and the end coordinates of the spliced region in this program. The output of the junction is a bit different. Usage: ./a.out input [option] >output. ./a.out -h for help. */ #include <stdio.h> #include <string.h> #include <stdlib.h> #include "sam.h" #define LINE_SIZE 4097 #define QUEUE_SIZE 10001 #define HASH_MAX 1000003 struct _readTree { char id[256] ; int leftAnchor, rightAnchor ; int pos ; //bool secondary ; bool valid ; int editDistance ; int NH, cnt ; // if cnt < NH, then it has real secondary match for this splice junction struct _readTree *left, *right ; int flag ;// The flag from sam head. } ; // The structure of a junction struct _junction { int start, end ; int readCnt ; // The # of reads containing this junction int secReadCnt ; // The # of reads whose secondary alignment has this junction. char strand ; // On '+' or '-' strand int leftAnchor, rightAnchor ; // The longest left and right anchor int oppositeAnchor ; // The longest anchor of the shorter side. int uniqEditDistance, secEditDistance ; struct _readTree head ; } ; char line[LINE_SIZE] ; char col[11][LINE_SIZE] ; // The option fields is not needed. char strand ; // Extract XS field char noncanonStrandInfo ; //bool secondary ; int NH ; int editDistance ; int mateStart ; int filterYS ; int samFlag ; struct _junction junctionQueue[QUEUE_SIZE] ; // Expected only a few junctions in it for each read. This queue is sorted. int qHead, qTail ; bool flagPrintJunction ; bool flagPrintAll ; bool flagStrict ; int junctionCnt ; bool anchorBoth ; bool validRead ; int flank ; struct _cigarSeg { int len ; char type ; } ; struct _readTree *contradictedReads ; char nucToNum[26] = { 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4 } ; void PrintHelp() { printf( "Prints reads from the SAM/BAM file that containing junctions.\n" "Usage: ./a.out input [option]>output\n" "Options:\n" "\t-j xx [-B]: Output the junctions using 1-based coordinates. The format is \"reference id\" start end \"# of read\" strand.(They are sorted)\n and the xx is an integer means the maximum unqualified anchor length for a splice junction(default=8). If -B, the splice junction must be supported by a read whose both anchors are longer than xx.\n" "\t-a: Output all the junctions, and use non-positive support number to indicate unqualified junctions.\n" "\t-y: If the bits from YS field of bam matches the argument, we filter the alignment (default: 4).\n" ) ; } void GetJunctionInfo( struct _junction &junc, struct _readTree *p ) { if ( p == NULL ) return ; if ( p->valid ) { //if ( junc.start == 22381343 + 1 && junc.end == 22904987 - 1 ) // printf( "%s %d %d %d\n", p->id, p->leftAnchor, p->rightAnchor, p->flag ) ; if ( p->cnt < p->NH ) { junc.secEditDistance += p->editDistance ; ++junc.secReadCnt ; } else { junc.uniqEditDistance += p->editDistance ; ++junc.readCnt ; } int l = p->leftAnchor, r = p->rightAnchor ; if ( !anchorBoth ) { if ( l > junc.leftAnchor ) junc.leftAnchor = l ; if ( r > junc.rightAnchor ) junc.rightAnchor = r ; if ( l <= r && l > junc.oppositeAnchor ) junc.oppositeAnchor = l ; else if ( r < l && r > junc.oppositeAnchor ) junc.oppositeAnchor = r ; } else { if ( l > flank && r > flank ) { junc.leftAnchor = l ; junc.rightAnchor = r ; } if ( l <= r && l > junc.oppositeAnchor ) junc.oppositeAnchor = l ; else if ( r < l && r > junc.oppositeAnchor ) junc.oppositeAnchor = r ; } } GetJunctionInfo( junc, p->left ) ; GetJunctionInfo( junc, p->right ) ; } void PrintJunctionReads( struct _junction &junc, struct _readTree *p ) { if ( p == NULL ) return ; if ( p->valid ) printf( "%s\n", p->id ) ; PrintJunctionReads( junc, p->left ) ; PrintJunctionReads( junc, p->right ) ; } void PrintJunction( char *chrome, struct _junction &junc ) { int sum ; junc.leftAnchor = 0 ; junc.rightAnchor = 0 ; junc.oppositeAnchor = 0 ; junc.readCnt = 0 ; junc.secReadCnt = 0 ; junc.uniqEditDistance = 0 ; junc.secEditDistance = 0 ; GetJunctionInfo( junc, &junc.head ) ; sum = junc.readCnt + junc.secReadCnt ; if ( junc.leftAnchor <= flank || junc.rightAnchor <= flank || ( junc.readCnt + junc.secReadCnt <= 0 ) ) { if ( flagPrintAll ) { sum = -sum ; } else return ; } /*if ( junc.end - junc.start + 1 >= 200000 ) { if ( junc.secReadCnt > 0 ) junc.secReadCnt = 1 ; }*/ /*if ( junc.oppositeAnchor <= ( ( flank / 2 < 1 ) ? flank / 2 : 1 ) && ( junc.readCnt + junc.secReadCnt ) <= 10 ) { if ( junc.readCnt > 0 ) junc.readCnt = 1 ; if ( junc.secReadCnt > 0 ) junc.secReadCnt = 0 ; }*/ printf( "%s %d %d %d %c %d %d %d %d\n", chrome, junc.start - 1, junc.end + 1, sum, junc.strand, junc.readCnt, junc.secReadCnt, junc.uniqEditDistance, junc.secEditDistance ) ; //PrintJunctionReads( junc, &junc.head ) ; } void ClearReadTree( struct _readTree *p ) { if ( p == NULL ) return ; ClearReadTree( p->left ) ; ClearReadTree( p->right ) ; free( p ) ; } // Insert to the read tree bool InsertReadTree( struct _readTree *p, char *id, int l, int r ) { int tmp = strcmp( p->id, id ) ; if ( tmp == 0 ) { p->cnt += 1 ; return true ; } else if ( tmp < 0 ) { if ( p->left ) return InsertReadTree( p->left, id, l, r ) ; else { p->left = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; strcpy( p->left->id, id ) ; p->left->leftAnchor = l ; p->left->rightAnchor = r ; //p->left->secondary = secondary ; p->left->editDistance = editDistance ; p->left->left = p->left->right = NULL ; p->left->valid = validRead ; p->left->cnt = 1 ; p->left->NH = NH ; p->left->flag = samFlag ; return false ; } } else { if ( p->right ) return InsertReadTree( p->right, id, l, r ) ; else { p->right = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; strcpy( p->right->id, id ) ; p->right->leftAnchor = l ; p->right->rightAnchor = r ; //p->right->secondary = secondary ; p->right->editDistance = editDistance ; p->right->left = p->right->right = NULL ; p->right->valid = validRead ; p->right->cnt = 1 ; p->right->NH = NH ; p->right->flag = samFlag ; return false ; } } } bool SearchContradictedReads( struct _readTree *p, char *id, int pos ) { if ( p == NULL ) return false ; int tmp = strcmp( p->id, id ) ; if ( tmp == 0 && p->pos == pos ) return true ; else if ( tmp <= 0 ) return SearchContradictedReads( p->left, id, pos ) ; else return SearchContradictedReads( p->right, id, pos ) ; } bool InsertContradictedReads( struct _readTree *p, char *id, int pos ) { if ( p == NULL ) { contradictedReads = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; strcpy( contradictedReads->id, id ) ; contradictedReads->pos = pos ; contradictedReads->left = contradictedReads->right = NULL ; return false ; } int tmp = strcmp( p->id, id ) ; if ( tmp == 0 && p->pos == pos ) return true ; else if ( tmp <= 0 ) { if ( p->left ) return InsertContradictedReads( p->left, id, pos ) ; else { p->left = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; strcpy( p->left->id, id ) ; p->left->pos = pos ; p->left->left = p->left->right = NULL ; return false ; } } else { if ( p->right ) return InsertContradictedReads( p->right, id, pos ) ; else { p->right = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; strcpy( p->right->id, id ) ; p->right->pos = pos ; p->right->left = p->right->right = NULL ; return false ; } } } struct _readTree *GetReadTreeNode( struct _readTree *p, char *id ) { if ( p == NULL ) return NULL ; int tmp = strcmp( p->id, id ) ; if ( tmp == 0 ) return p ; else if ( tmp < 0 ) return GetReadTreeNode( p->left, id ) ; else return GetReadTreeNode( p->right, id ) ; } // Insert the new junction into the queue, // and make sure the queue is sorted. // Assume each junction range is only on one strand. // l, r is the left and right anchor from a read void InsertQueue( int start, int end, int l, int r ) { int i, j ; i = qTail ; while ( i != qHead ) { j = i - 1 ; if ( j < 0 ) j = QUEUE_SIZE - 1 ; if ( junctionQueue[j].start < start || ( junctionQueue[j].start == start && junctionQueue[j].end < end ) ) break ; junctionQueue[i] = junctionQueue[j] ; --i ; if ( i < 0 ) i = QUEUE_SIZE - 1 ; } junctionQueue[i].start = start ; junctionQueue[i].end = end ; /*if ( !secondary ) { junctionQueue[i].readCnt = 1 ; junctionQueue[i].secReadCnt = 0 ; } else { junctionQueue[i].readCnt = 0 ; junctionQueue[i].secReadCnt = 1 ; }*/ junctionQueue[i].strand = strand ; junctionQueue[i].leftAnchor = l ; junctionQueue[i].rightAnchor = r ; strcpy( junctionQueue[i].head.id, col[0] ) ; junctionQueue[i].head.valid = validRead ; junctionQueue[i].head.leftAnchor = l ; junctionQueue[i].head.rightAnchor = r ; junctionQueue[i].head.left = junctionQueue[i].head.right = NULL ; //junctionQueue[i].head.secondary = secondary ; junctionQueue[i].head.NH = NH ; junctionQueue[i].head.cnt = 1 ; junctionQueue[i].head.editDistance = editDistance ; ++qTail ; if ( qTail >= QUEUE_SIZE ) qTail = 0 ; } // Search the queue, and remove the heads if its end(start) is smaller than prune.(Because it is impossible to have that junction again) // Add the junction to the queue, if it is not in it. // Return true, if it finds the junction. Otherwise, return false. bool SearchQueue( int start, int end, int prune, int l, int r ) { int i ; // Test whether this read might be a false alignment. i = qHead ; while ( i != qTail ) { struct _readTree *rt ; if ( ( junctionQueue[i].start == start && junctionQueue[i].end < end ) || ( junctionQueue[i].start > start && junctionQueue[i].end == end ) ) { // This alignment is false ; rt = GetReadTreeNode( &junctionQueue[i].head, col[0] ) ; // the commented out logic because it is handled by contradicted reads if ( rt != NULL ) //&& ( rt->flag & 0x40 ) != ( samFlag & 0x40 ) ) { if ( rt->leftAnchor <= flank || rt->rightAnchor <= flank )//|| rt->secondary ) { if ( l > flank && r > flank )//&& !secondary ) rt->valid = false ; else validRead = false ; } else { // Ignore this read //return true ; validRead = false ; } } } else if ( ( junctionQueue[i].start == start && junctionQueue[i].end > end ) || ( junctionQueue[i].start < start && junctionQueue[i].end == end ) ) { // This other alignment is false ; rt = GetReadTreeNode( &junctionQueue[i].head, col[0] ) ; //if ( rt != NULL ) if ( rt != NULL ) //&& ( rt->flag & 0x40 ) != ( samFlag & 0x40 ) ) { if ( l <= flank || r <= flank )//|| secondary ) { if ( rt->leftAnchor > flank && rt->rightAnchor > flank )//&& !rt->secondary ) validRead = false ; else rt->valid = false ; } else rt->valid = false ; } } ++i ; if ( i >= QUEUE_SIZE ) i = 0 ; } //if ( start == 8077108 && end == 8078439 ) // exit( 1 ) ; i = qHead ; while ( i != qTail ) { if ( junctionQueue[i].start == start && junctionQueue[i].end == end ) { if (junctionQueue[i].strand == '?' && junctionQueue[i].strand != strand ) { junctionQueue[i].strand = strand ; } InsertReadTree( &junctionQueue[i].head, col[0], l, r ) ; return true ; } if ( junctionQueue[i].end < prune && i == qHead ) { // pop PrintJunction( col[2], junctionQueue[i] ) ; ClearReadTree( junctionQueue[i].head.left ) ; ClearReadTree( junctionQueue[i].head.right ) ; ++qHead ; if ( qHead >= QUEUE_SIZE ) qHead = 0 ; } ++i ; if ( i >= QUEUE_SIZE ) i = 0 ; } InsertQueue( start, end, l, r ) ; return false ; } // Compute the junctions based on the CIGAR bool CompareJunctions( int startLocation, char *cigar ) { int currentLocation = startLocation ; // Current location on the reference genome int i, j ; int num ; int newJuncCnt = 0 ; // The # of junctions in the read, and the # of new junctions among them. struct _cigarSeg cigarSeg[1000] ; // A segment of the cigar. int ccnt = 0 ; // cigarSeg cnt j = 0 ; num = 0 ; validRead = true ; for ( i = 0 ; cigar[i] ; ++i ) { if ( cigar[i] >= '0' && cigar[i] <= '9' ) { num = num * 10 + cigar[i] - '0' ; } else { cigarSeg[ccnt].len = num ; cigarSeg[ccnt].type = cigar[i] ; ++ccnt ; num = 0 ; } } // Filter low complex alignment. // Only applies this to alignments does not have a strand information. if ( strand == '?' ) { if ( noncanonStrandInfo != -1 ) { if ( ( noncanonStrandInfo & filterYS ) != 0 ) validRead = false ; } else { int softStart = -1 ; int softEnd = 0 ; if ( cigarSeg[0].type == 'S' ) softStart = cigarSeg[0].len ; if ( cigarSeg[ ccnt - 1 ].type == 'S' ) softEnd = cigarSeg[ ccnt - 1 ].len ; int readLen = strlen( col[9] ) ; int count[5] = { 0, 0, 0, 0, 0 } ; int pos = 0 ; for ( i = 0 ; i < ccnt ; ++i ) { switch ( cigarSeg[i].type ) { case 'S': pos += cigarSeg[i].len ; case 'M': case 'I': { for ( j = 0 ; j < cigarSeg[i].len ; ++j ) ++count[ nucToNum[ col[9][pos + j] - 'A' ] ] ; pos += j ; } break ; case 'N': { int max = 0 ; int sum = 0 ; for ( j = 0 ; j < 5 ; ++j ) { if ( count[j] > max ) max = count[j] ; sum += count[j] ; } if ( max > 0.8 * sum ) validRead = false ; count[0] = count[1] = count[2] = count[3] = count[4] = 0 ; } break ; case 'H': case 'P': case 'D': default: break ; } } int max = 0 ; int sum = 0 ; for ( j = 0 ; j < 5 ; ++j ) { if ( count[j] > max ) max = count[j] ; sum += count[j] ; } if ( max > 0.8 * sum ) validRead = false ; /* count[0] = count[1] = count[2] = count[3] = count[4] = 0 ; for ( i = softStart + 1 ; i < readLen - softEnd ; ++i ) { switch ( col[9][i] ) { case 'A': ++count[0] ; break ; case 'C': ++count[1] ; break ; case 'G': ++count[2] ; break ; case 'T': ++count[3] ; break ; default: ++count[4] ; } } int max = 0 ; for ( j = 0 ; j < 5 ; ++j ) if ( count[j] > max ) max = count[j] ; if ( max > 0.6 * ( readLen - softEnd - softStart - 1 ) ) validRead = false ;*/ } } // Test whether contradict with mate pair if ( col[6][0] == '=' ) { currentLocation = startLocation ; for ( i = 0 ; i < ccnt ; ++i ) { if ( cigarSeg[i].type == 'I' || cigarSeg[i].type == 'S' || cigarSeg[i].type == 'H' || cigarSeg[i].type == 'P' ) continue ; else if ( cigarSeg[i].type == 'N' ) { if ( mateStart >= currentLocation && mateStart <= currentLocation + cigarSeg[i].len - 1 ) { /*if ( cigarSeg[i].len == 91486 ) { printf( "%s %d %d\n", currentLocation, mateStart ) ; exit( 1 ) ; }*/ // ignore this read //return false ; InsertContradictedReads( contradictedReads, col[0], mateStart ) ; validRead = false ; break ; } } currentLocation += cigarSeg[i].len ; } i = qHead ; // Search if it falls in the splices junction created by its mate. while ( i != qTail ) { if ( mateStart < junctionQueue[i].start && junctionQueue[i].start <= startLocation && startLocation <= junctionQueue[i].end && SearchContradictedReads( contradictedReads, col[0], startLocation ) ) { validRead = false ; break ; } ++i ; if ( i >= QUEUE_SIZE ) i = 0 ; } } currentLocation = startLocation ; for ( i = 0 ; i < ccnt ; ++i ) { if ( cigarSeg[i].type == 'I' || cigarSeg[i].type == 'S' || cigarSeg[i].type == 'H' || cigarSeg[i].type == 'P' ) continue ; else if ( cigarSeg[i].type == 'N' ) { int left, right ; int tmp ; tmp = i ; while ( i > 0 && ( cigarSeg[i - 1].type == 'I' || cigarSeg[i - 1].type == 'D' ) ) --i ; if ( i > 0 && cigarSeg[i - 1].type == 'M' ) { left = cigarSeg[i - 1].len ; if ( i >= 2 && cigarSeg[i - 2].type == 'N' && left <= flank ) { left = flank + 1 ; } } else left = 0 ; i = tmp ; while ( i < ccnt && ( cigarSeg[i + 1].type == 'I' || cigarSeg[i + 1].type == 'D' ) ) ++i ; if ( i < ccnt && cigarSeg[i + 1].type == 'M' ) { right = cigarSeg[i + 1].len ; if ( i + 2 < ccnt && cigarSeg[i + 2].type == 'N' && right <= flank ) { right = flank + 1 ; } } else right = 0 ; i = tmp ; if ( !SearchQueue( currentLocation, currentLocation + cigarSeg[i].len - 1, startLocation, left, right ) ) { ++newJuncCnt ; } } currentLocation += cigarSeg[i].len ; } /*else if ( cigar[i] == 'I' ) { num = 0 ; } else if ( cigar[i] == 'N' ) { if ( !SearchQueue( currentLocation, currentLocation + num - 1, startLocation ) ) { ++newJuncCnt ; //if ( flagPrintJunction ) // printf( "%s %d %d\n", col[2], currentLocation - 2, currentLocation + len - 1 ) ; } currentLocation += num ; num = 0 ; } else // Other operations, like M, D,..., { currentLocation += num ; num = 0 ; }*/ junctionCnt += newJuncCnt ; if ( newJuncCnt ) return true ; else return false ; } void cigar2string( bam1_core_t *c, uint32_t *in_cigar, char *out_cigar ) { int k, op, l ; char opcode ; *out_cigar = '\0' ; for ( k = 0 ; k < c->n_cigar ; ++k ) { op = in_cigar[k] & BAM_CIGAR_MASK ; l = in_cigar[k] >> BAM_CIGAR_SHIFT ; switch (op) { case BAM_CMATCH: opcode = 'M' ; break ; case BAM_CINS: opcode = 'I' ; break ; case BAM_CDEL: opcode = 'D' ; break ; case BAM_CREF_SKIP: opcode = 'N' ; break ; case BAM_CSOFT_CLIP: opcode = 'S' ; break ; case BAM_CHARD_CLIP: opcode = 'H' ; break ; case BAM_CPAD: opcode = 'P' ; break ; } sprintf( out_cigar + strlen( out_cigar ), "%d%c", l, opcode ) ; } } int main( int argc, char *argv[] ) { FILE *fp ; samfile_t *fpsam ; bam1_t *b = NULL ; bool useSam = true ; int i, len ; int startLocation ; bool flagRemove = false ; char prevChrome[103] ; anchorBoth = false ; flagPrintJunction = false ; flagPrintAll = false ; flagStrict = false ; junctionCnt = 0 ; bool hasMateReadIdSuffix = false ; strcpy( prevChrome, "" ) ; flagRemove = true ; flagPrintJunction = true ; flank = 8 ; filterYS = 4 ; contradictedReads = NULL ; // processing the argument list for ( i = 1 ; i < argc ; ++i ) { if ( !strcmp( argv[i], "-h" ) ) { PrintHelp() ; return 0 ; } else if ( !strcmp( argv[i], "-r" ) ) { flagRemove = true ; } else if ( !strcmp( argv[i], "-j" ) ) { strcpy( prevChrome, "" ) ; flagRemove = true ; flagPrintJunction = true ; flank = atoi( argv[i + 1] ) ; if ( i + 2 < argc && !strcmp( argv[i+2], "-B" ) ) { anchorBoth = true ; ++i ; } ++i ; } else if ( !strcmp( argv[i], "-a" ) ) { flagPrintAll = true ; } else if ( !strcmp( argv[i], "--strict" ) ) { flagStrict = true ; } else if ( !strcmp( argv[i], "-y" ) ) { filterYS = atoi( argv[i + 1] ) ; ++i ; } else if ( !strcmp( argv[i], "--hasMateIdSuffix" ) ) { hasMateReadIdSuffix = true ; ++i ; } else if ( i > 1 ) { printf( "Unknown option %s\n", argv[i] ) ; exit( 1 ) ; } } if ( argc == 1 ) { PrintHelp() ; return 0 ; } len = strlen( argv[1] ) ; if ( argv[1][len-3] == 'b' || argv[1][len-3] == 'B' ) { if ( !( fpsam = samopen( argv[1], "rb", 0 ) ) ) return 0 ; if ( !fpsam->header ) { //samclose( fpsam ) ; //fpsam = samopen( argv[1], "r", 0 ) ; //if ( !fpsam->header ) //{ useSam = false ; fp = fopen( argv[1], "r" ) ; //} } } else { useSam = false ; fp = NULL ; if ( !strcmp( argv[1], "-" ) ) fp = stdin ; else fp = fopen( argv[1], "r" ) ; if ( fp == NULL ) { printf( "Could not open file %s\n", argv[1] ) ; return 0 ; } } while ( 1 ) { int flag = 0 ; if ( useSam ) { if ( b ) bam_destroy1( b ) ; b = bam_init1() ; if ( samread( fpsam, b ) <= 0 ) break ; if ( b->core.tid >= 0 ) strcpy( col[2], fpsam->header->target_name[b->core.tid] ) ; else strcpy( col[2], "-1" ) ; cigar2string( &(b->core), bam1_cigar( b ), col[5] ) ; strcpy( col[0], bam1_qname( b ) ) ; flag = b->core.flag ; if ( bam_aux_get( b, "NH" ) ) { /*if ( bam_aux2i( bam_aux_get(b, "NH" ) ) >= 2 ) { secondary = true ; } else secondary = false ;*/ NH = bam_aux2i( bam_aux_get( b, "NH" ) ) ; } else { //secondary = false ; NH = 1 ; } if ( bam_aux_get( b, "NM" ) ) { editDistance = bam_aux2i( bam_aux_get( b, "NM" ) ) ; } else if ( bam_aux_get( b, "nM" ) ) { editDistance = bam_aux2i( bam_aux_get( b, "nM" ) ) ; } else editDistance = 0 ; mateStart = b->core.mpos + 1 ; if ( b->core.mtid == b->core.tid ) col[6][0] = '=' ; else col[6][0] = '*' ; if ( b->core.l_qseq < 20 ) continue ; for ( i = 0 ; i < b->core.l_qseq ; ++i ) { int bit = bam1_seqi( bam1_seq( b ), i ) ; switch ( bit ) { case 1: col[9][i] = 'A' ; break ; case 2: col[9][i] = 'C' ; break ; case 4: col[9][i] = 'G' ; break ; case 8: col[9][i] = 'T' ; break ; case 15: col[9][i] = 'N' ; break ; default: col[9][i] = 'A' ; break ; } } col[9][i] = '\0' ; /*if ( flag & 0x100 ) secondary = true ; else secondary = false ;*/ } else { char *p ; if ( !fgets( line, LINE_SIZE, fp ) ) break ; if ( line[0] == '\0' || line[0] == '@' ) continue ; sscanf( line, "%s%s%s%s%s%s%s%s%s%s%s", col, col + 1, col + 2, col + 3, col + 4, col + 5, col + 6, col + 7, col + 8, col + 9, col + 10 ) ; flag = atoi( col[1] ) ; if ( p = strstr( line, "NH" ) ) { int k = 0 ; p += 5 ; while ( *p != ' ' && *p != '\t' && *p != '\0') { k = k * 10 + *p - '0' ; ++p ; } /*if ( k >= 2 ) { secondary = true ; } else secondary = false ;*/ NH = k ; } else { //secondary = false ; NH = 1 ; } if ( ( p = strstr( line, "NM" ) ) || ( p = strstr( line, "nM" ) ) ) { int k = 0 ; p += 5 ; while ( *p != ' ' && *p != '\t' && *p != '\0') { k = k * 10 + *p - '0' ; ++p ; } editDistance = k ; } else editDistance = 0 ; mateStart = atoi( col[7] ) ; /*if ( flag & 0x100 ) secondary = true ; else secondary = false ;*/ } samFlag = flag ; for ( i = 0 ; col[5][i] ; ++i ) if ( col[5][i] == 'N' ) break ; if ( !col[5][i] ) continue ; // remove .1, .2 or /1, /2 suffix if ( hasMateReadIdSuffix ) { char *s = col[0] ; int len = strlen( s ) ; if ( len >= 2 && ( s[len - 1] == '1' || s[len - 1] == '2' ) && ( s[len - 2] == '.' || s[len - 2] == '/' ) ) { s[len - 2] = '\0' ; } } if ( useSam ) { if ( bam_aux_get( b, "XS" ) ) { strand = bam_aux2A( bam_aux_get( b, "XS" ) ) ; if ( bam_aux_get( b, "YS" ) ) { noncanonStrandInfo = bam_aux2i( bam_aux_get( b, "YS" ) ) ; } else { noncanonStrandInfo = -1 ; } } else { strand = '?' ; noncanonStrandInfo = -1 ; } startLocation = b->core.pos + 1 ; } else { if ( strstr( line, "XS:A:-" ) ) // on negative strand strand = '-' ; else if ( strstr( line, "XS:A:+" ) ) strand = '+' ; else { strand = '?' ; char *p = strstr( line, "YS:i:" ) ; if ( p != NULL ) { p += 5 ; noncanonStrandInfo = atoi( p ) ; } else noncanonStrandInfo = -1 ; } startLocation = atoi( col[3] ) ; } // Found the junctions from the read. if ( strcmp( prevChrome, col[2] ) ) { if ( flagPrintJunction ) { // Print the remaining elements in the queue i = qHead ; while ( i != qTail ) { PrintJunction( prevChrome, junctionQueue[i] ) ; ++i ; if ( i >= QUEUE_SIZE ) i = 0 ; } } // new chromosome ClearReadTree( contradictedReads ) ; contradictedReads = NULL ; qHead = qTail = 0 ; strcpy( prevChrome, col[2] ) ; } if ( flagRemove ) { if ( CompareJunctions( startLocation, col[5] ) ) { // Test whether this read has new junctions //++junctionCnt ; if ( !flagPrintJunction ) printf( "%s", line ) ; } } else { ++junctionCnt ; printf( "%s", line ) ; } //printf( "hi2 %s\n", col[0] ) ; } if ( flagPrintJunction ) { // Print the remaining elements in the queue i = qHead ; while ( i != qTail ) { PrintJunction( prevChrome, junctionQueue[i] ) ; ++i ; if ( i >= QUEUE_SIZE ) i = 0 ; } } //fprintf( stderr, "The number of junctions: %d\n", junctionCnt ) ; return 0 ; }