Mercurial > repos > lsong10 > psiclass
diff PsiCLASS-1.0.2/FindJunction.cpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/FindJunction.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,1108 @@ +/* + Find junctions from the SAM file generated by Tophat2. + The SAM file should be sorted. + The junction is the start and the end coordinates of the spliced region in this program. The output of the junction is a bit different. + + Usage: ./a.out input [option] >output. + ./a.out -h for help. +*/ +#include <stdio.h> +#include <string.h> +#include <stdlib.h> + +#include "sam.h" + +#define LINE_SIZE 4097 +#define QUEUE_SIZE 10001 +#define HASH_MAX 1000003 + +struct _readTree +{ + char id[256] ; + int leftAnchor, rightAnchor ; + int pos ; + //bool secondary ; + bool valid ; + int editDistance ; + int NH, cnt ; // if cnt < NH, then it has real secondary match for this splice junction + struct _readTree *left, *right ; + + int flag ;// The flag from sam head. +} ; + +// The structure of a junction +struct _junction +{ + int start, end ; + int readCnt ; // The # of reads containing this junction + int secReadCnt ; // The # of reads whose secondary alignment has this junction. + char strand ; // On '+' or '-' strand + int leftAnchor, rightAnchor ; // The longest left and right anchor + int oppositeAnchor ; // The longest anchor of the shorter side. + int uniqEditDistance, secEditDistance ; + struct _readTree head ; +} ; + +char line[LINE_SIZE] ; +char col[11][LINE_SIZE] ; // The option fields is not needed. +char strand ; // Extract XS field +char noncanonStrandInfo ; +//bool secondary ; +int NH ; +int editDistance ; +int mateStart ; +int filterYS ; +int samFlag ; + +struct _junction junctionQueue[QUEUE_SIZE] ; // Expected only a few junctions in it for each read. This queue is sorted. + +int qHead, qTail ; + +bool flagPrintJunction ; +bool flagPrintAll ; +bool flagStrict ; +int junctionCnt ; +bool anchorBoth ; +bool validRead ; + +int flank ; +struct _cigarSeg +{ + int len ; + char type ; +} ; + +struct _readTree *contradictedReads ; + +char nucToNum[26] = { 0, 4, 1, 4, 4, 4, 2, + 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 3, + 4, 4, 4, 4, 4, 4 } ; + +void PrintHelp() +{ + printf( + "Prints reads from the SAM/BAM file that containing junctions.\n" + "Usage: ./a.out input [option]>output\n" + "Options:\n" + "\t-j xx [-B]: Output the junctions using 1-based coordinates. The format is \"reference id\" start end \"# of read\" strand.(They are sorted)\n and the xx is an integer means the maximum unqualified anchor length for a splice junction(default=8). If -B, the splice junction must be supported by a read whose both anchors are longer than xx.\n" + "\t-a: Output all the junctions, and use non-positive support number to indicate unqualified junctions.\n" + "\t-y: If the bits from YS field of bam matches the argument, we filter the alignment (default: 4).\n" + ) ; +} + +void GetJunctionInfo( struct _junction &junc, struct _readTree *p ) +{ + if ( p == NULL ) + return ; + + if ( p->valid ) + { + //if ( junc.start == 22381343 + 1 && junc.end == 22904987 - 1 ) + // printf( "%s %d %d %d\n", p->id, p->leftAnchor, p->rightAnchor, p->flag ) ; + + + + if ( p->cnt < p->NH ) + { + junc.secEditDistance += p->editDistance ; + ++junc.secReadCnt ; + } + else + { + junc.uniqEditDistance += p->editDistance ; + ++junc.readCnt ; + } + int l = p->leftAnchor, r = p->rightAnchor ; + + if ( !anchorBoth ) + { + if ( l > junc.leftAnchor ) + junc.leftAnchor = l ; + if ( r > junc.rightAnchor ) + junc.rightAnchor = r ; + + if ( l <= r && l > junc.oppositeAnchor ) + junc.oppositeAnchor = l ; + else if ( r < l && r > junc.oppositeAnchor ) + junc.oppositeAnchor = r ; + } + else + { + if ( l > flank && r > flank ) + { + junc.leftAnchor = l ; + junc.rightAnchor = r ; + } + + if ( l <= r && l > junc.oppositeAnchor ) + junc.oppositeAnchor = l ; + else if ( r < l && r > junc.oppositeAnchor ) + junc.oppositeAnchor = r ; + } + } + GetJunctionInfo( junc, p->left ) ; + GetJunctionInfo( junc, p->right ) ; +} + +void PrintJunctionReads( struct _junction &junc, struct _readTree *p ) +{ + if ( p == NULL ) + return ; + + if ( p->valid ) + printf( "%s\n", p->id ) ; + PrintJunctionReads( junc, p->left ) ; + PrintJunctionReads( junc, p->right ) ; +} + +void PrintJunction( char *chrome, struct _junction &junc ) +{ + int sum ; + junc.leftAnchor = 0 ; + junc.rightAnchor = 0 ; + junc.oppositeAnchor = 0 ; + junc.readCnt = 0 ; + junc.secReadCnt = 0 ; + junc.uniqEditDistance = 0 ; + junc.secEditDistance = 0 ; + GetJunctionInfo( junc, &junc.head ) ; + + sum = junc.readCnt + junc.secReadCnt ; + + if ( junc.leftAnchor <= flank || junc.rightAnchor <= flank || ( junc.readCnt + junc.secReadCnt <= 0 ) ) + { + if ( flagPrintAll ) + { + sum = -sum ; + } + else + return ; + } + + /*if ( junc.end - junc.start + 1 >= 200000 ) + { + if ( junc.secReadCnt > 0 ) + junc.secReadCnt = 1 ; + }*/ + + /*if ( junc.oppositeAnchor <= ( ( flank / 2 < 1 ) ? flank / 2 : 1 ) && ( junc.readCnt + junc.secReadCnt ) <= 10 ) + { + if ( junc.readCnt > 0 ) + junc.readCnt = 1 ; + if ( junc.secReadCnt > 0 ) + junc.secReadCnt = 0 ; + }*/ + + printf( "%s %d %d %d %c %d %d %d %d\n", chrome, junc.start - 1, junc.end + 1, sum, junc.strand, + junc.readCnt, junc.secReadCnt, junc.uniqEditDistance, junc.secEditDistance ) ; + //PrintJunctionReads( junc, &junc.head ) ; +} + +void ClearReadTree( struct _readTree *p ) +{ + if ( p == NULL ) + return ; + ClearReadTree( p->left ) ; + ClearReadTree( p->right ) ; + free( p ) ; +} + +// Insert to the read tree +bool InsertReadTree( struct _readTree *p, char *id, int l, int r ) +{ + int tmp = strcmp( p->id, id ) ; + if ( tmp == 0 ) + { + p->cnt += 1 ; + return true ; + } + else if ( tmp < 0 ) + { + if ( p->left ) + return InsertReadTree( p->left, id, l, r ) ; + else + { + p->left = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; + strcpy( p->left->id, id ) ; + p->left->leftAnchor = l ; + p->left->rightAnchor = r ; + //p->left->secondary = secondary ; + p->left->editDistance = editDistance ; + p->left->left = p->left->right = NULL ; + p->left->valid = validRead ; + p->left->cnt = 1 ; + p->left->NH = NH ; + p->left->flag = samFlag ; + return false ; + } + } + else + { + if ( p->right ) + return InsertReadTree( p->right, id, l, r ) ; + else + { + p->right = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; + strcpy( p->right->id, id ) ; + p->right->leftAnchor = l ; + p->right->rightAnchor = r ; + //p->right->secondary = secondary ; + p->right->editDistance = editDistance ; + p->right->left = p->right->right = NULL ; + p->right->valid = validRead ; + p->right->cnt = 1 ; + p->right->NH = NH ; + p->right->flag = samFlag ; + return false ; + } + } +} + + +bool SearchContradictedReads( struct _readTree *p, char *id, int pos ) +{ + if ( p == NULL ) + return false ; + int tmp = strcmp( p->id, id ) ; + if ( tmp == 0 && p->pos == pos ) + return true ; + else if ( tmp <= 0 ) + return SearchContradictedReads( p->left, id, pos ) ; + else + return SearchContradictedReads( p->right, id, pos ) ; +} + +bool InsertContradictedReads( struct _readTree *p, char *id, int pos ) +{ + if ( p == NULL ) + { + contradictedReads = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; + strcpy( contradictedReads->id, id ) ; + contradictedReads->pos = pos ; + contradictedReads->left = contradictedReads->right = NULL ; + return false ; + } + int tmp = strcmp( p->id, id ) ; + if ( tmp == 0 && p->pos == pos ) + return true ; + else if ( tmp <= 0 ) + { + if ( p->left ) + return InsertContradictedReads( p->left, id, pos ) ; + else + { + p->left = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; + strcpy( p->left->id, id ) ; + p->left->pos = pos ; + p->left->left = p->left->right = NULL ; + return false ; + } + } + else + { + if ( p->right ) + return InsertContradictedReads( p->right, id, pos ) ; + else + { + p->right = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; + strcpy( p->right->id, id ) ; + p->right->pos = pos ; + p->right->left = p->right->right = NULL ; + return false ; + } + } +} + +struct _readTree *GetReadTreeNode( struct _readTree *p, char *id ) +{ + if ( p == NULL ) + return NULL ; + int tmp = strcmp( p->id, id ) ; + if ( tmp == 0 ) + return p ; + else if ( tmp < 0 ) + return GetReadTreeNode( p->left, id ) ; + else + return GetReadTreeNode( p->right, id ) ; +} + +// Insert the new junction into the queue, +// and make sure the queue is sorted. +// Assume each junction range is only on one strand. +// l, r is the left and right anchor from a read +void InsertQueue( int start, int end, int l, int r ) +{ + int i, j ; + i = qTail ; + + + while ( i != qHead ) + { + j = i - 1 ; + if ( j < 0 ) + j = QUEUE_SIZE - 1 ; + + if ( junctionQueue[j].start < start + || ( junctionQueue[j].start == start && junctionQueue[j].end < end ) ) + break ; + + junctionQueue[i] = junctionQueue[j] ; + --i ; + + if ( i < 0 ) + i = QUEUE_SIZE - 1 ; + } + + junctionQueue[i].start = start ; + junctionQueue[i].end = end ; + /*if ( !secondary ) + { + junctionQueue[i].readCnt = 1 ; + junctionQueue[i].secReadCnt = 0 ; + } + else + { + junctionQueue[i].readCnt = 0 ; + junctionQueue[i].secReadCnt = 1 ; + }*/ + junctionQueue[i].strand = strand ; + junctionQueue[i].leftAnchor = l ; + junctionQueue[i].rightAnchor = r ; + + strcpy( junctionQueue[i].head.id, col[0] ) ; + junctionQueue[i].head.valid = validRead ; + junctionQueue[i].head.leftAnchor = l ; + junctionQueue[i].head.rightAnchor = r ; + junctionQueue[i].head.left = junctionQueue[i].head.right = NULL ; + //junctionQueue[i].head.secondary = secondary ; + junctionQueue[i].head.NH = NH ; + junctionQueue[i].head.cnt = 1 ; + junctionQueue[i].head.editDistance = editDistance ; + + ++qTail ; + if ( qTail >= QUEUE_SIZE ) + qTail = 0 ; +} + +// Search the queue, and remove the heads if its end(start) is smaller than prune.(Because it is impossible to have that junction again) +// Add the junction to the queue, if it is not in it. +// Return true, if it finds the junction. Otherwise, return false. +bool SearchQueue( int start, int end, int prune, int l, int r ) +{ + int i ; + + // Test whether this read might be a false alignment. + i = qHead ; + while ( i != qTail ) + { + struct _readTree *rt ; + if ( ( junctionQueue[i].start == start && junctionQueue[i].end < end ) || + ( junctionQueue[i].start > start && junctionQueue[i].end == end ) ) + { + // This alignment is false ; + rt = GetReadTreeNode( &junctionQueue[i].head, col[0] ) ; + // the commented out logic because it is handled by contradicted reads + if ( rt != NULL ) //&& ( rt->flag & 0x40 ) != ( samFlag & 0x40 ) ) + { + if ( rt->leftAnchor <= flank || rt->rightAnchor <= flank )//|| rt->secondary ) + { + if ( l > flank && r > flank )//&& !secondary ) + rt->valid = false ; + else + validRead = false ; + } + else + { + // Ignore this read + //return true ; + validRead = false ; + } + } + } + else if ( ( junctionQueue[i].start == start && junctionQueue[i].end > end ) || + ( junctionQueue[i].start < start && junctionQueue[i].end == end ) ) + { + // This other alignment is false ; + rt = GetReadTreeNode( &junctionQueue[i].head, col[0] ) ; + //if ( rt != NULL ) + if ( rt != NULL ) //&& ( rt->flag & 0x40 ) != ( samFlag & 0x40 ) ) + { + if ( l <= flank || r <= flank )//|| secondary ) + { + if ( rt->leftAnchor > flank && rt->rightAnchor > flank )//&& !rt->secondary ) + validRead = false ; + else + rt->valid = false ; + } + else + rt->valid = false ; + } + } + ++i ; + if ( i >= QUEUE_SIZE ) + i = 0 ; + } + + //if ( start == 8077108 && end == 8078439 ) + // exit( 1 ) ; + i = qHead ; + + while ( i != qTail ) + { + if ( junctionQueue[i].start == start && + junctionQueue[i].end == end ) + { + if (junctionQueue[i].strand == '?' && junctionQueue[i].strand != strand ) + { + junctionQueue[i].strand = strand ; + } + InsertReadTree( &junctionQueue[i].head, col[0], l, r ) ; + return true ; + } + + if ( junctionQueue[i].end < prune && i == qHead ) + { + // pop + PrintJunction( col[2], junctionQueue[i] ) ; + ClearReadTree( junctionQueue[i].head.left ) ; + ClearReadTree( junctionQueue[i].head.right ) ; + ++qHead ; + if ( qHead >= QUEUE_SIZE ) + qHead = 0 ; + } + ++i ; + if ( i >= QUEUE_SIZE ) + i = 0 ; + } + + InsertQueue( start, end, l, r ) ; + return false ; +} + +// Compute the junctions based on the CIGAR +bool CompareJunctions( int startLocation, char *cigar ) +{ + int currentLocation = startLocation ; // Current location on the reference genome + int i, j ; + int num ; + int newJuncCnt = 0 ; // The # of junctions in the read, and the # of new junctions among them. + + struct _cigarSeg cigarSeg[1000] ; // A segment of the cigar. + int ccnt = 0 ; // cigarSeg cnt + + j = 0 ; + num = 0 ; + validRead = true ; + + for ( i = 0 ; cigar[i] ; ++i ) + { + if ( cigar[i] >= '0' && cigar[i] <= '9' ) + { + num = num * 10 + cigar[i] - '0' ; + } + else + { + cigarSeg[ccnt].len = num ; + cigarSeg[ccnt].type = cigar[i] ; + ++ccnt ; + num = 0 ; + } + } + + // Filter low complex alignment. + // Only applies this to alignments does not have a strand information. + if ( strand == '?' ) + { + if ( noncanonStrandInfo != -1 ) + { + if ( ( noncanonStrandInfo & filterYS ) != 0 ) + validRead = false ; + } + else + { + int softStart = -1 ; + int softEnd = 0 ; + if ( cigarSeg[0].type == 'S' ) + softStart = cigarSeg[0].len ; + if ( cigarSeg[ ccnt - 1 ].type == 'S' ) + softEnd = cigarSeg[ ccnt - 1 ].len ; + int readLen = strlen( col[9] ) ; + int count[5] = { 0, 0, 0, 0, 0 } ; + + int pos = 0 ; + for ( i = 0 ; i < ccnt ; ++i ) + { + switch ( cigarSeg[i].type ) + { + case 'S': + pos += cigarSeg[i].len ; + case 'M': + case 'I': + { + for ( j = 0 ; j < cigarSeg[i].len ; ++j ) + ++count[ nucToNum[ col[9][pos + j] - 'A' ] ] ; + pos += j ; + } break ; + case 'N': + { + int max = 0 ; + int sum = 0 ; + for ( j = 0 ; j < 5 ; ++j ) + { + if ( count[j] > max ) + max = count[j] ; + sum += count[j] ; + } + if ( max > 0.8 * sum ) + validRead = false ; + count[0] = count[1] = count[2] = count[3] = count[4] = 0 ; + } break ; + case 'H': + case 'P': + case 'D': + default: break ; + } + } + + int max = 0 ; + int sum = 0 ; + for ( j = 0 ; j < 5 ; ++j ) + { + if ( count[j] > max ) + max = count[j] ; + sum += count[j] ; + } + if ( max > 0.8 * sum ) + validRead = false ; + /* count[0] = count[1] = count[2] = count[3] = count[4] = 0 ; + + + for ( i = softStart + 1 ; i < readLen - softEnd ; ++i ) + { + switch ( col[9][i] ) + { + case 'A': ++count[0] ; break ; + case 'C': ++count[1] ; break ; + case 'G': ++count[2] ; break ; + case 'T': ++count[3] ; break ; + default: ++count[4] ; + } + } + int max = 0 ; + for ( j = 0 ; j < 5 ; ++j ) + if ( count[j] > max ) + max = count[j] ; + if ( max > 0.6 * ( readLen - softEnd - softStart - 1 ) ) + validRead = false ;*/ + } + } + + // Test whether contradict with mate pair + if ( col[6][0] == '=' ) + { + currentLocation = startLocation ; + for ( i = 0 ; i < ccnt ; ++i ) + { + if ( cigarSeg[i].type == 'I' || cigarSeg[i].type == 'S' || cigarSeg[i].type == 'H' + || cigarSeg[i].type == 'P' ) + continue ; + else if ( cigarSeg[i].type == 'N' ) + { + if ( mateStart >= currentLocation && mateStart <= currentLocation + cigarSeg[i].len - 1 ) + { + /*if ( cigarSeg[i].len == 91486 ) + { + printf( "%s %d %d\n", currentLocation, mateStart ) ; + exit( 1 ) ; + }*/ + // ignore this read + //return false ; + InsertContradictedReads( contradictedReads, col[0], mateStart ) ; + validRead = false ; + break ; + } + } + currentLocation += cigarSeg[i].len ; + } + + i = qHead ; + // Search if it falls in the splices junction created by its mate. + while ( i != qTail ) + { + if ( mateStart < junctionQueue[i].start && junctionQueue[i].start <= startLocation + && startLocation <= junctionQueue[i].end + && SearchContradictedReads( contradictedReads, col[0], startLocation ) ) + { + validRead = false ; + break ; + } + ++i ; + if ( i >= QUEUE_SIZE ) + i = 0 ; + } + } + + currentLocation = startLocation ; + for ( i = 0 ; i < ccnt ; ++i ) + { + if ( cigarSeg[i].type == 'I' || cigarSeg[i].type == 'S' || cigarSeg[i].type == 'H' + || cigarSeg[i].type == 'P' ) + continue ; + else if ( cigarSeg[i].type == 'N' ) + { + int left, right ; + int tmp ; + tmp = i ; + while ( i > 0 && ( cigarSeg[i - 1].type == 'I' || cigarSeg[i - 1].type == 'D' ) ) + --i ; + if ( i > 0 && cigarSeg[i - 1].type == 'M' ) + { + left = cigarSeg[i - 1].len ; + if ( i >= 2 && cigarSeg[i - 2].type == 'N' && left <= flank ) + { + left = flank + 1 ; + } + } + else + left = 0 ; + + i = tmp ; + while ( i < ccnt && ( cigarSeg[i + 1].type == 'I' || cigarSeg[i + 1].type == 'D' ) ) + ++i ; + if ( i < ccnt && cigarSeg[i + 1].type == 'M' ) + { + right = cigarSeg[i + 1].len ; + if ( i + 2 < ccnt && cigarSeg[i + 2].type == 'N' && right <= flank ) + { + right = flank + 1 ; + } + } + else + right = 0 ; + i = tmp ; + if ( !SearchQueue( currentLocation, currentLocation + cigarSeg[i].len - 1, startLocation, left, right ) ) + { + ++newJuncCnt ; + } + } + currentLocation += cigarSeg[i].len ; + } + /*else if ( cigar[i] == 'I' ) + { + num = 0 ; + } + else if ( cigar[i] == 'N' ) + { + if ( !SearchQueue( currentLocation, currentLocation + num - 1, startLocation ) ) + { + ++newJuncCnt ; + //if ( flagPrintJunction ) + // printf( "%s %d %d\n", col[2], currentLocation - 2, currentLocation + len - 1 ) ; + } + currentLocation += num ; + num = 0 ; + } + else // Other operations, like M, D,..., + { + currentLocation += num ; + num = 0 ; + }*/ + + junctionCnt += newJuncCnt ; + + if ( newJuncCnt ) + return true ; + else + return false ; +} + +void cigar2string( bam1_core_t *c, uint32_t *in_cigar, char *out_cigar ) +{ + int k, op, l ; + char opcode ; + + *out_cigar = '\0' ; + for ( k = 0 ; k < c->n_cigar ; ++k ) + { + op = in_cigar[k] & BAM_CIGAR_MASK ; + l = in_cigar[k] >> BAM_CIGAR_SHIFT ; + switch (op) + { + case BAM_CMATCH: opcode = 'M' ; break ; + case BAM_CINS: opcode = 'I' ; break ; + case BAM_CDEL: opcode = 'D' ; break ; + case BAM_CREF_SKIP: opcode = 'N' ; break ; + case BAM_CSOFT_CLIP: opcode = 'S' ; break ; + case BAM_CHARD_CLIP: opcode = 'H' ; break ; + case BAM_CPAD: opcode = 'P' ; break ; + } + sprintf( out_cigar + strlen( out_cigar ), "%d%c", l, opcode ) ; + } +} + + + +int main( int argc, char *argv[] ) +{ + FILE *fp ; + samfile_t *fpsam ; + bam1_t *b = NULL ; + bool useSam = true ; + + int i, len ; + int startLocation ; + bool flagRemove = false ; + char prevChrome[103] ; + + anchorBoth = false ; + + flagPrintJunction = false ; + flagPrintAll = false ; + flagStrict = false ; + junctionCnt = 0 ; + bool hasMateReadIdSuffix = false ; + + strcpy( prevChrome, "" ) ; + flagRemove = true ; + flagPrintJunction = true ; + flank = 8 ; + filterYS = 4 ; + + contradictedReads = NULL ; + + // processing the argument list + for ( i = 1 ; i < argc ; ++i ) + { + if ( !strcmp( argv[i], "-h" ) ) + { + PrintHelp() ; + return 0 ; + } + else if ( !strcmp( argv[i], "-r" ) ) + { + flagRemove = true ; + } + else if ( !strcmp( argv[i], "-j" ) ) + { + strcpy( prevChrome, "" ) ; + flagRemove = true ; + flagPrintJunction = true ; + flank = atoi( argv[i + 1] ) ; + if ( i + 2 < argc && !strcmp( argv[i+2], "-B" ) ) + { + anchorBoth = true ; + ++i ; + } + ++i ; + } + else if ( !strcmp( argv[i], "-a" ) ) + { + flagPrintAll = true ; + } + else if ( !strcmp( argv[i], "--strict" ) ) + { + flagStrict = true ; + } + else if ( !strcmp( argv[i], "-y" ) ) + { + filterYS = atoi( argv[i + 1] ) ; + ++i ; + } + else if ( !strcmp( argv[i], "--hasMateIdSuffix" ) ) + { + hasMateReadIdSuffix = true ; + ++i ; + } + else if ( i > 1 ) + { + printf( "Unknown option %s\n", argv[i] ) ; + exit( 1 ) ; + } + } + if ( argc == 1 ) + { + PrintHelp() ; + return 0 ; + } + + len = strlen( argv[1] ) ; + if ( argv[1][len-3] == 'b' || argv[1][len-3] == 'B' ) + { + if ( !( fpsam = samopen( argv[1], "rb", 0 ) ) ) + return 0 ; + + if ( !fpsam->header ) + { + //samclose( fpsam ) ; + //fpsam = samopen( argv[1], "r", 0 ) ; + //if ( !fpsam->header ) + //{ + useSam = false ; + fp = fopen( argv[1], "r" ) ; + //} + } + } + else + { + useSam = false ; + fp = NULL ; + if ( !strcmp( argv[1], "-" ) ) + fp = stdin ; + else + fp = fopen( argv[1], "r" ) ; + if ( fp == NULL ) + { + printf( "Could not open file %s\n", argv[1] ) ; + return 0 ; + } + } + + while ( 1 ) + { + int flag = 0 ; + if ( useSam ) + { + if ( b ) + bam_destroy1( b ) ; + b = bam_init1() ; + if ( samread( fpsam, b ) <= 0 ) + break ; + if ( b->core.tid >= 0 ) + strcpy( col[2], fpsam->header->target_name[b->core.tid] ) ; + else + strcpy( col[2], "-1" ) ; + cigar2string( &(b->core), bam1_cigar( b ), col[5] ) ; + strcpy( col[0], bam1_qname( b ) ) ; + flag = b->core.flag ; + if ( bam_aux_get( b, "NH" ) ) + { + /*if ( bam_aux2i( bam_aux_get(b, "NH" ) ) >= 2 ) + { + secondary = true ; + } + else + secondary = false ;*/ + + NH = bam_aux2i( bam_aux_get( b, "NH" ) ) ; + } + else + { + //secondary = false ; + NH = 1 ; + } + + if ( bam_aux_get( b, "NM" ) ) + { + editDistance = bam_aux2i( bam_aux_get( b, "NM" ) ) ; + } + else if ( bam_aux_get( b, "nM" ) ) + { + editDistance = bam_aux2i( bam_aux_get( b, "nM" ) ) ; + } + else + editDistance = 0 ; + + mateStart = b->core.mpos + 1 ; + if ( b->core.mtid == b->core.tid ) + col[6][0] = '=' ; + else + col[6][0] = '*' ; + + if ( b->core.l_qseq < 20 ) + continue ; + + for ( i = 0 ; i < b->core.l_qseq ; ++i ) + { + int bit = bam1_seqi( bam1_seq( b ), i ) ; + switch ( bit ) + { + case 1: col[9][i] = 'A' ; break ; + case 2: col[9][i] = 'C' ; break ; + case 4: col[9][i] = 'G' ; break ; + case 8: col[9][i] = 'T' ; break ; + case 15: col[9][i] = 'N' ; break ; + default: col[9][i] = 'A' ; break ; + } + } + col[9][i] = '\0' ; + + /*if ( flag & 0x100 ) + secondary = true ; + else + secondary = false ;*/ + } + else + { + char *p ; + if ( !fgets( line, LINE_SIZE, fp ) ) + break ; + if ( line[0] == '\0' || line[0] == '@' ) + continue ; + sscanf( line, "%s%s%s%s%s%s%s%s%s%s%s", col, col + 1, col + 2, col + 3, col + 4, + col + 5, col + 6, col + 7, col + 8, col + 9, col + 10 ) ; + + flag = atoi( col[1] ) ; + if ( p = strstr( line, "NH" ) ) + { + int k = 0 ; + p += 5 ; + while ( *p != ' ' && *p != '\t' && *p != '\0') + { + k = k * 10 + *p - '0' ; + ++p ; + } + /*if ( k >= 2 ) + { + secondary = true ; + } + else + secondary = false ;*/ + NH = k ; + } + else + { + //secondary = false ; + NH = 1 ; + } + if ( ( p = strstr( line, "NM" ) ) || ( p = strstr( line, "nM" ) ) ) + { + int k = 0 ; + p += 5 ; + while ( *p != ' ' && *p != '\t' && *p != '\0') + { + k = k * 10 + *p - '0' ; + ++p ; + } + editDistance = k ; + } + else + editDistance = 0 ; + + mateStart = atoi( col[7] ) ; + /*if ( flag & 0x100 ) + secondary = true ; + else + secondary = false ;*/ + + } + samFlag = flag ; + for ( i = 0 ; col[5][i] ; ++i ) + if ( col[5][i] == 'N' ) + break ; + + if ( !col[5][i] ) + continue ; + + // remove .1, .2 or /1, /2 suffix + if ( hasMateReadIdSuffix ) + { + char *s = col[0] ; + int len = strlen( s ) ; + if ( len >= 2 && ( s[len - 1] == '1' || s[len - 1] == '2' ) + && ( s[len - 2] == '.' || s[len - 2] == '/' ) ) + { + s[len - 2] = '\0' ; + } + } + + if ( useSam ) + { + if ( bam_aux_get( b, "XS" ) ) + { + strand = bam_aux2A( bam_aux_get( b, "XS" ) ) ; + if ( bam_aux_get( b, "YS" ) ) + { + noncanonStrandInfo = bam_aux2i( bam_aux_get( b, "YS" ) ) ; + } + else + { + noncanonStrandInfo = -1 ; + } + } + else + { + strand = '?' ; + noncanonStrandInfo = -1 ; + } + startLocation = b->core.pos + 1 ; + } + else + { + if ( strstr( line, "XS:A:-" ) ) // on negative strand + strand = '-' ; + else if ( strstr( line, "XS:A:+" ) ) + strand = '+' ; + else + { + strand = '?' ; + char *p = strstr( line, "YS:i:" ) ; + if ( p != NULL ) + { + p += 5 ; + noncanonStrandInfo = atoi( p ) ; + } + else + noncanonStrandInfo = -1 ; + } + startLocation = atoi( col[3] ) ; + } + + // Found the junctions from the read. + if ( strcmp( prevChrome, col[2] ) ) + { + if ( flagPrintJunction ) + { + // Print the remaining elements in the queue + i = qHead ; + while ( i != qTail ) + { + PrintJunction( prevChrome, junctionQueue[i] ) ; + ++i ; + if ( i >= QUEUE_SIZE ) + i = 0 ; + } + } + + // new chromosome + ClearReadTree( contradictedReads ) ; + contradictedReads = NULL ; + qHead = qTail = 0 ; + strcpy( prevChrome, col[2] ) ; + } + + if ( flagRemove ) + { + if ( CompareJunctions( startLocation, col[5] ) ) + { + // Test whether this read has new junctions + //++junctionCnt ; + if ( !flagPrintJunction ) + printf( "%s", line ) ; + } + } + else + { + ++junctionCnt ; + printf( "%s", line ) ; + } + //printf( "hi2 %s\n", col[0] ) ; + } + + if ( flagPrintJunction ) + { + // Print the remaining elements in the queue + i = qHead ; + while ( i != qTail ) + { + PrintJunction( prevChrome, junctionQueue[i] ) ; + ++i ; + if ( i >= QUEUE_SIZE ) + i = 0 ; + } + } + + //fprintf( stderr, "The number of junctions: %d\n", junctionCnt ) ; + return 0 ; +} +