%s:%d

# HG changeset patch # User lsong10 # Date 1616777565 0 # Node ID 903fc43d62270437bcc50fbbe310a2eec89f084b Uploaded diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/AddGeneName.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/AddGeneName.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,476 @@ +#include +#include +#include + +#include +#include +#include +#include + +char usage[] = "Usage: ./add-genename annotation.gtf gtflist [OPTIONS]\n" + "\t-o: the directory for output gtf files (default: ./)\n" ; + +struct _interval +{ + char chrom[97] ; + char geneName[97] ; + + int start, end ; +} ; + +char line[10000] ; +char buffer[10000], buffer2[10000], buffer3[10000] ; + +int CompInterval( const struct _interval &a, const struct _interval &b ) +{ + int chromCmp = strcmp( a.chrom, b.chrom ) ; + if ( chromCmp ) + return chromCmp ; + else if ( a.start != b.start ) + return a.start - b.start ; + else + return a.end - b.end ; +} + +bool Comp( const struct _interval &a, const struct _interval &b ) +{ + int cmp = CompInterval( a, b ) ; + if ( cmp < 0 ) + return true ; + else + return false ; + + return false ; +} + +void SortAndCleanIntervals( std::vector &it ) +{ + std::sort( it.begin(), it.end(), Comp ) ; + + int i, k ; + int size = it.size() ; + if ( size == 0 ) + return ; + + k = 1 ; + for ( i = 1 ; i < size ; ++i ) + { + if ( CompInterval( it[k - 1], it[i] ) ) + { + it[k] = it[i] ; + ++k ; + } + } + it.resize( k ) ; +} + +int GetGTFField( char *ret, char *line, const char *fid ) +{ + char *p = strstr( line, fid ) ; + if ( p == NULL ) + return 0 ; + //printf( "%s %s\n", line, fid ) ; + for ( ; *p != ' ' ; ++p ) + ; + p += 2 ; + + sscanf( p, "%s", ret ) ; + p = ret + strlen( ret ) ; + while ( p != ret && *p != '\"' ) + --p ; + *p = '\0' ; + return 1 ; +} + +void UpdateIdToAnnoId( std::vector &transcript, char *tid, int exonTag, int intronTag, std::vector &exons, std::vector &introns, + std::map &txptIdToAnnoId, std::map &geneIdToAnnoId ) +{ + int i, k ; 
+ bool flag = false ; + // First, try whether intron works. + int ecnt = transcript.size() ; + int intronSize = introns.size() ; + int exonSize = exons.size() ; + + struct _interval itron ; + strcpy( itron.chrom, transcript[0].chrom ) ; + k = intronTag ; + for ( i = 0 ; i < ecnt - 1 && !flag ; ++i ) + { + itron.start = transcript[i].end ; + itron.end = transcript[i + 1].start ; + for ( ; k < intronSize ; ++k ) + { + //printf( "hi %d: %s %d %d; %d %s %d %d\n", 0, itron.chrom, itron.start, itron.end, k, introns[k].chrom, introns[k].start, introns[k].end ) ; + if ( strcmp( introns[k].chrom, itron.chrom ) ) + break ; + + int cmp = CompInterval( introns[k], itron ) ; + if ( cmp == 0 ) + { + txptIdToAnnoId[ std::string( tid ) ] = introns[k].geneName ; + geneIdToAnnoId[ std::string( transcript[0].geneName ) ] = introns[k].geneName ; + flag = true ; + break ; + } + else if ( cmp > 0 ) + break ; + } + } + + // Next, try whether exon works + k = exonTag ; + for ( i = 0 ; i < ecnt && !flag ; ++i ) + { + for ( ; k < exonSize ; ++k ) + { + if ( strcmp( exons[k].chrom, transcript[i].chrom ) ) + break ; + + if ( exons[k].end < transcript[i].start ) + continue ; + else if ( exons[k].start > transcript[i].end ) + break ; + else + { + txptIdToAnnoId[ std::string( tid ) ] = exons[k].geneName ; + geneIdToAnnoId[ std::string( transcript[0].geneName ) ] = exons[k].geneName ; + flag = true ; + break ; + } + } + } +} + +int main( int argc, char *argv[] ) +{ + int i, j, k ; + FILE *fp ; + FILE *fpList ; + char outputPath[1024] = "./" ; + char prevTid[97] = "" ; + + std::vector introns, exons ; + std::map txptIdToAnnoId, geneIdToAnnoId ; + std::map chromIdMapIntrons, chromIdMapExons ; // the index for the starting of a chrom. 
+ + if ( argc < 3 ) + { + fprintf( stderr, "%s", usage ) ; + exit( 1 ) ; + } + + for ( i = 3 ; i < argc ; ++i ) + { + if ( !strcmp( argv[i], "-o" ) ) + { + strcpy( outputPath, argv[i + 1 ] ) ; + ++i ; + } + else + { + fprintf( stderr, "Unknown argument: %s\n", argv[i] ) ; + exit( 1 ) ; + } + } + + // Get the exons, introns from the annotation. + fp = fopen( argv[1], "r" ) ; + while ( fgets( line, sizeof( line ), fp ) != NULL ) + { + if ( line[0] == '#' ) + continue ; + char tid[97] ; + char gname[97] ; + char chrom[97] ; + char type[50] ; + int start, end ; + + sscanf( line, "%s %s %s %d %d", chrom, buffer, type, &start, &end ) ; + if ( strcmp( type, "exon" ) ) + continue ; + if ( GetGTFField( gname, line, "gene_name" ) ) + { + struct _interval ne ; + strcpy( ne.chrom, chrom ) ; + strcpy( ne.geneName, gname ) ; + ne.start = start ; ne.end = end ; + + exons.push_back( ne ) ; + } + else + { + fprintf( stderr, "%s has no field of gene_name.\n", line ) ; + exit( 1 ) ; + } + + if ( GetGTFField( tid, line, "transcript_id" ) ) + { + if ( !strcmp( tid, prevTid ) ) + { + struct _interval ni ; + + strcpy( ni.chrom, chrom ) ; + strcpy( ni.geneName, gname ) ; + + int size = exons.size() ; + if ( size < 2 ) + continue ; + + struct _interval &e1 = exons[ size - 2 ] ; + struct _interval &e2 = exons[ size - 1 ] ; + if ( e1.start < e2.start ) + { + ni.start = e1.end ; + ni.end = e2.start ; + } + else + { + ni.start = e2.end ; + ni.end = e1.start ; + } + + introns.push_back( ni ) ; + } + else + strcpy( prevTid, tid ) ; + } + else + { + fprintf( stderr, "%s has no field of transcript_id.\n", line ) ; + exit( 1 ) ; + } + } + fclose( fp ) ; + SortAndCleanIntervals( exons ) ; + SortAndCleanIntervals( introns ) ; + int exonSize = exons.size() ; + int intronSize = introns.size() ; + // Determine the offset for each chrom on the list of features. 
+ if ( exonSize ) + { + chromIdMapExons[ std::string( exons[0].chrom ) ] = 0 ; + for ( i = 1 ; i < exonSize ; ++i ) + { + if ( strcmp( exons[i].chrom, exons[i - 1].chrom ) ) + chromIdMapExons[ std::string( exons[i].chrom ) ] = i ; + } + } + + if ( intronSize ) + { + chromIdMapIntrons[ std::string( introns[0].chrom ) ] = 0 ; + for ( i = 1 ; i < intronSize ; ++i ) + { + if ( strcmp( introns[i].chrom, introns[i - 1].chrom ) ) + { + //printf( "%s %d\n", introns[i].chrom, i ) ; + chromIdMapIntrons[ std::string( introns[i].chrom ) ] = i ; + } + } + + //for ( i = 0 ; i < intronSize ; ++i ) + // printf( "%s\t%d\t%d\n", introns[i].chrom, introns[i].start, introns[i].end ) ; + } + //printf( "%d %d\n", exonSize, intronSize ) ; + + // Go through all the GTF files to find the map between PsiCLASS's gene_id to annotation's gene name + fpList = fopen( argv[2], "r ") ; + std::vector transcript ; + int exonTag = 0 ; + int intronTag = 0 ; + + while ( fscanf( fpList, "%s", line ) != EOF ) + { + // Set the output file + //printf( "hi: %s\n", line ) ; + char *p ; + p = line + strlen( line ) ; + while ( p != line && *p != '/' ) + { + if ( *p == '\n' ) + *p = '\0' ; + --p ; + } + sprintf( buffer, "%s/%s", outputPath, p ) ; + + // Test whether this will overwrite the input gtf file. + if ( realpath( buffer, buffer2 ) != NULL ) + { + if ( realpath( line, buffer3 ) && !strcmp( buffer2, buffer3 ) ) + { + fprintf( stderr, "Output will overwrite the input files. Please use -o to specify a different output directory.\n" ) ; + exit( 1 ) ; + } + } + + fp = fopen( line, "r" ) ; + transcript.resize( 0 ) ; // hold the exons in the transcript. 
+ prevTid[0] = '\0' ; + exonTag = 0 ; + intronTag = 0 ; + int farthest = 0 ; + + while ( fgets( line, sizeof( line ), fp ) ) + { + char tid[97] ; + //char gname[97] ; + char chrom[97] ; + char type[50] ; + int start, end ; + + if ( line[0] == '#' ) + continue ; + + sscanf( line, "%s %s %s %d %d", chrom, buffer, type, &start, &end ) ; + if ( strcmp( type, "exon" ) ) + continue ; + + if ( GetGTFField( tid, line, "transcript_id" ) ) + { + struct _interval ne ; + strcpy( ne.chrom, chrom ) ; + ne.start = start ; + ne.end = end ; + GetGTFField( ne.geneName, line, "gene_id" ) ; + + if ( !strcmp( tid, prevTid ) ) + { + transcript.push_back( ne ) ; + if ( end > farthest ) + farthest = end ; + } + else if ( transcript.size() > 0 ) + { + // the non-existed chrom will be put to offset 0, and that's fine + if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) ) + intronTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ; + if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) ) + exonTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ; + + UpdateIdToAnnoId( transcript, prevTid, exonTag, intronTag, exons, introns, txptIdToAnnoId, geneIdToAnnoId ) ; + + // Adjust the offset if we are done with a gene cluster + // We don't need to worry about the case of changing chrom here. + if ( !strcmp( ne.geneName, transcript[0].geneName ) && + start > farthest ) // Use farthest to avoid inteleaved gene. + { + while ( intronTag < intronSize && !strcmp( introns[intronTag ].chrom, ne.chrom ) + && introns[intronTag].end < ne.start ) + ++intronTag ; + + while ( exonTag < exonSize && !strcmp( exons[ exonTag ].chrom, ne.chrom ) + && exons[ exonTag ].end < ne.start ) + ++exonTag ; + + farthest = end ; + } + + transcript.resize( 0 ) ; + transcript.push_back( ne ) ; + + // Find the overlaps. 
+ strcpy( prevTid, tid ) ; + } + else + strcpy( prevTid, tid ) ; + } + else + { + fprintf( stderr, "Could not find transcript_id field in GTF file: %s", line ) ; + exit( 1 ) ; + } + } + + if ( transcript.size() > 0 ) + { + if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) ) + intronTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ; + if ( strcmp( transcript[0].chrom, introns[intronTag].chrom ) ) + exonTag = chromIdMapIntrons[ std::string( transcript[0].chrom ) ] ; + + UpdateIdToAnnoId( transcript, prevTid, exonTag, intronTag, exons, introns, txptIdToAnnoId, geneIdToAnnoId ) ; + } + fclose( fp ) ; + } + fclose( fpList ) ; + + // Add the gene_name field. + fpList = fopen( argv[2], "r" ) ; + int novelCnt = 0 ; + while ( fscanf( fpList, "%s", line ) != EOF ) + { + FILE *fpOut ; + // Set the output file + char *p ; + p = line + strlen( line ) ; + while ( p != line && *p != '/' ) + { + if ( *p == '\n' ) + *p = '\0' ; + --p ; + } + sprintf( buffer, "%s/%s", outputPath, p ) ; + fpOut = fopen( buffer, "w" ) ; + + // Process the input file + fp = fopen( line, "r" ) ; + + transcript.resize( 0 ) ; + prevTid[0] = '\0' ; + while ( fgets( line, sizeof( line ), fp ) ) + { + char gname[97] ; + if ( line[0] == '#' ) + { + fprintf( fpOut, "%s", line ) ; + continue ; + } + + if ( GetGTFField( buffer, line, "transcript_id" ) ) + { + if ( txptIdToAnnoId.find( std::string( buffer ) ) != txptIdToAnnoId.end() ) + { + int len = strlen( line ) ; + if ( line[len - 1] == '\n' ) + { + line[len - 1] = '\0' ; + --len ; + } + fprintf( fpOut, "%s gene_name \"%s\";\n", line, txptIdToAnnoId[std::string( buffer )].c_str() ) ; + continue ; + } + } + + if ( GetGTFField( gname, line, "gene_id" ) ) + { + int len = strlen( line ) ; + if ( line[len - 1] == '\n' ) + { + line[len - 1] = '\0' ; + --len ; + } + if ( geneIdToAnnoId.find( std::string( gname ) ) != geneIdToAnnoId.end() ) + { + fprintf( fpOut, "%s gene_name \"%s\";\n", line, geneIdToAnnoId[std::string(gname)].c_str() ) ; + } 
+ else + { + sprintf( buffer, "novel_%d", novelCnt ) ; + geneIdToAnnoId[ std::string( gname ) ] = std::string( buffer ) ; + fprintf( fpOut, "%s gene_name \"%s\";\n", line, buffer ) ; + ++novelCnt ; + } + + } + else + fprintf( fpOut, "%s", line ) ; + + } + + fclose( fp ) ; + fclose( fpOut ) ; + } + + return 0 ; +} diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/AddXS.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/AddXS.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,583 @@ +#include +#include + +#include +#include +#include +#include +#include + +char usage[] = + "./addXS ref.fa [OPTIONS] < in.sam > out.sam\n" + "From bam to bam: samtools view -h in.bam | ./addXS ref.fa | samtools view -bS - > out.bam\n" ; +char samLine[65537] ; +char seq[65537] ; + +char nucToNum[26] = { 0, -1, 1, -1, -1, -1, 2, + -1, -1, -1, -1, -1, -1, 0, // Regard 'N' as 'A' + -1, -1, -1, -1, -1, 3, + -1, -1, -1, -1, -1, -1 } ; +char numToNuc[26] = {'A', 'C', 'G', 'T'} ; + +struct _pair +{ + int a, b ; +} ; + +class BitSequence +{ +private: + int len ; + //std::vector sequence ; + uint32_t *sequence ; + +public: + BitSequence() { len = 0 ; sequence = NULL ;} + BitSequence( int l ) + { + len = 0 ; + sequence = new uint32_t[ l / 16 + 1 ] ; + } + + ~BitSequence() + { + //if ( sequence != NULL ) + // delete[] sequence ; + } + + void SetLength( int l ) + { + len = 0 ; + if ( sequence != NULL ) + delete[] sequence ; + sequence = new uint32_t[ l / 16 + 1 ] ; + } + + int GetLength() + { + return len ; + } + + void Append( char c ) + { + if ( ( len & 15 ) == 0 ) + { + sequence[ len / 16 ] = 0 ; + } + ++len ; + //printf( "%d %c\n", len, c ) ; + Set( c, len - 1 ) ; + } + + // pos is 0-based coordinate + // notice that the order within one 32 bit butcket is reversed + void Set( char c, int pos ) + { + if ( pos >= len ) + return ; + + if ( c >= 'a' && c <= 'z' ) + { + c = c - 'a' + 'A' ; + } + if ( c == 'N' ) + c = 'A' ; + + int ind = pos >> 4 ; + int offset = pos & 15 ; + int mask = 
( (int)( nucToNum[c - 'A'] & 3 ) ) << ( 2 * offset ) ; + sequence[ind] = sequence[ind] | mask ; + //if ( c != 'A' ) + // printf( "%d: %c %c %d %d : %d\n", pos, c, Get(pos), ind, offset, mask ) ; + //Print() ; + } + + char Get( int pos ) + { + if ( pos >= len ) + return 'N' ; + + int ind = pos >> 4 ; + int offset = pos & 15 ; + //printf( "%d: %d\n", pos, sequence[ind] ) ; + return numToNuc[ ( ( sequence[ind] >> ( 2 * offset ) ) & 3 ) ] ; + } + + void Print() + { + int i ; + for ( i = 0 ; i < len ; ++i ) + printf( "%c", Get( i ) ) ; + printf( "\n" ) ; + } + + void Print( FILE *fp, int start, int end, bool rc ) + { + if ( !rc ) + { + for ( int i = start ; i <= end ; ++i ) + fprintf( fp, "%c", Get( i ) ) ; + } + else + { + for ( int i = end ; i >= start ; --i ) + { + char c = Get( i ) ; + if ( c == 'A' ) + c = 'T' ; + else if ( c == 'C' ) + c = 'G' ; + else if ( c == 'G' ) + c = 'C' ; + else //if ( c == 'T' ) + c = 'A' ; + fprintf( fp, "%c", c ) ; + } + } + } +} ; + +void ReverseComplement( char *s, int l ) +{ + int u, v ; + for ( u = 0, v = l - 1 ; u <= v ; ++u, --v ) + { + char tmp ; + tmp = s[v] ; + s[v] = s[u] ; + s[u] = tmp ; + + s[u] = numToNuc[ 3 - nucToNum[ s[u] - 'A' ] ] ; + s[v] = numToNuc[ 3 - nucToNum[ s[v] - 'A' ] ] ; + } +} + + +int main( int argc, char *argv[] ) +{ + int i, j, k ; + + std::vector genome ; + std::map chrToChrId ; + + char readid[200], chrom[50], mapq[10], cigar[1000], mateChrom[50] ; + char buffer[1024] ; + char pattern[1024] ; + int start, mstart, flag, tlen ; // read start and mate read start + int seqLen ; + int chrCnt = 0 ; + int chrId ; + int width = 7 ; + int score ; // 0-bit: multiple aligned, 1-bit can shift the intron, 2-bit can shift the alignment, 3-bit contain sequencing error + + if ( argc < 2 ) + { + printf( "%s", usage ) ; + return 0 ; + } + + + std::ifstream fpRef ; + fpRef.open( argv[1] ) ; + std::string line ; + + int motifStrand[1024] ; + + /*motifStrand[ 0b10110010 ] = 1 ; // GT/AG + motifStrand[ 0b01110001 ] = 0 ; // 
CT/AC + motifStrand[ 0b10010010 ] = 1 ;// GC/AG + motifStrand[ 0b01111001 ] = 0 ; // CT/GC + motifStrand[ 0b00110001 ] = 1 ; // AT/AC + motifStrand[ 0b10110011 ] = 0 ; // GT/AT*/ + memset( motifStrand, -1, sizeof( motifStrand )) ; + motifStrand[ 0xb2 ] = 1 ; // GT/AG + motifStrand[ 0x71 ] = 0 ; // CT/AC + motifStrand[ 0x92 ] = 1 ;// GC/AG + motifStrand[ 0x79 ] = 0 ; // CT/GC + motifStrand[ 0x31 ] = 1 ; // AT/AC + motifStrand[ 0xb3 ] = 0 ; // GT/AT + + k = 0 ; + while ( getline( fpRef, line ) ) + { + //printf( "%s\n", line.c_str() ) ; + int len = line.length() ; + if ( line[0] == '>' ) + { + //char *s = strdup( line.c_str() + 1 ) ; + if ( chrCnt > 0 ) + { + genome[ chrCnt - 1 ].SetLength( k ) ; + } + for ( i = 1 ; i < len ; ++i ) + if ( line[i] == ' ' || line[i] == '\t' ) + break ; + chrToChrId[ line.substr( 1, i - 1 ) ] = chrCnt ; + ++chrCnt ; + + BitSequence bs ; + genome.push_back( bs ) ; + + k = 0 ; + } + else + { + k += len ; + } + } + genome[ chrCnt - 1 ].SetLength( k ) ; + fpRef.close() ; + + fpRef.open( argv[1] ) ; + while ( getline( fpRef, line ) ) + { + //printf( "%s\n", line.c_str() ) ; + int len = line.length() ; + if ( line[0] == '>' ) + { + //char *s = strdup( line.c_str() + 1 ) ; + for ( i = 1 ; i < len ; ++i ) + if ( line[i] == ' ' || line[i] == '\t' ) + break ; + chrId = chrToChrId[ line.substr( 1, i - 1 ) ] ; + } + else + { + BitSequence &bs = genome[chrId] ; + for ( i = 0 ; i < len ; ++i ) + { + if ( ( line[i] >= 'A' && line[i] <= 'Z' ) || + ( line[i] >= 'a' && line[i] <= 'z' ) ) + bs.Append( line[i] ) ; + } + } + } + fpRef.close() ; + + + + while ( fgets( samLine, sizeof( samLine ), stdin ) != NULL ) + { + if ( samLine[0] == '@' ) + { + printf( "%s", samLine ) ; + continue ; + } + + sscanf( samLine, "%s %d %s %d %s %s %s %d %d %s", readid, &flag, chrom, &start, mapq, cigar, mateChrom, &mstart, &tlen, seq ) ; + struct _pair segments[1000] ; + struct _pair seqSegments[1000] ; // the segments with respect to the sequence. 
+ int segCnt = 0 ; + int seqSegCnt = 0 ; + int num = 0 ; + int len = 0 ; + + score = 0 ; + + for ( i = 0 ; cigar[i] ; ++i ) + { + if ( cigar[i] >= '0' && cigar[i] <= '9' ) + num = num * 10 + cigar[i] - '0' ; + else if ( cigar[i] == 'I' || cigar[i] == 'S' || cigar[i] == 'H' + || cigar[i] == 'P' ) + { + num = 0 ; + } + else if ( cigar[i] == 'N' ) + { + segments[segCnt].a = start ; + segments[segCnt].b = start + len - 1 ; + ++segCnt ; + + start = start + len + num ; + len = 0 ; + num = 0 ; + } + else + { + len += num ; + num = 0 ; + } + } + + + if ( len > 0 ) + { + segments[segCnt].a = start ; + segments[segCnt].b = start + len - 1 ; + ++segCnt ; + } + + if ( segCnt <= 1 || strstr( samLine, "XS:A:" ) != NULL ) + { + printf( "%s", samLine ) ; + continue ; + } + + std::string schr( chrom ) ; + int chrId = chrToChrId[schr] ; + int strand = -1 ; + + len = strlen( samLine ) ; + samLine[len - 1] = '\0' ; + int motif = 0 ; + + BitSequence &chrSeq = genome[ chrId ] ; + + for ( i = 0 ; i < segCnt - 1 ; ++i ) + { + char m[4] ; + m[0] = chrSeq.Get( segments[i].b + 1 - 1 ) ; + m[1] = chrSeq.Get( segments[i].b + 2 - 1 ) ; + m[2] = chrSeq.Get( segments[i + 1].a - 2 - 1 ) ; + m[3] = chrSeq.Get( segments[i + 1].a - 1 - 1 ) ; + motif = 0 ; + for ( j = 0 ; j < 4 ; ++j ) + motif = ( motif << 2 ) + ( nucToNum[ m[j] - 'A' ] & 3 ); + strand = motifStrand[ motif ] ; + if ( strand != -1 ) + break ; + } + if ( strand == 1 ) + { + printf( "%s\tXS:A:+\n", samLine ) ; + continue ; + } + else if ( strand == 0 ) + { + printf( "%s\tXS:A:-\n", samLine ) ; + continue ; + } + + char *p ; + if ( ( p = strstr( samLine, "NH:i:" ) ) != NULL ) + { + p += 5 ; + num = 0 ; + while ( *p >= '0' && *p <= '9' ) + { + num = num * 10 + *p - '0' ; + ++p ; + } + if ( num > 1 ) + score |= 1 ; + } + + seqLen = strlen( seq ) ; + for ( i = 0 ; i < seqLen ; ++i ) + if ( seq[i] == 'N' ) + { + score |= 1 ; + break ; + } + + num = 0 ; + len = 0 ; + seqSegCnt = 0 ; + start = 0 ; + for ( i = 0 ; cigar[i] ; ++i ) + { + if ( 
cigar[i] >= '0' && cigar[i] <= '9' ) + num = num * 10 + cigar[i] - '0' ; + else if ( cigar[i] == 'D' || cigar[i] == 'S' || cigar[i] == 'H' + || cigar[i] == 'P' ) + { + num = 0 ; + } + else if ( cigar[i] == 'N' ) + { + seqSegments[seqSegCnt].a = start ; + seqSegments[seqSegCnt].b = start + len - 1 ; + ++seqSegCnt ; + + start = start + len ; + len = 0 ; + num = 0 ; + } + else + { + len += num ; + num = 0 ; + } + } + if ( len > 0 ) + { + seqSegments[seqSegCnt].a = start ; + seqSegments[seqSegCnt].b = start + len - 1 ; + ++seqSegCnt ; + } + + // Check whether we can shift the intron to a canonical splice site + for ( i = 0 ; i < segCnt - 1 ; ++i ) + { + for ( j = -width ; j < width ; ++j ) + { + if ( segments[i].b + j < segments[i].a || segments[i+1].a + j > segments[i+1].b ) + continue ; + // Look for the splice signal. + char m[4] ; + m[0] = chrSeq.Get( segments[i].b + j + 1 - 1 ) ; + m[1] = chrSeq.Get( segments[i].b + j + 2 - 1 ) ; + m[2] = chrSeq.Get( segments[i + 1].a + j - 2 - 1 ) ; + m[3] = chrSeq.Get( segments[i + 1].a + j - 1 - 1 ) ; + motif = 0 ; + for ( k = 0 ; k < 4 ; ++k ) + motif = ( motif << 2 ) + ( nucToNum[ m[k] - 'A' ] & 3 ); + strand = motifStrand[ motif ] ; + if ( strand == -1 ) + continue ; + //printf( "%d %d: %d\n", i, j, motif ) ; + + // We found a signal, then test whether we can shift the intron. + // Extract the sequence from the reference genome. + int l = 0 ; + if ( j < 0 ) + for ( k = segments[i + 1].a + j, l = 0 ; k <= segments[i + 1].a - 1 ; ++k, ++l ) + buffer[l] = chrSeq.Get( k - 1 ) ; + else + for ( k = segments[i].b + 1, l= 0 ; k <= segments[i].b + j ; ++k, ++l ) + buffer[l] = chrSeq.Get(k - 1) ; + buffer[l] = '\0' ; + + //printf( "%s\n", buffer ) ; + + // Get the coordinate on the sequence. + int tag ; + int useBegin = 0 ; // is the portion from the beginning part of a seqSegment? 
+ int from, to ; + if ( j < 0 ) + { + tag = i ; + useBegin = 0 ; + } + else // j>0 + { + tag = i + 1 ; + useBegin = 1 ; + } + //printf( "%d %d\n", tag, useBegin ) ; + + if ( ( flag & 0x10 ) != 0 ) + { + //TODO: it seems star already fliped the sequence. + // is it true for other aligners? + + //tag = segCnt - 1 - tag ; + //useBegin = 1 - useBegin ; + + + //ReverseComplement( buffer, l ) ; + } + + if ( useBegin ) + { + from = seqSegments[tag].a ; + to = seqSegments[tag].a + l - 1 ; + } + else + { + from = seqSegments[tag].b - l + 1 ; + to = seqSegments[tag].b ; + } + //printf( "%d %d %d\n", tag, from, to ) ; + + for ( k = 0 ; k < l ; ++k ) + { + if ( seq[from + k] != buffer[k] ) + break ; + } + + + if ( k >= l ) + { + // We can shift the intron + score |= 2 ; + break ; + } + } + if ( j < width ) + break ; + } + + // Check whether the sequence is low-complexity. To do this, we test whether we can shift the seqSegments and directly inspect the sequence. + for ( i = 0 ; i < seqSegCnt ; ++i ) + { + int l ; + for ( k = 0, j = seqSegments[i].a - width ; j <= seqSegments[i].b + width ; ++j, ++k ) + buffer[k] = chrSeq.Get( j ) ; + buffer[k] = '0' ; + + for ( l = 0, j = seqSegments[i].a ; j <= seqSegments[i].b ; ++j, ++l ) + { + pattern[l] = seq[j] ; + } + pattern[l] = '\0' ; + + // It seems the aligner already flipped the read in its output? 
+ //if ( flag & 0x10 != 0 ) + // ReverseComplement( pattern, l ) ; + + + char *p = buffer ; + int cnt = 0 ; + while ( ( p = strstr( p, pattern ) ) != NULL ) + { + ++cnt ; + ++p ; + } + if ( cnt > 1 ) + { + score |= 4 ; + break ; + } + + int count[5] = { 0, 0, 0, 0, 0 } ; + int max = 0 ; + for ( j = 0 ; j < l ; ++j ) + { + ++count[ nucToNum[ pattern[j] - 'A' ] ] ; + } + + for ( j = 0 ; j < 5 ; ++j ) + { + if ( count[j] > max ) + max = count[j] ; + } + if ( max > 0.8 * l ) + { + score |= 4 ; + break ; + } + } + + if ( ( p = strstr( samLine, "NM:i:" ) ) != NULL || ( p = strstr( samLine, "nM:i:" ) ) != NULL ) + { + int nm = atoi( p + 5 ) ; + if ( nm > 0 ) + { + score |= 8 ; + } + } + else + { + // TODO: check the nm by ourself + } + + // Check whether the alignment is concordant. + if ( ( flag & 0x1 ) != 0 && ( flag & 0x4 ) == 0 ) + { + start = segments[0].a ; + if ( strcmp( mateChrom, "=" ) + || ( flag & 0x10 ) == ( flag & 0x20 ) + || ( ( flag & 0x10 ) == 0 && ( mstart < start || mstart > start + 2000000 ) ) + || ( ( flag & 0x10 ) != 0 && ( mstart > start || mstart < start - 2000000 ) ) ) + { + score |= 16 ; + } + } + + printf( "%s\tXS:A:?\tYS:i:%d\n", samLine, score ) ; + } + + return 0 ; +} diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/BitTable.hpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/BitTable.hpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,471 @@ +/** + Use an 64bit integer to represent bit table. + + Li Song + July 24, 2012 + + - Modified again from May 27, 2017. +*/ + + +#ifndef _BIT_TABLE_HEADER +#define _BIT_TABLE_HEADER + +#include + +typedef unsigned long long int UINT64 ; +#define UNIT_SIZE (sizeof( UINT64 ) * 8 ) +#define UNIT_MASK ((UINT64)63) + +class BitTable +{ +private: + UINT64 *tab ; // The bits + int size ; // The size of the table + int asize ; // The size of the array (size/64). +public: + BitTable() // Intialize a bit table. 
+ { + //tab = new UINT64[1] ; + //size = UNIT_SIZE ; + tab = NULL ; + size = 0 ; + } + + BitTable( int s ) // Initalize a bit table with size s. + { + if ( s == 0 ) + s = UNIT_SIZE ; + + if ( s & UNIT_MASK ) + { + asize = s / UNIT_SIZE + 1 ; + tab = new UINT64[ asize ] ; + } + else + { + asize = s / UNIT_SIZE ; + tab = new UINT64[ asize ] ; + } + + size = s ; + Reset() ; + } + + ~BitTable() + { + //printf( "hi %d\n", size ) ; + //printf( "%s\n", __func__ ) ; + //if ( tab != NULL ) + // delete [] tab ; + //size = -1 ; + + // Manually release the memory. + } + + void Init( int s ) // Initialize a bit table with size s. + { + if ( tab != NULL ) + delete [] tab ; + + if ( s == 0 ) + s = UNIT_SIZE ; + + if ( s & UNIT_MASK ) + { + asize = s / UNIT_SIZE + 1 ; + tab = new UINT64[ asize ] ; + } + else + { + asize = s / UNIT_SIZE ; + tab = new UINT64[ asize ] ; + } + + size = s ; + Reset() ; + } + + void Release() + { + if ( tab != NULL ) + delete[] tab ; + tab = NULL ; + } + + void Reset() // Make every value 0. + { + for ( int i = 0 ; i < asize ; ++i ) + tab[i] = 0 ; + } + + void Set( int i ) // Set the ith bit. + { + int ind, offset ; + ind = i / UNIT_SIZE ; + offset = i & UNIT_MASK ; + //printf( "%d,%d %d,%d: %llx", 311 / UNIT_SIZE, 311 & UNIT_MASK, 279 / UNIT_SIZE, 279 & UNIT_MASK, tab[ind] ) ; + tab[ind] |= ( (UINT64)1 << offset ) ; + //printf( " %llx %d\n", tab[ind], UNIT_SIZE ) ; + } + + void Unset( int i ) // Unset the ith bit + { + int ind, offset ; + ind = i / UNIT_SIZE ; + offset = i & UNIT_MASK ; + + tab[ind] &= ( (UINT64)-1 ^ ( (UINT64)1 << offset ) ) ; + } + + void Flip( int i ) // Flip the ith bit. Same as xor 1. + { + int ind, offset ; + ind = i / UNIT_SIZE ; + offset = i & UNIT_MASK ; + + tab[ind] ^= ( (UINT64)1 << offset ) ; + } + + bool Test( unsigned int i ) const // Test the ith bit. 
+ { + if ( i >= (unsigned int)size ) + return false ; + + unsigned int ind, offset ; + ind = i / UNIT_SIZE ; + offset = i & UNIT_MASK ; + + if ( tab[ind] & ( (UINT64)1 << offset ) ) + return true ; + else + return false ; + } + + void Not() // Not all the bits. + { + int i ; + for ( i = 0 ; i < asize ; ++i ) + { + tab[i] = ~tab[i] ; + } + } + + void And( const BitTable &in ) // Do the "and" on each bits. + { + if ( asize != in.asize ) + return ; + int i ; + for ( i = 0 ; i < asize ; ++i ) + tab[i] &= in.tab[i] ; + } + + void Or( const BitTable &in ) // Do the "or" on each bits. + { + if ( asize != in.asize ) + return ; + int i ; + for ( i = 0 ; i < asize ; ++i ) + tab[i] |= in.tab[i] ; + } + + void Xor( const BitTable &in ) + { + if ( asize != in.asize ) + return ; + int i ; + for ( i = 0 ; i < asize ; ++i ) + tab[i] ^= in.tab[i] ; + } + + // Unset all the bits outside [s,e] + void MaskRegionOutside( unsigned int s, unsigned int e ) + { + int i ; + // mask out [0, s-1]. + int ind, offset ; + if ( s > 0 ) + { + ind = (s - 1) / UNIT_SIZE ; + offset = ( s - 1 ) & UNIT_MASK ; + for ( i = 0 ; i <= ind - 1 ; ++i ) + tab[i] = 0 ; + if ( offset + 1 >= 64 ) + tab[i] = 0 ; + else + tab[i] = ( tab[i] >> (UINT64)( offset + 1 ) ) << (UINT64)( offset + 1 ) ; + } + + // mask out [e+1, size-1] + if ( e < ( (unsigned int)size - 1 ) ) + { + ind = ( e + 1 ) / UNIT_SIZE ; + offset = ( e + 1 ) & UNIT_MASK ; + for ( i = ind + 1 ; i < asize ; ++i ) + tab[i] = 0 ; + + if ( UNIT_SIZE - offset >= 64 ) + tab[ind] = 0 ; + else + tab[ind] = ( tab[ind] << (UINT64)( UNIT_SIZE - offset ) ) >> (UINT64)( UNIT_SIZE - offset ) ; + } + } + + // Given further information about the position of first and last 1's + void MaskRegionOutsideInRange( int s, int e, int first, int last ) + { + int i ; + int start, to ; + start = first / UNIT_SIZE ; + to = last / UNIT_SIZE ; + // mask out [0, s-1]. 
+ int ind, offset ; + if ( s > 0 ) + { + ind = (s - 1) / UNIT_SIZE ; + offset = ( s - 1 ) & UNIT_MASK ; + for ( i = start ; i <= ind - 1 ; ++i ) + tab[i] = 0 ; + if ( offset + 1 >= 64 ) + tab[i] = 0 ; + else + tab[i] = ( tab[i] >> (UINT64)( offset + 1 ) ) << (UINT64)( offset + 1 ) ; + } + + // mask out [e+1, size-1] + if ( e < size - 1 ) + { + ind = ( e + 1 ) / UNIT_SIZE ; + offset = ( e + 1 ) & UNIT_MASK ; + for ( i = ind + 1 ; i <= to ; ++i ) + tab[i] = 0 ; + + if ( UNIT_SIZE - offset >= 64 ) + tab[ind] = 0 ; + else + tab[ind] = ( tab[ind] << (UINT64)( UNIT_SIZE - offset ) ) >> (UINT64)( UNIT_SIZE - offset ) ; + } + } + void ShiftOneLeft() + { + int i ; + UINT64 carry = 0 ; + UINT64 lastTabMask ; + if ( ( size & UNIT_MASK ) == 0 ) + lastTabMask = (UINT64)(-1) ; + else + lastTabMask = ( 1 << ( size & UNIT_MASK ) ) - 1 ; + + for ( i = 0 ; i < asize - 1 ; ++i ) + { + UINT64 tmp = ( tab[i] >> ( UNIT_SIZE - 1 ) ) & 1 ; + tab[i] = ( tab[i] << 1 ) | carry ; + carry = tmp ; + } + tab[i] = ( ( tab[i] << 1 ) | carry ) & lastTabMask ; + } + + void ShiftOneRight() + { + int i ; + UINT64 carry = ( tab[ asize - 1 ] & 1 ) ; + tab[ asize - 1 ] >>= 1 ; + for ( i = asize - 2 ; i >= 0 ; --i ) + { + UINT64 tmp = ( tab[i] & 1 ) ; + tab[i] = ( tab[i] >> 1 ) | ( carry << ( UNIT_SIZE - 1 ) ) ; + carry = tmp ; + } + } + + bool IsAllZero() + { + int i ; + for ( i = 0 ; i < asize ; ++i ) + if ( tab[i] != 0 ) + return false ; + return true ; + } + + int Count() const // Count how many 1. 
+ { + if ( size <= 0 ) + return 0 ; + UINT64 k ; + int i, ret = 0 ; + for ( i = 0 ; i < asize - 1 ; ++i ) + { + k = tab[i] ; + while ( k ) + //for ( j = 0 ; j < UNIT_SIZE ; ++j ) + { + if ( k & 1 ) + ++ret ; + //printf( "### %d %d %d %d\n", ret, asize, size, k ) ; + k /= 2 ; + } + } + //printf( "(%d) ", ret ) ; + k = tab[ asize - 1 ] ; + for ( i = 0 ; i <= (int)( ( size - 1 ) & UNIT_MASK ) ; ++i ) + { + if ( k & 1 ) + ++ret ; + k /= 2 ; + } + //for ( i = 0 ; i < asize ; ++i ) + // printf( "%llu ", tab[i] ) ; + //printf( "(%d)\n", ret ) ; + return ret ; + } + + void GetOnesIndices( std::vector &indices ) + { + if ( size <= 0 ) + return ; + UINT64 k ; + int i ; + int ind = 0 ; + for ( i = 0 ; i < asize - 1 ; ++i ) + { + k = tab[i] ; + ind = i * UNIT_SIZE ; + while ( k ) + //for ( j = 0 ; j < UNIT_SIZE ; ++j ) + { + if ( k & 1 ) + indices.push_back( ind ) ; + //printf( "### %d %d %d %d\n", ret, asize, size, k ) ; + k /= 2 ; + ++ind ; + } + } + //printf( "(%d) ", ret ) ; + k = tab[ asize - 1 ] ; + ind = i * UNIT_SIZE ; + for ( i = 0 ; i <= (int)( ( size - 1 ) & UNIT_MASK ) ; ++i ) + { + if ( k & 1ull ) + indices.push_back( ind ) ; + k /= 2ull ; + ++ind ; + } + } + + bool IsEqual( const BitTable &in ) const// Test wether two bit tables equal. + { + int i ; + if ( in.size != size ) + return false ; + //printf( "== %d %d\n", in.size, size ) ; + //if ( size & UNIT_MASK ) + // k = size / UNIT_SIZE + 1 ; + //else + // k = size / UNIT_SIZE ; + + for ( i = 0 ; i < asize ; ++i ) + if ( tab[i] != in.tab[i] ) + return false ; + return true ; + } + + // Return the location of the first difference. -1 if the same. + int GetFirstDifference( const BitTable &in ) const + { + int as = asize < in.asize ? asize : in.asize ; + //int s = size < in.size ? 
size : in.size ; + int i, j ; + + for ( i = 0 ; i < as ; ++i ) + { + if ( tab[i] != in.tab[i] ) + { + UINT64 k1 = tab[i] ; + UINT64 k2 = in.tab[i] ; + for ( j = 0 ; j < (int)UNIT_SIZE ; ++j ) + { + if ( ( k1 & 1 ) != ( k2 & 1 ) ) + { + return j + UNIT_SIZE * i ; + } + k1 >>= 1 ; + k2 >>= 1 ; + } + } + } + + for ( ; i < asize ; ++i ) + { + if ( tab[i] != 0 ) + { + UINT64 k = tab[i] ; + for ( j = 0 ; j < (int)UNIT_SIZE ; ++j ) + { + if ( k & 1 ) + return j + UNIT_SIZE * i ; + k >>= 1 ; + } + } + } + + for ( ; i < in.asize ; ++i ) + { + if ( in.tab[i] != 0 ) + { + UINT64 k = in.tab[i] ; + for ( j = 0 ; j < (int)UNIT_SIZE ; ++j ) + { + if ( k & 1 ) + return j + UNIT_SIZE * i ; + k >>= 1 ; + } + } + } + return -1 ; + } + + void Duplicate( BitTable &in ) + { + if ( tab != NULL ) + delete [] tab ; + int i ; + size = in.size ; + asize = in.asize ; + tab = new UINT64[ asize ] ; + for ( i = 0 ; i < asize ; ++i ) + tab[i] = in.tab[i] ; + } + + /*void SetBulk( int ind, uint64_t val ) + { + tab[ind] = val ; + }*/ + + void Assign( BitTable &in ) + { + int i ; + for ( i = 0 ; i < asize ; ++i ) + tab[i] = in.tab[i] ; + } + + void Nullify() + { + tab = NULL ; + } + + int GetSize() + { + return size ; + } + + void Print() + { + int i ; + for ( i = 0 ; i < asize; ++i ) + printf( "%llu ", tab[i] ) ; + printf( "\n" ) ; + } +} ; + + +#endif diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/CombineSubexons.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/CombineSubexons.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,1680 @@ +#include +#include + +#include +#include +#include +#include + +#include "alignments.hpp" +#include "blocks.hpp" +#include "stats.hpp" +#include "SubexonGraph.hpp" + +char usage[] = "combineSubexons [options]\n" + "Required options:\n" + "\t-s STRING: the path to the predicted subexon information. 
struct _overhang
{
	int cnt ; // the number of samples that support this subexon.
	int validCnt ; // the number of samples that are used to compute the probability.
	int length ;
	double classifier ;
} ;

struct _intronicInfo
{
	int chrId ;
	int start, end ;
	int leftSubexonIdx, rightSubexonIdx ;
	double irClassifier ;
	int irCnt ;
	int validIrCnt ;
	struct _overhang leftOverhang, rightOverhang ; // leftOverhang is for the overhang subexon at the left side of this intron.
} ;

struct _seInterval
{
	int chrId ;
	int start, end ;
	int type ; // 0-subexon, 1-intronicInfo
	int idx ;
} ;

struct _subexonSplit
{
	int chrId ;
	int pos ;
	int type ; // 1-start of a subexon. 2-end of a subexon
	int splitType ; // 0-soft boundary, 1-start of an exon, 2-end of an exon.
	int strand ;

	int weight ;
} ;

struct _interval // exon or intron
{
	int chrId ;
	int start, end ;
	int strand ;
	int sampleSupport ;
} ;

struct _subexonSupplement // supplement the subexon structure defined in SubexonGraph.
{
	int *nextSupport ;
	int *prevSupport ;
} ;

char buffer[4096] ;

// Strict ordering of split sites: by chromosome, then position, then type
// (subexon start before subexon end). For equal type, a soft boundary
// (splitType 0) comes first; among hard boundaries the larger splitType
// comes first for subexon starts and the smaller one for subexon ends.
// strand and weight do not take part in the ordering.
// Arguments are taken by const reference so sorting does not copy structs.
bool CompSubexonSplit( const struct _subexonSplit &a, const struct _subexonSplit &b )
{
	if ( a.chrId != b.chrId )
		return a.chrId < b.chrId ;
	if ( a.pos != b.pos )
		return a.pos < b.pos ;
	if ( a.type != b.type )
		return a.type < b.type ;
	if ( a.splitType == b.splitType )
		return false ;

	// Soft boundaries sort in front of hard ones.
	if ( a.splitType == 0 )
		return true ;
	if ( b.splitType == 0 )
		return false ;

	return ( a.type == 1 ) ? ( a.splitType > b.splitType ) : ( a.splitType < b.splitType ) ;
}

// Order intervals by chromosome, start, then end. strand and sampleSupport
// are ignored.
bool CompInterval( const struct _interval &a, const struct _interval &b )
{
	if ( a.chrId != b.chrId )
		return a.chrId < b.chrId ;
	if ( a.start != b.start )
		return a.start < b.start ;
	return a.end < b.end ;
}

// Order subexon/intronic intervals by chromosome, start, then end.
bool CompSeInterval( const struct _seInterval &a, const struct _seInterval &b )
{
	if ( a.chrId != b.chrId )
		return a.chrId < b.chrId ;
	if ( a.start != b.start )
		return a.start < b.start ;
	return a.end < b.end ;
}

// Keep this the same as in SubexonInfo.cpp.
// Shift the coverage down by 2, flooring the result at 1e-6.
double TransformCov( double c )
{
	if ( c <= 2 + 1e-6 )
		return 1e-6 ;
	return c - 2 ;
}
// Map a numeric strand to its symbol: positive -> '+', negative -> '-',
// zero -> '.'.
char StrandNumToSymbol( int strand )
{
	if ( strand == 0 )
		return '.' ;
	return strand > 0 ? '+' : '-' ;
}

// Map a strand symbol to its number: '+' -> 1, '-' -> -1, anything else -> 0.
int StrandSymbolToNum( char c )
{
	switch ( c )
	{
		case '+':
			return 1 ;
		case '-':
			return -1 ;
		default:
			return 0 ;
	}
}
+ if ( newCnt == ocnt ) + { + /*i = 0 ; + for ( j = 0 ; j < acnt ; ++j ) + { + for ( ; old[i] < add[j] ; ++i ) + ; + ++(*support)[i] ; + }*/ + return old ; + } + k = 0 ; + //delete []old ; + ret = new int[ newCnt ] ; + //int *bufferSupport = new int[newCnt] ; + for ( i = 0, j = 0 ; i < ocnt && j < acnt ; ) + { + if ( old[i] < add[j] ) + { + ret[k] = old[i] ; + //bufferSupport[k] = (*support)[i] ; + ++i ; + ++k ; + } + else if ( old[i] == add[j] ) + { + ret[k] = old[i] ; + //bufferSupport[k] = (*support)[i] + 1 ; + ++i ; ++j ; + ++k ; + } + else + { + ret[k] = add[j] ; + //bufferSupport[k] = 1 ; + ++j ; + ++k ; + } + } + for ( ; i < ocnt ; ++i, ++k ) + { + ret[k] = old[i] ; + //bufferSupport[k] = (*support)[i] ; + } + for ( ; j < acnt ; ++j, ++k ) + { + ret[k] = add[j] ; + //bufferSupport[k] = 1 ; + } + delete[] old ; + //delete[] *support ; + //*support = bufferSupport ; + return ret ; +} + +void CoalesceSubexonSplits( std::vector &splits, int mid ) +{ + int i, j, k ; + int cnt = splits.size() ; + //std::sort( splits.begin(), splits.end(), CompSubexonSplit ) ; + + std::vector sortedSplits ; + sortedSplits.resize( cnt ) ; + + k = 0 ; + for ( i = 0, j = mid ; i < mid && j < cnt ; ++k ) + { + if ( CompSubexonSplit( splits[i], splits[j] ) ) + { + sortedSplits[k] = splits[i] ; + ++i ; + } + else + { + sortedSplits[k] = splits[j] ; + ++j ; + } + } + for ( ; i < mid ; ++i, ++k ) + sortedSplits[k] = splits[i] ; + for ( ; j < cnt ; ++j, ++k ) + sortedSplits[k] = splits[j] ; + splits = sortedSplits ; + + k = 0 ; + for ( i = 1 ; i < cnt ; ++i ) + { + if ( splits[i].chrId == splits[k].chrId && splits[i].pos == splits[k].pos && splits[i].type == splits[k].type && splits[i].splitType == splits[k].splitType + && splits[i].strand == splits[k].strand ) + { + splits[k].weight += splits[i].weight ; + } + else + { + ++k ; + splits[k] = splits[i] ; + } + } + splits.resize( k + 1 ) ; +} + +void CoalesceDifferentStrandSubexonSplits( std::vector &splits ) +{ + int i, j, k, l ; + int cnt = 
splits.size() ; + k = 0 ; + for ( i = 0 ; i < cnt ; ) + { + for ( j = i + 1 ; j < cnt ; ++j ) + { + if ( splits[i].chrId == splits[j].chrId && splits[i].pos == splits[j].pos && splits[i].type == splits[j].type && splits[i].splitType == splits[j].splitType ) + continue ; + break ; + } + + int maxWeight = -1 ; + int weightSum = 0 ; + int strand = splits[i].strand ; + for ( l = i ; l < j ; ++l ) + { + weightSum += splits[l].weight ; + if ( splits[l].strand != 0 && splits[l].weight > maxWeight ) + { + strand = splits[l].strand ; + maxWeight = splits[l].weight ; + } + } + //printf( "%d: %d %d %d\n", splits[i].pos, i, j, strand ) ; + splits[k] = splits[i] ; + splits[k].strand = strand ; + splits[k].weight = weightSum ; + ++k ; + + i = j ; + } + splits.resize( k ) ; +} + + +void CoalesceIntervals( std::vector &intervals ) +{ + int i, k ; + std::sort( intervals.begin(), intervals.end(), CompInterval ) ; + int cnt = intervals.size() ; + k = 0 ; + for ( i = 1 ; i < cnt ; ++i ) + { + if ( intervals[i].chrId == intervals[k].chrId && intervals[i].start == intervals[k].start && intervals[i].end == intervals[k].end ) + intervals[k].sampleSupport += intervals[i].sampleSupport ; + else + { + ++k ; + intervals[k] = intervals[i] ; + } + } + intervals.resize( k + 1 ) ; +} + +void CleanIntervalIrOverhang( std::vector &irOverhang ) +{ + int i, j, k ; + std::sort( irOverhang.begin(), irOverhang.end(), CompInterval ) ; + + int cnt = irOverhang.size() ; + for ( i = 0 ; i < cnt ; ++i ) + { + if ( irOverhang[i].start == -1 ) + continue ; + + + // locate the longest interval start at the same coordinate. 
+ int tag = i ; + + for ( j = i + 1 ; j < cnt ; ++j ) + { + if ( irOverhang[j].chrId != irOverhang[i].chrId || irOverhang[j].start != irOverhang[i].start ) + break ; + if ( irOverhang[j].start == -1 ) + continue ; + tag = j ; + } + + for ( k = i ; k < tag ; ++k ) + { + irOverhang[k].start = -1 ; + } + + for ( k = tag + 1 ; k < cnt ; ++k ) + { + if ( irOverhang[k].chrId != irOverhang[tag].chrId || irOverhang[k].start > irOverhang[tag].end ) + break ; + if ( irOverhang[k].end <= irOverhang[tag].end ) + { + irOverhang[k].start = -1 ; + } + } + } + + k = 0 ; + for ( i = 0 ; i < cnt ; ++i ) + { + if ( irOverhang[i].start == -1 ) + continue ; + irOverhang[k] = irOverhang[i] ; + ++k ; + } + irOverhang.resize( k ) ; +} + +// Remove the connection that does not match the boundary +// of subexons. +void CleanUpSubexonConnections( std::vector &subexons ) +{ + int seCnt = subexons.size() ; + int i, j, k, m ; + for ( i = 0 ; i < seCnt ; ++i ) + { + if ( subexons[i].prevCnt > 0 ) + { + for ( k = i ; k >= 0 ; --k ) + if ( subexons[k].chrId != subexons[i].chrId || subexons[k].end <= subexons[i].prev[0] ) + break ; + if ( subexons[k].chrId != subexons[i].chrId ) + ++k ; + m = 0 ; + for ( j = 0 ; j < subexons[i].prevCnt ; ++j ) + { + for ( ; k <= i ; ++k ) + if ( subexons[k].end >= subexons[i].prev[j] ) + break ; + if ( subexons[k].end == subexons[i].prev[j] + && ( SubexonGraph::IsSameStrand( subexons[k].rightStrand, subexons[i].leftStrand ) || subexons[k].end + 1 == subexons[i].start ) ) + { + subexons[i].prev[m] = subexons[i].prev[j] ; + ++m ; + } + } + subexons[i].prevCnt = m ; + } + + m = 0 ; + k = i ; + for ( j = 0 ; j < subexons[i].nextCnt ; ++j ) + { + for ( ; k < seCnt ; ++k ) + if ( subexons[k].chrId != subexons[i].chrId || subexons[k].start >= subexons[i].next[j] ) + break ; + if ( subexons[k].start == subexons[i].next[j] + && ( SubexonGraph::IsSameStrand( subexons[i].rightStrand, subexons[k].leftStrand ) || subexons[i].end + 1 == subexons[k].start ) ) + { + 
subexons[i].next[m] = subexons[i].next[j] ; + ++m ; + } + } + subexons[i].nextCnt = m ; + } +} + +int main( int argc, char *argv[] ) +{ + int i, j, k ; + FILE *fp ; + std::vector files ; + + Blocks regions ; + Alignments alignments ; + + if ( argc == 1 ) + { + printf( "%s", usage ) ; + return 0 ; + } + + for ( i = 1 ; i < argc ; ++i ) + { + if ( !strcmp( argv[i], "-s" ) ) + { + files.push_back( argv[i + 1] ) ; + ++i ; + continue ; + } + else if ( !strcmp( argv[i], "--ls" ) ) + { + FILE *fpLs = fopen( argv[i + 1], "r" ) ; + char buffer[1024] ; + while ( fgets( buffer, sizeof( buffer ), fpLs ) != NULL ) + { + int len = strlen( buffer ) ; + if ( buffer[len - 1] == '\n' ) + { + buffer[len - 1] = '\0' ; + --len ; + + } + char *fileName = strdup( buffer ) ; + files.push_back( fileName ) ; + } + } + } + int fileCnt = files.size() ; + // Obtain the chromosome ids through bam file. + fp = fopen( files[0], "r" ) ; + if ( fgets( buffer, sizeof( buffer ), fp ) != NULL ) + { + int len = strlen( buffer ) ; + buffer[len - 1] = '\0' ; + alignments.Open( buffer + 1 ) ; + } + fclose( fp ) ; + + // Collect the split sites of subexons. + std::vector subexonSplits ; + std::vector intervalIrOverhang ; // intervals contains ir and overhang. 
+ std::vector introns ; + std::vector exons ; + + + for ( k = 0 ; k < fileCnt ; ++k ) + { + fp = fopen( files[k], "r" ) ; + struct _subexon se ; + struct _subexonSplit sp ; + char chrName[50] ; + int origSize = subexonSplits.size() ; + while ( fgets( buffer, sizeof( buffer), fp ) != NULL ) + { + if ( buffer[0] == '#' ) + continue ; + + SubexonGraph::InputSubexon( buffer, alignments, se, false) ; + // Record all the intron rentention, overhang from the samples + if ( ( se.leftType == 2 && se.rightType == 1 ) + || ( se.leftType == 2 && se.rightType == 0 ) + || ( se.leftType == 0 && se.rightType == 1 ) ) + { + struct _interval si ; + si.chrId = se.chrId ; + si.start = se.start ; + si.end = se.end ; + + intervalIrOverhang.push_back( si ) ; + } + + // Ignore overhang subexons and ir subexons for now. + if ( ( se.leftType == 0 && se.rightType == 1 ) + || ( se.leftType == 2 && se.rightType == 0 ) + || ( se.leftType == 2 && se.rightType == 1 ) ) + continue ; + + if ( se.leftType == 0 && se.rightType == 0 && ( se.leftClassifier == -1 || se.leftClassifier == 1 ) ) // ignore noisy single-exon island + continue ; + if ( se.leftType == 0 && se.rightType == 0 && ( fileCnt >= 10 && se.leftClassifier > 0.99 ) ) + continue ; + + if ( se.leftType == 1 && se.rightType == 2 ) // a full exon, we allow mixtured strand here. 
+ { + struct _interval ni ; + ni.chrId = se.chrId ; + ni.start = se.start ; + ni.end = se.end ; + ni.strand = se.rightStrand ; + ni.sampleSupport = 1 ; + exons.push_back( ni ) ; + } + + + /*for ( i = 0 ; i < se.nextCnt ; ++i ) + { + struct _interval ni ; + ni.chrId = se.chrId ; + ni.start = se.end ; + ni.end = se.next[i] ; + ni.strand = se.rightStrand ; + ni.sampleSupport = 1 ; + if ( ni.start + 1 < ni.end ) + introns.push_back( ni ) ; + }*/ + + sp.chrId = se.chrId ; + sp.pos = se.start ; + sp.type = 1 ; + sp.splitType = se.leftType ; + sp.strand = se.leftStrand ; + sp.weight = 1 ; + subexonSplits.push_back( sp ) ; + + sp.chrId = se.chrId ; + sp.pos = se.end ; + sp.type = 2 ; + sp.splitType = se.rightType ; + sp.strand = se.rightStrand ; + sp.weight = 1 ; + subexonSplits.push_back( sp ) ; + + /*if ( se.prevCnt > 0 ) + delete[] se.prev ; + if ( se.nextCnt > 0 ) + delete[] se.next ;*/ + } + CoalesceIntervals( exons ) ; + CoalesceIntervals( introns ) ; + CoalesceSubexonSplits( subexonSplits, origSize ) ; + CleanIntervalIrOverhang( intervalIrOverhang ) ; + + fclose( fp ) ; + } + + CoalesceDifferentStrandSubexonSplits( subexonSplits ) ; + + // Obtain the split sites from the introns. 
+ int intronCnt = introns.size() ; + std::vector intronSplits ; + for ( i = 0 ; i < intronCnt ; ++i ) + { + /*if ( introns[i].sampleSupport < 0.05 * fileCnt ) + { + continue ; + }*/ + struct _interval &it = introns[i] ; + struct _subexonSplit sp ; + sp.chrId = it.chrId ; + sp.pos = it.start ; + sp.type = 2 ; + sp.splitType = 2 ; + sp.strand = it.strand ; + intronSplits.push_back( sp ) ; + + sp.chrId = it.chrId ; + sp.pos = it.end ; + sp.type = 1 ; + sp.splitType = 1 ; + sp.strand = it.strand ; + intronSplits.push_back( sp ) ; + } + + // Pair up the split sites to get subexons + std::sort( intronSplits.begin(), intronSplits.end(), CompSubexonSplit ) ; + //std::sort( subexonSplits.begin(), subexonSplits.end(), CompSubexonSplit ) ; + + // Convert the hard boundary to soft boundary if the split sites is filtered from the introns + // Seems NO need to do this now. + int splitCnt = subexonSplits.size() ; + int intronSplitCnt = intronSplits.size() ; + k = 0 ; + //for ( i = 0 ; i < splitCnt ; ++i ) + while ( 0 ) + { + if ( subexonSplits[i].type != subexonSplits[i].splitType ) + continue ; + + while ( k < intronSplitCnt && ( intronSplits[k].chrId < subexonSplits[i].chrId + || ( intronSplits[k].chrId == subexonSplits[i].chrId && intronSplits[k].pos < subexonSplits[i].pos ) ) ) + ++k ; + j = k ; + while ( j < intronSplitCnt && intronSplits[j].chrId == subexonSplits[i].chrId + && intronSplits[j].pos == subexonSplits[i].pos && intronSplits[j].splitType != subexonSplits[i].splitType ) + ++j ; + + // the split site is filtered. + if ( j >= intronSplitCnt || intronSplits[j].chrId != subexonSplits[i].chrId || + intronSplits[j].pos > subexonSplits[i].pos ) + { + //printf( "%d %d. %d %d\n", subexonSplits[i].pos, intronSplits[j].pos, intronSplits[j].chrId , subexonSplits[i].chrId ) ; + subexonSplits[i].splitType = 0 ; + + // Convert the adjacent subexon split. 
+ for ( int l = i + 1 ; i < splitCnt && subexonSplits[l].chrId == subexonSplits[i].chrId + && subexonSplits[l].pos == subexonSplits[i].pos + 1 ; ++l ) + { + if ( subexonSplits[l].type != subexonSplits[i].type + && subexonSplits[l].splitType == subexonSplits[i].splitType ) + { + subexonSplits[l].splitType = 0 ; + } + } + + // And the other direction + for ( int l = i - 1 ; l >= 0 && subexonSplits[l].chrId == subexonSplits[i].chrId + && subexonSplits[l].pos == subexonSplits[i].pos - 1 ; --l ) + { + if ( subexonSplits[l].type != subexonSplits[i].type + && subexonSplits[l].splitType == subexonSplits[i].splitType ) + { + subexonSplits[l].splitType = 0 ; + } + } + } + } + intronSplits.clear() ; + std::vector().swap( intronSplits ) ; + + // Force the soft boundary that collides with hard boundaries to be hard boundary. + for ( i = 0 ; i < splitCnt ; ++i ) + { + if ( subexonSplits[i].splitType != 0 ) + continue ; + int newSplitType = 0 ; + int newStrand = subexonSplits[i].strand ; + for ( j = i + 1 ; j < splitCnt ; ++j ) + { + if ( subexonSplits[i].type != subexonSplits[j].type || subexonSplits[i].pos != subexonSplits[j].pos || + subexonSplits[i].chrId != subexonSplits[j].chrId ) + break ; + if ( subexonSplits[j].splitType != 0 ) + { + newSplitType = subexonSplits[j].splitType ; + newStrand = subexonSplits[j].strand ; + break ; + } + } + + if ( newSplitType == 0 ) + { + for ( j = i - 1 ; j >= 0 ; --j ) + { + if ( subexonSplits[i].type != subexonSplits[j].type || subexonSplits[i].pos != subexonSplits[j].pos || + subexonSplits[i].chrId != subexonSplits[j].chrId ) + break ; + if ( subexonSplits[j].splitType != 0 ) + { + newSplitType = subexonSplits[j].splitType ; + newStrand = subexonSplits[j].strand ; + break ; + } + } + + } + /*if ( subexonSplits[i].pos == 154464157 ) + { + printf( "force conversion: %d %d %d. 
%d %d\n", subexonSplits[i].pos, subexonSplits[i].splitType, subexonSplits[i].weight, subexonSplits[i + 1].pos, subexonSplits[i + 1].splitType ) ; + }*/ + subexonSplits[i].splitType = newSplitType ; + subexonSplits[i].strand = newStrand ; + } + + /*for ( i = 0 ; i < splitCnt ; ++i ) + { + printf( "%d: type=%d splitType=%d weight=%d\n", subexonSplits[i].pos, subexonSplits[i].type, subexonSplits[i].splitType, subexonSplits[i].weight ) ; + }*/ + + // Build subexons from the collected split sites. + + std::vector subexons ; + int diffCnt = 0 ; // |start of subexon split| - |end of subexon split| + int seCnt = 0 ; + for ( i = 0 ; i < splitCnt - 1 ; ++i ) + { + struct _subexon se ; + /*if ( subexonSplits[i + 1].pos == 144177260 ) + { + printf( "%d %d %d: %d %d %d. %d\n", subexonSplits[i].pos, subexonSplits[i].type, subexonSplits[i].splitType, + subexonSplits[i + 1].pos, subexonSplits[i + 1].type, subexonSplits[i + 1].splitType, diffCnt ) ; + }*/ + + if ( subexonSplits[i].type == 1 ) + diffCnt += subexonSplits[i].weight ; + else + diffCnt -= subexonSplits[i].weight ; + + if ( subexonSplits[i + 1].chrId != subexonSplits[i].chrId ) + { + diffCnt = 0 ; + continue ; + } + + if ( diffCnt == 0 ) // the interval between subexon + continue ; + + se.chrId = subexonSplits[i].chrId ; + se.start = subexonSplits[i].pos ; + se.leftType = subexonSplits[i].splitType ; + se.leftStrand = subexonSplits[i].strand ; + if ( subexonSplits[i].type == 2 ) + { + se.leftStrand = 0 ; + ++se.start ; + } + + se.end = subexonSplits[i + 1].pos ; + se.rightType = subexonSplits[i + 1].splitType ; + se.rightStrand = subexonSplits[i + 1].strand ; + if ( subexonSplits[i + 1].type == 1 ) + { + se.rightStrand = 0 ; + --se.end ; + } + + /*if ( se.end == 24613649 ) + { + //printf( "%d %d %d: %d %d %d. %d\n", subexonSplits[i].pos, subexonSplits[i].type, subexonSplits[i].splitType, + // subexonSplits[i + 1].pos, subexonSplits[i + 1].type, subexonSplits[i + 1].splitType, diffCnt ) ; + printf( "%d %d %d. 
%d %d %d\n", se.start, se.leftType, se.leftStrand, se.end, se.rightType, se.rightStrand ) ; + }*/ + + if ( se.start > se.end ) //Note: this handles the case of repeated subexon split. + { + // handle the case in sample 0: [...[..] + // in sample 1: [..]...] + if ( seCnt > 0 && se.end == subexons[seCnt - 1].end && subexons[seCnt - 1].rightType < se.rightType ) + { + subexons[seCnt - 1].rightType = se.rightType ; + subexons[seCnt - 1].rightStrand = se.rightStrand ; + } + continue ; + } + se.leftClassifier = se.rightClassifier = 0 ; + se.lcCnt = se.rcCnt = 0 ; + + /*if ( 1 ) //se.chrId == 25 ) + { + printf( "%d: %d-%d: %d %d\n", se.chrId, se.start, se.end, se.leftType, se.rightType ) ; + }*/ + + + se.next = se.prev = NULL ; + se.nextCnt = se.prevCnt = 0 ; + subexons.push_back( se ) ; + ++seCnt ; + } + subexonSplits.clear() ; + std::vector().swap( subexonSplits ) ; + + // Adjust the split type. + seCnt = subexons.size() ; + for ( i = 1 ; i < seCnt ; ++i ) + { + if ( subexons[i - 1].end + 1 == subexons[i].start ) + { + if ( subexons[i - 1].rightType == 0 ) + subexons[i - 1].rightType = subexons[i].leftType ; + if ( subexons[i].leftType == 0 ) + subexons[i].leftType = subexons[i - 1].rightType ; + } + } + + // Merge the adjacent soft boundaries + std::vector rawSubexons = subexons ; + int exonCnt = exons.size() ; + subexons.clear() ; + + k = 0 ; // hold index for exon. 
+ for ( i = 0 ; i < seCnt ; ) + { + /*if ( rawSubexons[k].rightType == 0 && rawSubexons[i].leftType == 0 + && rawSubexons[k].end + 1 == rawSubexons[i].start ) + { + rawSubexons[k].end = rawSubexons[i].end ; + rawSubexons[k].rightType = rawSubexons[i].rightType ; + rawSubexons[k].rightStrand = rawSubexons[i].rightStrand ; + } + else + { + subexons.push_back( rawSubexons[k] ) ; + k = i ; + }*/ + + while ( k < exonCnt && ( exons[k].chrId < rawSubexons[i].chrId + || ( exons[k].chrId == rawSubexons[i].chrId && exons[k].start < rawSubexons[i].start ) ) ) + ++k ; + + for ( j = i + 1 ; j < seCnt ; ++j ) + { + if ( rawSubexons[j - 1].chrId != rawSubexons[j].chrId || rawSubexons[j - 1].rightType != 0 || rawSubexons[j].leftType != 0 + || ( fileCnt > 1 && rawSubexons[j - 1].end + 1 != rawSubexons[j].start ) + || ( fileCnt == 1 && rawSubexons[j - 1].end + 50 < rawSubexons[j].start ) ) + break ; + } + // rawsubexons[i...j-1] will be merged. + + /*if ( rawSubexons[i].start == 119625875 ) + { + printf( "merge j-1: %d %d %d %d\n", rawSubexons[j - 1].end, rawSubexons[j - 1].rightType, + rawSubexons[j].start, rawSubexons[j].leftType ) ; + }*/ + bool merge = true ; + if ( rawSubexons[i].leftType == 1 && rawSubexons[j - 1].rightType == 2 && j - i > 1 + && rawSubexons[j - 1].end - rawSubexons[i].start >= 1000 ) + { + merge = false ; + int sampleSupport = 0 ; + for ( int l = k ; l < exonCnt ; ++l ) + { + if ( exons[l].chrId != rawSubexons[i].chrId || exons[l].start > rawSubexons[i].start ) + break ; + if ( exons[l].end == rawSubexons[j - 1].end ) + { + merge = true ; + sampleSupport = exons[l].sampleSupport ; + break ; + } + } + + if ( merge == true && rawSubexons[j - 1].end - rawSubexons[i].start >= 1000 ) + { + if ( sampleSupport <= 0.2 * fileCnt ) + { + merge = false ; + } + } + + if ( merge == false ) + { + if ( j - i >= 3 ) + { + rawSubexons[i].end = rawSubexons[ (i + j - 1) / 2 ].start ; + rawSubexons[j - 1].start = rawSubexons[ (i + j - 1) / 2].end ; + } + + if ( 
rawSubexons[i].end + 1 == rawSubexons[j - 1].start ) + { + --rawSubexons[i].end ; + ++rawSubexons[j - 1].start ; + } + subexons.push_back( rawSubexons[i] ) ; + subexons.push_back( rawSubexons[j - 1] ) ; + } + } + + if ( merge ) + { + rawSubexons[i].end = rawSubexons[j - 1].end ; + rawSubexons[i].rightType = rawSubexons[j - 1].rightType ; + rawSubexons[i].rightStrand = rawSubexons[j - 1].rightStrand ; + + if ( rawSubexons[i].leftType == 0 && rawSubexons[i].rightType != 0 ) + { + rawSubexons[i].start = rawSubexons[ ( i + j - 1 ) / 2 ].start ; + } + else if ( rawSubexons[i].rightType == 0 && rawSubexons[i].leftType != 0 ) + { + rawSubexons[i].end = rawSubexons[ ( i + j - 1 ) / 2 ].end ; + } + + subexons.push_back( rawSubexons[i] ) ; + } + + i = j ; + } + exons.clear() ; + std::vector().swap( exons ) ; + + // Remove overhang, ir subexons intron created after putting multiple sample to gether. + // eg: s0: [......) + // s1: [...]--------[....] + // s2: [...]..)-----[....] + // Though the overhang from s2 is filtered in readin, there will a new overhang created combining s0,s1. + // But be careful about how to compute the classifier for the overhang part contributed from s0. + // Furthermore, note that the case of single-exon island showed up in intron retention region after combining is not possible when get here. + // eg: s0:[...]-----[...] + // s1: (.) + // s2:[.............] + // After merge adjacent soft boundaries, the single-exon island will disappear. + rawSubexons = subexons ; + seCnt = subexons.size() ; + subexons.clear() ; + for ( i = 0 ; i < seCnt ; ++i ) + { + if ( ( rawSubexons[i].leftType == 2 && rawSubexons[i].rightType == 1 ) // ir + || ( rawSubexons[i].leftType == 2 && rawSubexons[i].rightType == 0 ) // overhang + || ( rawSubexons[i].leftType == 0 && rawSubexons[i].rightType == 1 ) ) + continue ; + subexons.push_back( rawSubexons[i] ) ; + } + + // Remove the single-exon island if it overlaps with intron retentioned or overhang. 
+ rawSubexons = subexons ; + seCnt = subexons.size() ; + subexons.clear() ; + k = 0 ; + std::sort( intervalIrOverhang.begin(), intervalIrOverhang.end(), CompInterval ) ; + int irOverhangCnt = intervalIrOverhang.size() ; + + for ( i = 0 ; i < seCnt ; ++i ) + { + if ( rawSubexons[i].leftType != 0 || rawSubexons[i].rightType != 0 ) + { + subexons.push_back( rawSubexons[i] ) ; + continue ; + } + + while ( k < irOverhangCnt ) + { + // Locate the interval that before the island + if ( intervalIrOverhang[k].chrId < rawSubexons[i].chrId + || ( intervalIrOverhang[k].chrId == rawSubexons[i].chrId && intervalIrOverhang[k].end < rawSubexons[i].start ) ) + { + ++k ; + continue ; + } + break ; + } + bool overlap = false ; + for ( j = k ; j < irOverhangCnt ; ++j ) + { + if ( intervalIrOverhang[j].chrId > rawSubexons[i].chrId || intervalIrOverhang[j].start > rawSubexons[i].end ) + break ; + if ( ( intervalIrOverhang[j].start <= rawSubexons[i].start && intervalIrOverhang[j].end >= rawSubexons[i].start ) + || ( intervalIrOverhang[j].start <= rawSubexons[i].end && intervalIrOverhang[j].end >= rawSubexons[i].end ) ) + { + overlap = true ; + break ; + } + } + + if ( !overlap ) + subexons.push_back( rawSubexons[i] ) ; + } + rawSubexons.clear() ; + std::vector().swap( rawSubexons ) ; + + intervalIrOverhang.clear() ; + std::vector().swap( intervalIrOverhang ) ; + + // Create the dummy intervals. 
+ seCnt = subexons.size() ; + std::vector intronicInfos ; + std::vector seIntervals ; + std::vector subexonInfo ; + + //subexonInfo.resize( seCnt ) ; + for ( i = 0 ; i < seCnt ; ++i ) + { + struct _seInterval ni ; // new interval + ni.start = subexons[i].start ; + ni.end = subexons[i].end ; + ni.type = 0 ; + ni.idx = i ; + ni.chrId = subexons[i].chrId ; + seIntervals.push_back( ni ) ; + + /*if ( subexons[i].end == 42671717 ) + { + printf( "%d: %d-%d: %d\n", subexons[i].chrId, subexons[i].start, subexons[i].end, subexons[i].rightType ) ; + }*/ + //subexonInfo[i].prevSupport = subexonInfo[i].nextSupport = NULL ; + + /*int nexti ; + for ( nexti = i + 1 ; nexti < seCnt ; ++nexti ) + if ( subexons[ nexti ].leftType == 0 && subexons[nexti].rightType == 0 )*/ + + if ( i < seCnt - 1 && subexons[i].chrId == subexons[i + 1].chrId && + subexons[i].end + 1 < subexons[i + 1].start && + subexons[i].rightType + subexons[i + 1].leftType != 0 ) + { + // Only consider the intervals like ]..[,]...(, )...[ + // The case like ]...] is actaully things like ][...] in subexon perspective, + // so they won't pass the if-statement + struct _intronicInfo nii ; // new intronic info + ni.start = subexons[i].end + 1 ; + ni.end = subexons[i + 1].start - 1 ; + ni.type = 1 ; + ni.idx = intronicInfos.size() ; + seIntervals.push_back( ni ) ; + + nii.chrId = subexons[i].chrId ; + nii.start = ni.start ; + nii.end = ni.end ; + nii.leftSubexonIdx = i ; + nii.rightSubexonIdx = i + 1 ; + nii.irClassifier = 0 ; + nii.irCnt = 0 ; + nii.validIrCnt = 0 ; + nii.leftOverhang.cnt = 0 ; + nii.leftOverhang.validCnt = 0 ; + nii.leftOverhang.length = 0 ; + nii.leftOverhang.classifier = 0 ; + nii.rightOverhang.cnt = 0 ; + nii.rightOverhang.validCnt = 0 ; + nii.rightOverhang.length = 0 ; + nii.rightOverhang.classifier = 0 ; + intronicInfos.push_back( nii ) ; + /*if ( nii.end == 23667 ) + { + printf( "%d %d. 
%d (%d %d %d)\n", nii.start, nii.end, subexons[i].rightType, subexons[i+1].start, subexons[i + 1].end, subexons[i + 1].leftType ) ; + }*/ + } + } + + // Go through all the files to get some statistics number + double avgIrPiRatio = 0 ; + double avgIrPiCov = 0 ; + double irPiRatio, irKRatio[2], irThetaRatio[2] ; // Some statistical results + double irPiCov, irKCov[2], irThetaCov[2] ; + + double avgOverhangPiRatio = 0 ; + double avgOverhangPiCov = 0 ; + double overhangPiRatio, overhangKRatio[2], overhangThetaRatio[2] ; // Some statistical results + double overhangPiCov, overhangKCov[2], overhangThetaCov[2] ; + + for ( k = 0 ; k < fileCnt ; ++k ) + { + fp = fopen( files[k], "r" ) ; + + while ( fgets( buffer, sizeof( buffer), fp ) != NULL ) + { + if ( buffer[0] == '#' ) + { + char buffer2[100] ; + sscanf( buffer, "%s", buffer2 ) ; + if ( !strcmp( buffer2, "#fitted_ir_parameter_ratio:" ) ) + { + // TODO: ignore certain samples if the coverage seems wrong. + sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", + buffer2, buffer2, &irPiRatio, buffer2, &irKRatio[0], buffer2, &irThetaRatio[0], + buffer2, &irKRatio[1], buffer2, &irThetaRatio[1] ) ; + avgIrPiRatio += irPiRatio ; + } + else if ( !strcmp( buffer2, "#fitted_ir_parameter_cov:" ) ) + { + } + else if ( !strcmp( buffer2, "#fitted_overhang_parameter_ratio:" ) ) + { + sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", + buffer2, buffer2, &overhangPiRatio, buffer2, &overhangKRatio[0], buffer2, &overhangThetaRatio[0], + buffer2, &overhangKRatio[1], buffer2, &overhangThetaRatio[1] ) ; + avgOverhangPiRatio += overhangPiRatio ; + } + } + else + break ; + } + fclose( fp ) ; + } + avgIrPiRatio /= fileCnt ; + avgOverhangPiRatio /= fileCnt ; + + // Go through all the files to put statistical results into each subexon. 
+ std::vector< struct _subexon > sampleSubexons ; + int subexonCnt = subexons.size() ; + for ( k = 0 ; k < fileCnt ; ++k ) + { + //if ( k == 220 ) + // exit( 1 ) ; + fp = fopen( files[k], "r" ) ; + struct _subexon se ; + struct _subexonSplit sp ; + char chrName[50] ; + + sampleSubexons.clear() ; + + int tag = 0 ; + while ( fgets( buffer, sizeof( buffer), fp ) != NULL ) + { + if ( buffer[0] == '#' ) + { + char buffer2[200] ; + sscanf( buffer, "%s", buffer2 ) ; + if ( !strcmp( buffer2, "#fitted_ir_parameter_ratio:" ) ) + { + sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", + buffer2, buffer2, &irPiRatio, buffer2, &irKRatio[0], buffer2, &irThetaRatio[0], + buffer2, &irKRatio[1], buffer2, &irThetaRatio[1] ) ; + } + else if ( !strcmp( buffer2, "#fitted_ir_parameter_cov:" ) ) + { + sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", + buffer2, buffer2, &irPiCov, buffer2, &irKCov[0], buffer2, &irThetaCov[0], + buffer2, &irKCov[1], buffer2, &irThetaCov[1] ) ; + } + else if ( !strcmp( buffer2, "#fitted_overhang_parameter_ratio:" ) ) + { + sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", + buffer2, buffer2, &overhangPiRatio, buffer2, &overhangKRatio[0], buffer2, &overhangThetaRatio[0], + buffer2, &overhangKRatio[1], buffer2, &overhangThetaRatio[1] ) ; + } + else if ( !strcmp( buffer2, "#fitted_overhang_parameter_cov:" ) ) + { + sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", + buffer2, buffer2, &overhangPiCov, buffer2, &overhangKCov[0], buffer2, &overhangThetaCov[0], + buffer2, &overhangKCov[1], buffer2, &overhangThetaCov[1] ) ; + } + continue ; + } + else + break ; + + //SubexonGraph::InputSubexon( buffer, alignments, se, true ) ; + //sampleSubexons.push_back( se ) ; + } + + //int sampleSubexonCnt = sampleSubexons.size() ; + int intervalCnt = seIntervals.size() ; + //for ( i = 0 ; i < sampleSubexonCnt ; ++i ) + int iterCnt = 0 ; + while ( 1 ) + { + if ( iterCnt > 0 && fgets( buffer, sizeof( buffer), fp ) == NULL) + break ; + ++iterCnt ; + + 
struct _subexon se ; + SubexonGraph::InputSubexon( buffer, alignments, se, true ) ; + + while ( tag < intervalCnt ) + { + if ( seIntervals[tag].chrId < se.chrId || + ( seIntervals[tag].chrId == se.chrId && seIntervals[tag].end < se.start ) ) + { + ++tag ; + continue ; + } + else + break ; + } + + for ( j = tag ; j < intervalCnt ; ++j ) + { + if ( seIntervals[j].start > se.end || seIntervals[j].chrId > se.chrId ) // terminate if no overlap. + break ; + int idx ; + + if ( seIntervals[j].type == 0 ) + { + idx = seIntervals[j].idx ; + if ( subexons[idx].leftType == 1 && se.leftType == 1 && subexons[idx].start == se.start ) + { + double tmp = se.leftClassifier ; + if ( se.leftClassifier == 0 ) + tmp = 1e-7 ; + subexons[idx].leftClassifier -= 2.0 * log( tmp ) ; + ++subexons[idx].lcCnt ; + subexons[idx].prev = MergePositions( subexons[idx].prev, subexons[idx].prevCnt, + se.prev, se.prevCnt, subexons[idx].prevCnt ) ; + + if ( se.rightType == 0 ) // a gene end here + { + for ( int l = idx ; l < subexonCnt ; ++l ) + { + if ( l > idx && ( subexons[l].end > subexons[l - 1].start + 1 + || subexons[l].chrId != subexons[l - 1].chrId ) ) + break ; + if ( subexons[l].rightType == 2 ) + { + double adjustAvgDepth = se.avgDepth ; + if ( se.end - se.start + 1 >= 100 ) + adjustAvgDepth += se.avgDepth * 100.0 / ( se.end - se.start + 1 ) ; + else + adjustAvgDepth *= 2 ; + double p = GetPValueOfGeneEnd( adjustAvgDepth ) ; + //if ( se.end - se.start + 1 >= 500 && p > 0.001 ) + // p = 0.001 ; + + subexons[l].rightClassifier -= 2.0 * log( p ) ; + ++subexons[l].rcCnt ; + break ; + } + } + } + } + //if ( se.chrId == 25 ) + // printf( "(%d %d %d. %d) (%d %d %d. 
%d)\n", se.chrId, se.start, se.end, se.rightType, subexons[idx].chrId, subexons[idx].start, subexons[idx].end, subexons[idx].rightType ) ; + if ( subexons[idx].rightType == 2 && se.rightType == 2 && subexons[idx].end == se.end ) + { + double tmp = se.rightClassifier ; + if ( se.rightClassifier == 0 ) + tmp = 1e-7 ; + subexons[idx].rightClassifier -= 2.0 * log( tmp ) ; + ++subexons[idx].rcCnt ; + + subexons[idx].next = MergePositions( subexons[idx].next, subexons[idx].nextCnt, + se.next, se.nextCnt, subexons[idx].nextCnt ) ; + + if ( se.leftType == 0 ) + { + for ( int l = idx ; l >= 0 ; --l ) + { + if ( l < idx && ( subexons[l].end < subexons[l + 1].start - 1 + || subexons[l].chrId != subexons[l + 1].chrId ) ) + break ; + if ( subexons[l].leftType == 1 ) + { + double adjustAvgDepth = se.avgDepth ; + if ( se.end - se.start + 1 >= 100 ) + adjustAvgDepth += se.avgDepth * 100.0 / ( se.end - se.start + 1 ) ; + else + adjustAvgDepth *= 2 ; + double p = GetPValueOfGeneEnd( adjustAvgDepth ) ; + //if ( se.end - se.start + 1 >= 500 && p >= 0.001 ) + // p = 0.001 ; + subexons[l].leftClassifier -= 2.0 * log( p ) ; + ++subexons[l].lcCnt ; + break ; + } + } + } + } + + if ( subexons[idx].leftType == 0 && subexons[idx].rightType == 0 + && se.leftType == 0 && se.rightType == 0 ) // the single-exon island. 
+ { + double tmp = se.leftClassifier ; + if ( se.leftClassifier == 0 ) + tmp = 1e-7 ; + subexons[idx].leftClassifier -= 2.0 * log( tmp ) ; + subexons[idx].rightClassifier = subexons[idx].leftClassifier ; + ++subexons[idx].lcCnt ; + ++subexons[idx].rcCnt ; + } + } + else if ( seIntervals[j].type == 1 ) + { + idx = seIntervals[j].idx ; + // Overlap on the left part of intron + if ( se.start <= intronicInfos[idx].start && se.end < intronicInfos[idx].end + && subexons[ intronicInfos[idx].leftSubexonIdx ].rightType != 0 ) + { + int len = se.end - intronicInfos[idx].start + 1 ; + intronicInfos[idx].leftOverhang.length += len ; + ++intronicInfos[idx].leftOverhang.cnt ; + + // Note that the sample subexon must have a soft boundary at right hand side, + // otherwise, this part is not an intron and won't show up in intronic Info. + if ( se.leftType == 2 ) + { + if ( se.leftRatio > 0 && se.avgDepth > 1 ) + { + ++intronicInfos[idx].leftOverhang.validCnt ; + + double update = GetUpdateMixtureGammaClassifier( se.leftRatio, se.avgDepth, + overhangPiRatio, overhangKRatio, overhangThetaRatio, + overhangPiCov, overhangKCov, overhangThetaCov, false ) ; + intronicInfos[idx].leftOverhang.classifier += update ; + } + } + else if ( se.leftType == 1 ) + { + ++intronicInfos[idx].leftOverhang.validCnt ; + double update = GetUpdateMixtureGammaClassifier( 1.0, se.avgDepth, + overhangPiRatio, overhangKRatio, overhangThetaRatio, + overhangPiCov, overhangKCov, overhangThetaCov, true ) ; + intronicInfos[idx].leftOverhang.classifier += update ; + + int seIdx = intronicInfos[idx].leftSubexonIdx ; + subexons[seIdx].rightClassifier -= 2.0 * log( GetPValueOfGeneEnd( se.avgDepth ) ) ; + ++subexons[ seIdx ].rcCnt ; + } + // ignore the contribution of single-exon island here? 
+ } + // Overlap on the right part of intron + else if ( se.start > intronicInfos[idx].start && se.end >= intronicInfos[idx].end + && subexons[ intronicInfos[idx].rightSubexonIdx ].leftType != 0 ) + { + int len = intronicInfos[idx].end - se.start + 1 ; + intronicInfos[idx].rightOverhang.length += len ; + ++intronicInfos[idx].rightOverhang.cnt ; + + // Note that the sample subexon must have a soft boundary at left hand side, + // otherwise, this won't show up in intronic Info + if ( se.rightType == 1 ) + { + if ( se.rightRatio > 0 && se.avgDepth > 1 ) + { + ++intronicInfos[idx].rightOverhang.validCnt ; + + double update = GetUpdateMixtureGammaClassifier( se.rightRatio, se.avgDepth, + overhangPiRatio, overhangKRatio, overhangThetaRatio, + overhangPiCov, overhangKCov, overhangThetaCov, false ) ; + intronicInfos[idx].rightOverhang.classifier += update ; + } + } + else if ( se.rightType == 2 ) + { + ++intronicInfos[idx].rightOverhang.validCnt ; + + double update = GetUpdateMixtureGammaClassifier( 1, se.avgDepth, + overhangPiRatio, overhangKRatio, overhangThetaRatio, + overhangPiCov, overhangKCov, overhangThetaCov, true ) ; + intronicInfos[idx].rightOverhang.classifier += update ; + + int seIdx = intronicInfos[idx].rightSubexonIdx ; + /*if ( subexons[ seIdx ].start == 6873648 ) + { + printf( "%lf %lf: %lf %lf %lf\n", subexons[seIdx].leftClassifier, GetPValueOfGeneEnd( se.avgDepth ), se.avgDepth, sqrt( se.avgDepth ), log( se.avgDepth ) ) ; + }*/ + subexons[seIdx].leftClassifier -= 2.0 * log( GetPValueOfGeneEnd( se.avgDepth ) ) ; + ++subexons[ seIdx ].lcCnt ; + } + } + // Intron is fully contained in this sample subexon, then it is a ir candidate + else if ( se.start <= intronicInfos[idx].start && se.end >= intronicInfos[idx].end ) + { + if ( se.leftType == 2 && se.rightType == 1 ) + { + double ratio = regions.PickLeftAndRightRatio( se.leftRatio, se.rightRatio ) ; + ++intronicInfos[idx].irCnt ; + if ( ratio > 0 && se.avgDepth > 1 ) + { + double update = 
GetUpdateMixtureGammaClassifier( ratio, se.avgDepth, + irPiRatio, irKRatio, irThetaRatio, + irPiCov, irKCov, irThetaCov, true ) ; + //if ( intronicInfos[idx].start == 37617368 ) + // printf( "hi %lf %d %d: %d %d\n", update, se.start, se.end, intronicInfos[idx].start, intronicInfos[idx].end ) ; + intronicInfos[idx].irClassifier += update ; + ++intronicInfos[idx].validIrCnt ; + } + } + else if ( se.leftType == 1 || se.rightType == 2 ) + { + //intronicInfos[idx].irClassifier += LogGammaDensity( 4.0, irKRatio[1], irThetaRatio[1] ) + // - LogGammaDensity( 4.0, irKRatio[0], irThetaRatio[0] ) ; + /*if ( se.start == 37617368 ) + { + printf( "%lf: %lf %lf\n", se.avgDepth, MixtureGammaAssignment( ( irKCov[0] - 1 ) * irThetaCov[0], irPiRatio, irKCov, irThetaCov ), + MixtureGammaAssignment( TransformCov( 4.0 ), irPiRatio, irKCov, irThetaCov ) ) ; + }*/ + if ( se.avgDepth > 1 ) + { + // let the depth be the threshold to determine. + double update = GetUpdateMixtureGammaClassifier( 4.0, se.avgDepth, + irPiRatio, irKRatio, irThetaRatio, + irPiCov, irKCov, irThetaCov, true ) ; + //if ( intronicInfos[idx].start == 36266630 ) + // printf( "hi %lf %d %d: %d %d\n", update, se.start, se.end, intronicInfos[idx].start, intronicInfos[idx].end ) ; + intronicInfos[idx].irClassifier += update ; + ++intronicInfos[idx].irCnt ; + ++intronicInfos[idx].validIrCnt ; + } + } + else + { + // the intron is contained in a overhang subexon from the sample or single-exon island + } + } + // sample subexon is contained in the intron. + else + { + // Do nothing. 
+ } + } + } + + //if ( se.nextCnt > 0 ) + delete[] se.next ; + //if ( se.prevCnt > 0 ) + delete[] se.prev ; + } + fclose( fp ) ; + + /*for ( i = 0 ; i < sampleSubexonCnt ; ++i ) + { + if ( sampleSubexons[i].nextCnt > 0 ) + delete[] sampleSubexons[i].next ; + if ( sampleSubexons[i].prevCnt > 0 ) + delete[] sampleSubexons[i].prev ; + }*/ + } + + CleanUpSubexonConnections( subexons ) ; + + // Convert the temporary statistics number into formal statistics result. + for ( i = 0 ; i < subexonCnt ; ++i ) + { + struct _subexon &se = subexons[i] ; + if ( se.leftType == 0 && se.rightType == 0 ) // single-exon txpt. + { + se.leftClassifier = se.rightClassifier = 1 - chicdf( se.rightClassifier, 2 * se.rcCnt ) ; + } + else + { + if ( se.leftType == 1 ) + { + se.leftClassifier = 1 - chicdf( se.leftClassifier, 2 * se.lcCnt ) ; + } + else + se.leftClassifier = -1 ; + + if ( se.rightType == 2 ) + se.rightClassifier = 1 - chicdf( se.rightClassifier, 2 * se.rcCnt ) ; + else + se.rightClassifier = -1 ; + } + } + + int iiCnt = intronicInfos.size() ; //intronicInfo count + for ( i = 0 ; i < iiCnt ; ++i ) + { + struct _intronicInfo &ii = intronicInfos[i] ; + if ( ii.validIrCnt > 0 ) + { + for ( j = 0 ; j < fileCnt - ii.validIrCnt ; ++j ) + { + ii.irClassifier -= log( 10.0 ) ; + } + /*if ( ii.validIrCnt < fileCnt * 0.15 ) + ii.irClassifier -= log( 1000.0 ) ; + else if ( ii.validIrCnt < fileCnt * 0.5 ) + ii.irClassifier -= log( 100.0 ) ;*/ + ii.irClassifier = (double)1.0 / ( 1.0 + exp( ii.irClassifier + log( 1 - avgIrPiRatio ) - log( avgIrPiRatio ) ) ) ; + } + else + ii.irClassifier = -1 ; + + if ( ii.leftOverhang.validCnt > 0 ) + ii.leftOverhang.classifier = (double)1.0 / ( 1.0 + exp( ii.leftOverhang.classifier + + log( 1 - avgOverhangPiRatio ) - log( avgOverhangPiRatio ) ) ) ; + else + ii.leftOverhang.classifier = -1 ; + + if ( ii.rightOverhang.validCnt > 0 ) + ii.rightOverhang.classifier = (double)1.0 / ( 1.0 + exp( ii.rightOverhang.classifier + + log( 1 - avgOverhangPiRatio ) - log( 
avgOverhangPiRatio ) ) ) ; + else + ii.rightOverhang.classifier = -1 ; + } + + // Change the classifier for the hard boundaries if its adjacent intron has intron retention classifier + // which collide with overhang subexon. + int intervalCnt = seIntervals.size() ; + for ( i = 0 ; i < intervalCnt ; ++i ) + { + if ( seIntervals[i].type == 1 && intronicInfos[ seIntervals[i].idx ].irCnt > 0 ) + { + int idx = seIntervals[i].idx ; + if ( intronicInfos[idx].leftOverhang.cnt > 0 ) + { + int k = seIntervals[i - 1].idx ; + // Should aim for more conservative? + if ( subexons[k].rightClassifier > intronicInfos[idx].leftOverhang.classifier ) + subexons[k].rightClassifier = intronicInfos[idx].leftOverhang.classifier ; + } + + if ( intronicInfos[idx].rightOverhang.cnt > 0 ) + { + int k = seIntervals[i + 1].idx ; + if ( subexons[k].leftClassifier > intronicInfos[idx].rightOverhang.classifier ) + subexons[k].leftClassifier = intronicInfos[idx].rightOverhang.classifier ; + } + } + } + + // Output the result. + for ( i = 0 ; i < intervalCnt ; ++i ) + { + if ( seIntervals[i].type == 0 ) + { + struct _subexon &se = subexons[ seIntervals[i].idx ] ; + + char ls, rs ; + + ls = StrandNumToSymbol( se.leftStrand ) ; + rs = StrandNumToSymbol( se.rightStrand ) ; + + printf( "%s %d %d %d %d %c %c -1 -1 -1 %lf %lf ", alignments.GetChromName( se.chrId ), se.start, se.end, + se.leftType, se.rightType, ls, rs, se.leftClassifier, se.rightClassifier ) ; + if ( i > 0 && seIntervals[i - 1].chrId == seIntervals[i].chrId + && seIntervals[i - 1].end + 1 == seIntervals[i].start + && !( seIntervals[i - 1].type == 0 && + subexons[ seIntervals[i - 1].idx ].rightType != se.leftType ) + && !( seIntervals[i - 1].type == 1 && intronicInfos[ seIntervals[i - 1].idx ].irCnt == 0 + && intronicInfos[ seIntervals[i - 1].idx ].rightOverhang.cnt == 0 ) + && ( se.prevCnt == 0 || se.start - 1 != se.prev[ se.prevCnt - 1 ] ) ) // The connection showed up in the subexon file. 
+ { + printf( "%d ", se.prevCnt + 1 ) ; + for ( j = 0 ; j < se.prevCnt ; ++j ) + printf( "%d ", se.prev[j] ) ; + printf( "%d ", se.start - 1 ) ; + } + else + { + printf( "%d ", se.prevCnt ) ; + for ( j = 0 ; j < se.prevCnt ; ++j ) + printf( "%d ", se.prev[j] ) ; + } + + if ( i < intervalCnt - 1 && seIntervals[i].chrId == seIntervals[i + 1].chrId + && seIntervals[i].end == seIntervals[i + 1].start - 1 + && !( seIntervals[i + 1].type == 0 && + subexons[ seIntervals[i + 1].idx ].leftType != se.rightType ) + && !( seIntervals[i + 1].type == 1 && intronicInfos[ seIntervals[i + 1].idx ].irCnt == 0 + && intronicInfos[ seIntervals[i + 1].idx ].leftOverhang.cnt == 0 ) + && ( se.nextCnt == 0 || se.end + 1 != se.next[0] ) ) + { + printf( "%d %d ", se.nextCnt + 1, se.end + 1 ) ; + } + else + printf( "%d ", se.nextCnt ) ; + for ( j = 0 ; j < se.nextCnt ; ++j ) + printf( "%d ", se.next[j] ) ; + printf( "\n" ) ; + } + else if ( seIntervals[i].type == 1 ) + { + struct _intronicInfo &ii = intronicInfos[ seIntervals[i].idx ] ; + if ( ii.irCnt > 0 ) + { + printf( "%s %d %d 2 1 . . -1 -1 -1 %lf %lf 1 %d 1 %d\n", + alignments.GetChromName( ii.chrId ), ii.start, ii.end, + ii.irClassifier, ii.irClassifier, + seIntervals[i - 1].end, seIntervals[i + 1].start ) ; + } + else + { + // left overhang. + if ( ii.leftOverhang.cnt > 0 ) + { + printf( "%s %d %d 2 0 . . -1 -1 -1 %lf %lf 1 %d 0\n", + alignments.GetChromName( ii.chrId ), ii.start, + ii.start + ( ii.leftOverhang.length / ii.leftOverhang.cnt ) - 1, + ii.leftOverhang.classifier, ii.leftOverhang.classifier, + ii.start - 1 ) ; + } + + // right overhang. + if ( ii.rightOverhang.cnt > 0 ) + { + printf( "%s %d %d 0 1 . . 
-1 -1 -1 %lf %lf 0 1 %d\n", + alignments.GetChromName( ii.chrId ), + ii.end - ( ii.rightOverhang.length / ii.rightOverhang.cnt ) + 1, ii.end, + ii.rightOverhang.classifier, ii.rightOverhang.classifier, + ii.end + 1 ) ; + } + + } + } + } + + return 0 ; +} + diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/Constraints.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/Constraints.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,438 @@ +#include "Constraints.hpp" + +// return whether this constraint is compatible with the subexons. +bool Constraints::ConvertAlignmentToBitTable( struct _pair *segments, int segCnt, + struct _subexon *subexons, int seCnt, int seStart, struct _constraint &ct ) +{ + int i, j, k ; + k = seStart ; + ct.vector.Init( seCnt ) ; + // Each segment of an alignment can cover several subexons. + // But the first and last segment can partially cover a subexon. + for ( i = 0 ; i < segCnt ; ++i ) + { + int leftIdx, rightIdx ; // the range of subexons covered by this segment. 
+ leftIdx = -1 ; + rightIdx = -1 ; + + for ( ; k < seCnt ; ++k ) + { + //if ( segments[0].b == 110282529 && segCnt == 2 ) + // printf( "(%d:%d %d):(%d:%d %d)\n", i, (int)segments[i].a, (int)segments[i].b, k, (int)subexons[k].start, (int)subexons[k].end ) ; + if ( subexons[k].start > segments[i].b ) + break ; + if ( segments[i].a > subexons[k].end ) + continue ; + + int relaxedWidth = 0 ; + if ( ( subexons[k].start >= segments[i].a && subexons[k].end <= segments[i].b ) + || ( i == 0 && subexons[k].start - relaxedWidth < segments[i].a && subexons[k].end <= segments[i].b ) + || ( i == segCnt - 1 && subexons[k].start >= segments[i].a && subexons[k].end + relaxedWidth > segments[i].b ) + || ( i == 0 && i == segCnt - 1 && subexons[k].start - relaxedWidth < segments[i].a && subexons[k].end + relaxedWidth > segments[i].b ) ) + { + if ( leftIdx == -1 ) + leftIdx = k ; + rightIdx = k ; + ct.vector.Set( k ) ; + } + else + { + return false ; + } + } + + if ( leftIdx == -1 ) + return false ; + + // The cover contradict the boundary. + if ( !( ( subexons[leftIdx].leftType == 0 || subexons[leftIdx].start <= segments[i].a ) + && ( subexons[rightIdx].rightType == 0 || subexons[rightIdx].end >= segments[i].b ) ) ) + return false ; + + // The intron must exists in the subexon graph. 
+ if ( i > 0 ) + { + for ( j = 0 ; j < subexons[ ct.last ].nextCnt ; ++j ) + if ( subexons[ct.last].next[j] == leftIdx ) + break ; + if ( j >= subexons[ ct.last ].nextCnt ) + return false ; + } + + // The subexons must be consecutive + for ( j = leftIdx + 1 ; j <= rightIdx ; ++j ) + if ( subexons[j].start > subexons[j - 1].end + 1 ) + return false ; + + if ( i == 0 ) + ct.first = leftIdx ; + ct.last = rightIdx ; + } + return true ; +} + +void Constraints::CoalesceSameConstraints() +{ + int i, k ; + int size = constraints.size() ; + for ( i = 0 ; i < size ; ++i ) + { + constraints[i].info = i ; + //printf( "constraints %d: %d %d %d\n", i, constraints[i].vector.Test( 0 ), constraints[i].vector.Test(1), constraints[i].support ) ; + } + + std::vector newIdx ; + newIdx.resize( size, 0 ) ; + + // Update the constraints. + if ( size > 0 ) + { + std::sort( constraints.begin(), constraints.end(), CompSortConstraints ) ; + + k = 0 ; + newIdx[ constraints[0].info ] = 0 ; + for ( i = 1 ; i < size ; ++i ) + { + if ( constraints[k].vector.IsEqual( constraints[i].vector ) ) + { + constraints[k].weight += constraints[i].weight ; + constraints[k].support += constraints[i].support ; + constraints[k].uniqSupport += constraints[i].uniqSupport ; + constraints[k].maxReadLen = ( constraints[k].maxReadLen > constraints[i].maxReadLen ) ? + constraints[k].maxReadLen : constraints[i].maxReadLen ; + constraints[i].vector.Release() ; + } + else + { + ++k ; + if ( k != i ) + constraints[k] = constraints[i] ; + } + newIdx[ constraints[i].info ] = k ; + } + constraints.resize( k + 1 ) ; + } + + // Update the mate pairs. 
+ size = matePairs.size() ; + if ( size > 0 ) + { + for ( i = 0 ; i < size ; ++i ) + { + //printf( "%d %d: %d => %d | %d =>%d\n", i, newIdx.size(), matePairs[i].i, newIdx[ matePairs[i].i ], + // matePairs[i].j, newIdx[ matePairs[i].j ] ) ; + matePairs[i].i = newIdx[ matePairs[i].i ] ; + matePairs[i].j = newIdx[ matePairs[i].j ] ; + } + + std::sort( matePairs.begin(), matePairs.end(), CompSortMatePairs ) ; + + k = 0 ; + for ( i = 1 ; i < size ; ++i ) + { + if ( matePairs[i].i == matePairs[k].i && matePairs[i].j == matePairs[k].j ) + { + matePairs[k].support += matePairs[i].support ; + matePairs[k].uniqSupport += matePairs[i].uniqSupport ; + } + else + { + ++k ; + matePairs[k] = matePairs[i] ; + } + } + //printf( "%s: %d\n", __func__, matePairs[1].i) ; + matePairs.resize( k + 1 ) ; + } + + // Update the data structure for future mate pairs. + mateReadIds.UpdateIdx( newIdx ) ; +} + +void Constraints::ComputeNormAbund( struct _subexon *subexons ) +{ + int i, j ; + int ctSize = constraints.size() ; + for ( i = 0 ; i < ctSize ; ++i ) + { + // spanned more than 2 subexon + int readLen = constraints[i].maxReadLen ; + if ( constraints[i].first + 1 < constraints[i].last ) + { + std::vector subexonInd ; + constraints[i].vector.GetOnesIndices( subexonInd ) ; + int size = subexonInd.size() ; + for ( j = 1 ; j < size - 1 ; ++j ) + { + int a = subexonInd[j] ; + readLen -= ( subexons[a].end - subexons[a].start + 1 ) ; + } + } + + int effectiveLength ; + if ( constraints[i].first == constraints[i].last ) + { + effectiveLength = ( subexons[ constraints[i].first ].end - readLen + 1 )- subexons[ constraints[i].first ].start + 1 ; + if ( effectiveLength <= 0 ) // this happens in the 3',5'-end subexon, where we trimmed the length + effectiveLength = ( subexons[ constraints[i].first ].end - subexons[ constraints[i].first ].start + 1 ) / 2 + 1 ; + } + else + { + int a = constraints[i].first ; + int b = constraints[i].last ; + int start, end ; // the range of the possible start sites of a 
read in subexons[a]. + start = subexons[a].end + 1 - ( readLen - 1 ) ; + if ( start < subexons[a].start ) + start = subexons[a].start ; + + if ( subexons[b].end - subexons[b].start + 1 >= readLen - 1 || subexons[b].rightType == 0 ) + end = subexons[a].end ; + else + { + end = subexons[a].end + 1 - ( readLen - ( subexons[b].end - subexons[b].start + 1 ) ) ; + } + + if ( end < start ) // when we trimmed the subexon. + end = subexons[a].start ; + + effectiveLength = end - start + 1 ; + } + //printf( "%d: effectiveLength=%d support=%d\n", i, effectiveLength, constraints[i].support ) ; + constraints[i].normAbund = (double)constraints[i].weight / (double)effectiveLength ; + + if ( ( subexons[ constraints[i].first ].leftType == 0 && subexons[ constraints[i].first ].end - subexons[ constraints[i].first ].start + 1 >= 8 * pAlignments->readLen ) + || ( subexons[ constraints[i].last ].rightType == 0 && subexons[ constraints[i].last ].end - subexons[ constraints[i].last ].start + 1 >= 8 * pAlignments->readLen ) ) // some random elongation of the sequence might make unnecessary long effective length. + { + constraints[i].normAbund *= 2 ; + } + constraints[i].abundance = constraints[i].normAbund ; + } + + ctSize = matePairs.size() ; + for ( i = 0 ; i < ctSize ; ++i ) + { + double a = constraints[ matePairs[i].i ].normAbund ; + double b = constraints[ matePairs[i].j ].normAbund ; + + matePairs[i].normAbund = a < b ? 
a : b ; + + if ( matePairs[i].i != matePairs[i].j ) + { + if ( subexons[ constraints[ matePairs[i].i ].first ].leftType == 0 + && constraints[ matePairs[i].i ].first == constraints[ matePairs[i].i ].last + && a b ) + { + matePairs[i].normAbund = a ; + } + } + //matePairs[i].normAbund = sqrt( a * b ) ; + + matePairs[i].abundance = matePairs[i].normAbund ; + } +} + +int Constraints::BuildConstraints( struct _subexon *subexons, int seCnt, int start, int end ) +{ + int i ; + int tag = 0 ; + int coalesceThreshold = 16384 ; + Alignments &alignments = *pAlignments ; + // Release the memory from previous gene. + int size = constraints.size() ; + + if ( size > 0 ) + { + for ( i = 0 ; i < size ; ++i ) + constraints[i].vector.Release() ; + std::vector().swap( constraints ) ; + } + std::vector().swap( matePairs ) ; + mateReadIds.Clear() ; + + // Start to build the constraints. + bool callNext = false ; // the last used alignment + if ( alignments.IsAtBegin() ) + callNext = true ; + while ( !alignments.IsAtEnd() ) + { + if ( callNext ) + { + if ( !alignments.Next() ) + break ; + } + else + callNext = true ; + + if ( alignments.GetChromId() < subexons[0].chrId ) + continue ; + else if ( alignments.GetChromId() > subexons[0].chrId ) + break ; + // locate the first subexon in this region that overlapps with current alignment. + for ( ; tag < seCnt && subexons[tag].end < alignments.segments[0].a ; ++tag ) + ; + + if ( tag >= seCnt ) + break ; + if ( alignments.segments[ alignments.segCnt - 1 ].b < subexons[tag].start ) + continue ; + + int uniqSupport = 0 ; + if ( usePrimaryAsUnique ) + uniqSupport = alignments.IsPrimary() ? 1 : 0 ; + else + uniqSupport = alignments.IsUnique() ? 
1 : 0 ; + + struct _constraint ct ; + ct.vector.Init( seCnt ) ; + //printf( "%s %d: %lld-%lld | %d-%d\n", __func__, alignments.segCnt, alignments.segments[0].a, alignments.segments[0].b, subexons[tag].start, subexons[tag].end ) ; + ct.weight = 1.0 / alignments.GetNumberOfHits() ; + if ( alignments.IsGCRich() ) + ct.weight *= 10 ; + ct.normAbund = 0 ; + ct.support = 1 ; + ct.uniqSupport = uniqSupport ; + ct.maxReadLen = alignments.GetRefCoverLength() ; + + if ( alignments.IsPrimary() && ConvertAlignmentToBitTable( alignments.segments, alignments.segCnt, + subexons, seCnt, tag, ct ) ) + { + + //printf( "%s ", alignments.GetReadId() ) ; + //ct.vector.Print() ; + + + // If the alignment has clipped end or tail. We only keep those clipped in the 3'/5'-end + bool validClip = true ; + if ( alignments.HasClipHead() ) + { + if ( ( ct.first < seCnt - 1 && subexons[ct.first].end + 1 == subexons[ct.first + 1].start ) + || subexons[ct.first].prevCnt > 0 + || alignments.segments[0].b - alignments.segments[0].a + 1 <= alignments.GetRefCoverLength() / 3.0 ) + validClip = false ; + } + if ( alignments.HasClipTail() ) + { + int tmp = alignments.segCnt - 1 ; + if ( ( ct.last > 0 && subexons[ct.last].start - 1 == subexons[ct.last - 1].end ) + || subexons[ct.last].nextCnt > 0 + || alignments.segments[tmp].b - alignments.segments[tmp].a + 1 <= alignments.GetRefCoverLength() / 3.0 ) + validClip = false ; + } + + if ( validClip ) + { + constraints.push_back( ct ) ; // if we just coalesced but the list size does not decrease, this will force capacity increase. + //if ( !strcmp( alignments.GetReadId(), "ERR188021.8489052" ) ) + // ct.vector.Print() ; + // Add the mate-pair information. 
+ int mateChrId ; + int64_t matePos ; + alignments.GetMatePosition( mateChrId, matePos ) ; + if ( alignments.GetChromId() == mateChrId ) + { + if ( matePos < alignments.segments[0].a ) + { + int mateIdx = mateReadIds.Query( alignments.GetReadId(), alignments.segments[0].a ) ; + if ( mateIdx != -1 ) + { + struct _matePairConstraint nm ; + nm.i = mateIdx ; + nm.j = constraints.size() - 1 ; + nm.abundance = 0 ; + nm.support = 1 ; + nm.uniqSupport = uniqSupport ; + nm.effectiveCount = 2 ; + matePairs.push_back( nm ) ; + } + } + else if ( matePos > alignments.segments[0].a ) + { + mateReadIds.Insert( alignments.GetReadId(), alignments.segments[0].a, constraints.size() - 1, matePos ) ; + } + else // two mates have the same coordinate. + { + if ( alignments.IsFirstMate() ) + { + struct _matePairConstraint nm ; + nm.i = constraints.size() - 1 ; + nm.j = constraints.size() - 1 ; + nm.abundance = 0 ; + nm.support = 1 ; + nm.uniqSupport = uniqSupport ; + nm.effectiveCount = 2 ; + matePairs.push_back( nm ) ; + } + } + } + } + else + ct.vector.Release() ; + + // Coalesce if necessary. + size = constraints.size() ; + if ( (int)size > coalesceThreshold && size == (int)constraints.capacity() ) + { + + //printf( "start coalescing. %d\n", constraints.capacity() ) ; + CoalesceSameConstraints() ; + + // Not coalesce enough + if ( constraints.size() >= constraints.capacity() / 2 ) + { + coalesceThreshold *= 2 ; + } + } + } + else + { + //printf( "not compatible\n" ) ; + ct.vector.Release() ; + } + } + //printf( "start coalescing. %d %d\n", constraints.size(), matePairs.size() ) ; + CoalesceSameConstraints() ; + //printf( "after coalescing. 
%d %d\n", constraints.size(), matePairs.size() ) ; + //for ( i = 0 ; i < matePairs.size() ; ++i ) + // printf( "matePair: %d %d %d\n", matePairs[i].i, matePairs[i].j, matePairs[i].support ) ; + // single-end data set + //if ( matePairs.size() == 0 ) + if ( alignments.fragStdev == 0 ) + { + int size = constraints.size() ; + matePairs.clear() ; + + for ( i = 0 ; i < size ; ++i ) + { + struct _matePairConstraint nm ; + nm.i = i ; + nm.j = i ; + nm.abundance = 0 ; + nm.support = constraints[i].support ; + nm.uniqSupport = constraints[i].uniqSupport ; + nm.effectiveCount = 1 ; + + matePairs.push_back( nm ) ; + } + } + + ComputeNormAbund( subexons ) ; + + /*for ( i = 0 ; i < constraints.size() ; ++i ) + { + printf( "constraints %d: %lf %d %d %d ", i, constraints[i].normAbund, constraints[i].first, constraints[i].last, constraints[i].support ) ; + constraints[i].vector.Print() ; + } + + for ( i = 0 ; i < matePairs.size() ; ++i ) + { + printf( "mates %d: %lf %d %d %d %d\n", i, matePairs[i].normAbund, matePairs[i].i, matePairs[i].j, matePairs[i].support, matePairs[i].uniqSupport ) ; + }*/ + + return 0 ; +} diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/Constraints.hpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/Constraints.hpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,445 @@ +#ifndef _MOURISL_CLASSES_CONSTRAINTS_HEADER +#define _MOURISL_CLASSES_CONSTRAINTS_HEADER + +#include +#include +#include +#include +#include + +#include "BitTable.hpp" +#include "alignments.hpp" +#include "SubexonGraph.hpp" + +struct _constraint +{ + BitTable vector ; // subexon vector + double weight ; + double normAbund ; + double abundance ; + int support ; + int uniqSupport ; + int maxReadLen ; // the longest read length support this constraint. + + int info ; // other usages. + int first, last ; // indicate the first and last index of the subexons. 
// Two constraints (indices i and j into the constraint list) linked by the two mates of a fragment.
struct _matePairConstraint
{
	int i, j ;

	int support ;
	int uniqSupport ;

	double abundance ;
	double normAbund ;
	int effectiveCount ;
	int type ;
} ;

// One entry of the heap in MateReadIds. readId is an owned, strdup'ed copy of the read name.
struct _readIdHeap
{
	char *readId ;
	int pos ;
	int matePos ;
	int idx ;
} ;

//----------------------------------------------------------------------------------
// We assume the access to the data structure is sorted by matePos.
// So we can "pre-cache" the ids with the same matePos.
//
// Insert() registers a read together with the position where its mate is expected. Query() is called
// with a read at position matePos and returns the idx registered by its mate, or -1.
// NOTE(review): the class owns malloc'ed strings but uses the implicit copy operations, so copying a
// MateReadIds that holds entries would free the same strings twice.
class MateReadIds
{
private:
	std::vector< struct _readIdHeap > heap ; // 1-based binary min-heap on matePos; heap[0] is a dummy.

	// Move heap[tag] towards the root until its parent has a smaller matePos.
	void HeapifyUp( int tag )
	{
		while ( tag > 1 )
		{
			if ( heap[tag / 2].matePos < heap[tag].matePos )
				return ;
			struct _readIdHeap tmp ;
			tmp = heap[tag / 2] ;
			heap[tag / 2] = heap[tag] ;
			heap[tag] = tmp ;

			tag /= 2 ;
		}
	}

	// Move heap[tag] towards the leaves until both children have a larger matePos.
	void HeapifyDown( int tag )
	{
		int size = heap.size() ;
		while ( 2 * tag < size )
		{
			int choose = 2 * tag ;
			if ( 2 * tag + 1 < size && 
				heap[ 2 * tag + 1].matePos < heap[2 * tag ].matePos )
			{
				choose = 2 * tag + 1 ;
			}

			if ( heap[tag].matePos < heap[choose].matePos )
				return ;

			struct _readIdHeap tmp ;
			tmp = heap[choose] ;
			heap[choose] = heap[tag] ;
			heap[tag] = tmp ;

			tag = choose ;
		}
	}

	// Remove and return the entry with the smallest matePos. The caller takes over readId.
	// When the heap is empty, the returned entry has readId == NULL.
	struct _readIdHeap Pop()
	{
		struct _readIdHeap ret ;
		int size = heap.size() ;
		if ( size < 2 )
		{
			// Set every field so that the caller never reads an indeterminate value.
			ret.readId = NULL ;
			ret.pos = ret.idx = ret.matePos = -1 ;
			return ret ;
		}

		ret = heap[1] ;

		heap[1] = heap[ heap.size() - 1] ;
		heap.pop_back() ;
		HeapifyDown( 1 ) ;

		return ret ;
	}

	int cachedMatePos ; // the matePos whose read ids are currently held in cachedIdx.
	std::map<std::string, int> cachedIdx ; // read id -> idx, for the entries with matePos == cachedMatePos.
	bool hasMateReadIdSuffix ; // ignore the last ".{1,2}" or "/{1,2}" .
public:
	MateReadIds() 
	{
		// Push a dummy element so the vector becomes 1-based.
		struct _readIdHeap nh ;
		nh.readId = NULL ; 
		nh.pos = nh.idx = nh.matePos = -1 ;
		heap.push_back( nh ) ;
		cachedMatePos = -1 ;
		hasMateReadIdSuffix = false ;
	}
	~MateReadIds() 
	{
		int size = heap.size() ;
		std::map<std::string, int>().swap( cachedIdx ) ;
		for ( int i = 0 ; i < size ; ++i )
			if ( heap[i].readId != NULL )
				free( heap[i].readId ) ;
	}

	// Drop all the stored read ids and the cache. hasMateReadIdSuffix is kept.
	void Clear()
	{
		std::map<std::string, int>().swap( cachedIdx ) ;

		int size = heap.size() ;
		for ( int i = 0 ; i < size ; ++i )
			if ( heap[i].readId != NULL )
				free( heap[i].readId ) ;
		std::vector<struct _readIdHeap>().swap( heap ) ;

		struct _readIdHeap nh ;
		nh.readId = NULL ; 
		nh.pos = nh.idx = nh.matePos = -1 ;
		heap.push_back( nh ) ;
		cachedMatePos = -1 ;
	}

	// Register the read "id" located at pos, whose mate is located at matePos. idx is the value
	// Query() returns to the mate. The id string is copied.
	void Insert( char *id, int pos, int idx, int matePos )
	{
		struct _readIdHeap nh ;
		nh.readId = strdup( id ) ;
		nh.pos = pos ;
		nh.idx = idx ;
		nh.matePos = matePos ;

		heap.push_back( nh ) ;
		HeapifyUp( heap.size() - 1 ) ;
	}

	// Look up the idx registered by the mate of read "id", where matePos is the position the mate
	// announced in Insert(). Entries with a smaller matePos are discarded, so the calls must come with
	// non-decreasing matePos.
	// If the id does not exist, return -1.
	int Query( char *id, int matePos )
	{
		int size ;
		size = heap.size() ;
		if ( matePos > cachedMatePos )
		{
			std::map<std::string, int>().swap( cachedIdx ) ;

			// These entries can not be asked for any more.
			while ( size >= 2 && heap[1].matePos < matePos )
			{
				struct _readIdHeap r = Pop() ;
				if ( r.readId )
				{
					free( r.readId ) ;
				}
				--size ;
			}

			// Move the entries of this position into the cache.
			while ( size >= 2 && heap[1].matePos == matePos )
			{
				struct _readIdHeap r = Pop() ;
				if ( r.readId ) // test before use: std::string( NULL ) is undefined behavior.
				{
					cachedIdx[ std::string( r.readId ) ] = r.idx ;
					free( r.readId ) ;
				}
				--size ;
			}
			cachedMatePos = matePos ;
		}
		std::string s( id ) ;
		if ( hasMateReadIdSuffix )
		{
			// The mate is stored under the other suffix: turn a final 1 into 2 and 2 into 1.
			int len = s.length() ;
			if ( len >= 2 && ( s[len - 1] == '1' || s[len - 1] == '2' ) 
				&& ( s[len - 2] == '.' || s[len - 2] == '/' ) )
			{
				s[len - 1] = '2' - s[len - 1] + '1' ;
			}
		}

		std::map<std::string, int>::iterator it = cachedIdx.find( s ) ;
		if ( it != cachedIdx.end() )
		{
			return it->second ;
		}
		return -1 ;
	}

	// Replace every stored idx by newIdx[idx], in the heap and in the cache.
	void UpdateIdx( std::vector<int> &newIdx )
	{
		int size = heap.size() ;
		int i ;
		for ( i = 1 ; i < size ; ++i )
		{
			heap[i].idx = newIdx[ heap[i].idx ] ;
		}

		for ( std::map<std::string, int>::iterator it = cachedIdx.begin() ; it != cachedIdx.end() ; ++it )
			it->second = newIdx[ it->second ] ;
	}

	// Tell whether the two mates of a fragment are named with a different ".1"/".2" or "/1"/"/2" suffix.
	void SetHasMateReadIdSuffix( bool in )
	{
		hasMateReadIdSuffix = in ; // was hard-coded to true, which ignored the argument.
	}
} ;


// Opening of class Constraints; it continues on the following lines of this dump and is kept verbatim.
//-------------------------------------------------------------------------- +class Constraints +{ +private: + int prevStart, prevEnd ; + bool usePrimaryAsUnique ; + MateReadIds mateReadIds ; + + Alignments *pAlignments ; + + //@return: whether this alignment is compatible with the subexons or not. + bool ConvertAlignmentToBitTable( struct _pair *segments, int segCnt, struct _subexon *subexons, int seCnt, int seStart, struct _constraint &ct ) ; + + // Sort to increasing order. Since the first subexon occupies the least important digit. + static bool CompSortConstraints( const struct _constraint &a, const struct _constraint &b ) + { + //int k + if ( a.first < b.first ) + return true ; + else if ( a.first > b.first ) + return false ; + + int diffPos = a.vector.GetFirstDifference( b.vector ) ; + if ( diffPos == -1 ) // case of equal.
+ return false ; + + if ( a.vector.Test( diffPos )) + return false ; + else + return true ; + } + + static bool CompSortMatePairs( const struct _matePairConstraint &a, const struct _matePairConstraint &b ) + { + if ( a.i < b.i ) + return true ; + else if ( a.i > b.i ) + return false ; + else + { + if ( a.j < b.j ) + return true ; + else + return false ; + } + } + + void CoalesceSameConstraints() ; + void ComputeNormAbund( struct _subexon *subexons ) ; +public: + std::vector constraints ; + std::vector matePairs ; + + Constraints() + { + usePrimaryAsUnique = false ; + } + + Constraints( Alignments *a ): pAlignments( a ) + { + } + + ~Constraints() + { + int i ; + int size = constraints.size() ; + for ( i = 0 ; i < size ; ++i ) + constraints[i].vector.Release() ; + constraints.clear() ; + std::vector().swap( constraints ) ; + matePairs.clear() ; + std::vector().swap( matePairs ) ; + } + + void Clear() + { + //TODO: do I need to release the memory from BitTable? + constraints.clear() ; + } + + void SetAlignments( Alignments *a ) + { + pAlignments = a ; + } + + void SetUsePrimaryAsUnique( bool in ) + { + usePrimaryAsUnique = in ; + } + + void Assign( Constraints &c ) + { + int i ; + int size = constraints.size() ; + if ( size > 0 ) + { + for ( i = 0 ; i < size ; ++i ) + constraints[i].vector.Release() ; + constraints.clear() ; + std::vector().swap( constraints ) ; + } + matePairs.clear() ; + std::vector().swap( matePairs ) ; + + //constraints.resize( c.constraints.size() ) ; + constraints = c.constraints ; + size = c.constraints.size() ; + for ( i = 0 ; i < size ; ++i ) + { + /*struct _constraint nc ; + nc.weight = c.constraints[i].weight ; + nc.normAbund = c.constraints[i].normAbund ; + nc.abundance = c.constraints[i].abundance ; + nc.support = c.constraints[i].support ; + nc.uniqSupport = c.constraints[i].uniqSupport ; + nc.maxReadLen = c.constraints[i].maxReadLen ; + nc.info = c.constraints[i].info ; + nc.first = c.constraints[i].first ; + nc.last = 
c.constraints[i].last ; + nc.vector.Duplicate( c.constraints[i].vector ) ; + constraints[i] = ( nc ) ; */ + constraints[i].vector.Nullify() ; // so that it won't affect the BitTable in "c" + constraints[i].vector.Duplicate( c.constraints[i].vector ) ; + } + matePairs = c.matePairs ; + pAlignments = c.pAlignments ; + } + + void DownsampleConstraintsFrom( Constraints &c, int stride = 10 ) + { + int i ; + int size = constraints.size(), k ; + + if ( size > 0 ) + { + for ( i = 0 ; i < size ; ++i ) + constraints[i].vector.Release() ; + constraints.clear() ; + std::vector().swap( constraints ) ; + } + matePairs.clear() ; + std::vector().swap( matePairs ) ; + + //constraints.resize( c.constraints.size() ) ; + //constraints = c.constraints ; + k = 0 ; + size = c.constraints.size() ; + for ( i = 0 ; i < size ; i += stride, ++k ) + { + constraints.push_back( c.constraints[i] ) ; + constraints[k].vector.Nullify() ; // so that it won't affect the BitTable in "c" + constraints[k].vector.Duplicate( c.constraints[i].vector ) ; + + /*std::vector seIdx ; + constraints[k].vector.GetOnesIndices( seIdx ) ; + int j, l = seIdx.size() ; + for ( j = 2 ; j < l ; ++j ) + { + constraints[k].vector.Unset( seIdx[j] ) ; + } + constraints[k].last = seIdx[1] ;*/ + } + // mate pairs is not used. 
if we down-sampling + pAlignments = c.pAlignments ; + } + + void TruncateConstraintsCoverFrom( Constraints &c, int seCnt, int maxConstraintSize ) + { + int i ; + int size = constraints.size() ; + + if ( size > 0 ) + { + for ( i = 0 ; i < size ; ++i ) + constraints[i].vector.Release() ; + constraints.clear() ; + std::vector().swap( constraints ) ; + } + matePairs.clear() ; + std::vector().swap( matePairs ) ; + + //constraints.resize( c.constraints.size() ) ; + //constraints = c.constraints ; + size = c.constraints.size() ; + for ( i = 0 ; i < size ; ++i ) + { + constraints.push_back( c.constraints[i] ) ; + constraints[i].vector.Nullify() ; // so that it won't affect the BitTable in "c" + constraints[i].vector.Init( seCnt ) ; + std::vector seIdx ; + c.constraints[i].vector.GetOnesIndices( seIdx ) ; + int j, l = seIdx.size() ; + for ( j = 0 ; j < maxConstraintSize && j < l ; ++j ) + { + constraints[i].vector.Set( seIdx[j] ) ; + } + constraints[i].last = seIdx[j - 1] ; + } + // mate pairs is not used. if we down-sampling + pAlignments = c.pAlignments ; + } + + void SetHasMateReadIdSuffix( bool in ) + { + mateReadIds.SetHasMateReadIdSuffix( in ) ; + } + + int BuildConstraints( struct _subexon *subexons, int seCnt, int start, int end ) ; + +} ; + +#endif diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/FilterSplice.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/FilterSplice.pl Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,48 @@ +#!/bin/perl + +use strict ; +use warnings ; + +die "usage: a.pl a.raw_splice trusted.plisce > filtered.splice\n" if ( @ARGV == 0 ) ; + +my %trustedSplices ; + +open FP1, $ARGV[1] ; +while ( ) +{ + chomp ; + my @cols = split /\s+/, $_ ; + my $key = $cols[0]." ".$cols[1]." ".$cols[2] ; + $trustedSplices{ $key } = $cols[4] ; +} +close FP1 ; + +open FP1, $ARGV[0] ; +while ( ) +{ + chomp ; + my @cols = split /\s+/, $_ ; + my $key = $cols[0]." ".$cols[1]." 
".$cols[2] ; + next if ( !defined $trustedSplices{ $key } ) ; + + my $trustedStrand = $trustedSplices{ $key } ; + if ( $cols[3] <= 0 ) + { + print $cols[0], " ", $cols[1], " ", $cols[2], " 1 ", $trustedStrand, " 1 0 0 0\n" ; + } + else + { + if ( $cols[4] eq $trustedStrand ) + { + print $_, "\n" ; + } + else + { + $cols[4] = $trustedStrand ; + print join( " ", @cols ), "\n" ; + } + } + +} +close FP1 ; + diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/FindJunction.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/FindJunction.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,1108 @@ +/* + Find junctions from the SAM file generated by Tophat2. + The SAM file should be sorted. + The junction is the start and the end coordinates of the spliced region in this program. The output of the junction is a bit different. + + Usage: ./a.out input [option] >output. + ./a.out -h for help. +*/ +#include +#include +#include + +#include "sam.h" + +#define LINE_SIZE 4097 +#define QUEUE_SIZE 10001 +#define HASH_MAX 1000003 + +struct _readTree +{ + char id[256] ; + int leftAnchor, rightAnchor ; + int pos ; + //bool secondary ; + bool valid ; + int editDistance ; + int NH, cnt ; // if cnt < NH, then it has real secondary match for this splice junction + struct _readTree *left, *right ; + + int flag ;// The flag from sam head. +} ; + +// The structure of a junction +struct _junction +{ + int start, end ; + int readCnt ; // The # of reads containing this junction + int secReadCnt ; // The # of reads whose secondary alignment has this junction. + char strand ; // On '+' or '-' strand + int leftAnchor, rightAnchor ; // The longest left and right anchor + int oppositeAnchor ; // The longest anchor of the shorter side. + int uniqEditDistance, secEditDistance ; + struct _readTree head ; +} ; + +char line[LINE_SIZE] ; +char col[11][LINE_SIZE] ; // The option fields is not needed. 
+char strand ; // Extract XS field +char noncanonStrandInfo ; +//bool secondary ; +int NH ; +int editDistance ; +int mateStart ; +int filterYS ; +int samFlag ; + +struct _junction junctionQueue[QUEUE_SIZE] ; // Expected only a few junctions in it for each read. This queue is sorted. + +int qHead, qTail ; + +bool flagPrintJunction ; +bool flagPrintAll ; +bool flagStrict ; +int junctionCnt ; +bool anchorBoth ; +bool validRead ; + +int flank ; +struct _cigarSeg +{ + int len ; + char type ; +} ; + +struct _readTree *contradictedReads ; + +char nucToNum[26] = { 0, 4, 1, 4, 4, 4, 2, + 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 3, + 4, 4, 4, 4, 4, 4 } ; + +void PrintHelp() +{ + printf( + "Prints reads from the SAM/BAM file that containing junctions.\n" + "Usage: ./a.out input [option]>output\n" + "Options:\n" + "\t-j xx [-B]: Output the junctions using 1-based coordinates. The format is \"reference id\" start end \"# of read\" strand.(They are sorted)\n and the xx is an integer means the maximum unqualified anchor length for a splice junction(default=8). 
If -B, the splice junction must be supported by a read whose both anchors are longer than xx.\n" + "\t-a: Output all the junctions, and use non-positive support number to indicate unqualified junctions.\n" + "\t-y: If the bits from YS field of bam matches the argument, we filter the alignment (default: 4).\n" + ) ; +} + +void GetJunctionInfo( struct _junction &junc, struct _readTree *p ) +{ + if ( p == NULL ) + return ; + + if ( p->valid ) + { + //if ( junc.start == 22381343 + 1 && junc.end == 22904987 - 1 ) + // printf( "%s %d %d %d\n", p->id, p->leftAnchor, p->rightAnchor, p->flag ) ; + + + + if ( p->cnt < p->NH ) + { + junc.secEditDistance += p->editDistance ; + ++junc.secReadCnt ; + } + else + { + junc.uniqEditDistance += p->editDistance ; + ++junc.readCnt ; + } + int l = p->leftAnchor, r = p->rightAnchor ; + + if ( !anchorBoth ) + { + if ( l > junc.leftAnchor ) + junc.leftAnchor = l ; + if ( r > junc.rightAnchor ) + junc.rightAnchor = r ; + + if ( l <= r && l > junc.oppositeAnchor ) + junc.oppositeAnchor = l ; + else if ( r < l && r > junc.oppositeAnchor ) + junc.oppositeAnchor = r ; + } + else + { + if ( l > flank && r > flank ) + { + junc.leftAnchor = l ; + junc.rightAnchor = r ; + } + + if ( l <= r && l > junc.oppositeAnchor ) + junc.oppositeAnchor = l ; + else if ( r < l && r > junc.oppositeAnchor ) + junc.oppositeAnchor = r ; + } + } + GetJunctionInfo( junc, p->left ) ; + GetJunctionInfo( junc, p->right ) ; +} + +void PrintJunctionReads( struct _junction &junc, struct _readTree *p ) +{ + if ( p == NULL ) + return ; + + if ( p->valid ) + printf( "%s\n", p->id ) ; + PrintJunctionReads( junc, p->left ) ; + PrintJunctionReads( junc, p->right ) ; +} + +void PrintJunction( char *chrome, struct _junction &junc ) +{ + int sum ; + junc.leftAnchor = 0 ; + junc.rightAnchor = 0 ; + junc.oppositeAnchor = 0 ; + junc.readCnt = 0 ; + junc.secReadCnt = 0 ; + junc.uniqEditDistance = 0 ; + junc.secEditDistance = 0 ; + GetJunctionInfo( junc, &junc.head ) ; + + sum = 
junc.readCnt + junc.secReadCnt ; + + if ( junc.leftAnchor <= flank || junc.rightAnchor <= flank || ( junc.readCnt + junc.secReadCnt <= 0 ) ) + { + if ( flagPrintAll ) + { + sum = -sum ; + } + else + return ; + } + + /*if ( junc.end - junc.start + 1 >= 200000 ) + { + if ( junc.secReadCnt > 0 ) + junc.secReadCnt = 1 ; + }*/ + + /*if ( junc.oppositeAnchor <= ( ( flank / 2 < 1 ) ? flank / 2 : 1 ) && ( junc.readCnt + junc.secReadCnt ) <= 10 ) + { + if ( junc.readCnt > 0 ) + junc.readCnt = 1 ; + if ( junc.secReadCnt > 0 ) + junc.secReadCnt = 0 ; + }*/ + + printf( "%s %d %d %d %c %d %d %d %d\n", chrome, junc.start - 1, junc.end + 1, sum, junc.strand, + junc.readCnt, junc.secReadCnt, junc.uniqEditDistance, junc.secEditDistance ) ; + //PrintJunctionReads( junc, &junc.head ) ; +} + +void ClearReadTree( struct _readTree *p ) +{ + if ( p == NULL ) + return ; + ClearReadTree( p->left ) ; + ClearReadTree( p->right ) ; + free( p ) ; +} + +// Insert to the read tree +bool InsertReadTree( struct _readTree *p, char *id, int l, int r ) +{ + int tmp = strcmp( p->id, id ) ; + if ( tmp == 0 ) + { + p->cnt += 1 ; + return true ; + } + else if ( tmp < 0 ) + { + if ( p->left ) + return InsertReadTree( p->left, id, l, r ) ; + else + { + p->left = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; + strcpy( p->left->id, id ) ; + p->left->leftAnchor = l ; + p->left->rightAnchor = r ; + //p->left->secondary = secondary ; + p->left->editDistance = editDistance ; + p->left->left = p->left->right = NULL ; + p->left->valid = validRead ; + p->left->cnt = 1 ; + p->left->NH = NH ; + p->left->flag = samFlag ; + return false ; + } + } + else + { + if ( p->right ) + return InsertReadTree( p->right, id, l, r ) ; + else + { + p->right = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; + strcpy( p->right->id, id ) ; + p->right->leftAnchor = l ; + p->right->rightAnchor = r ; + //p->right->secondary = secondary ; + p->right->editDistance = editDistance ; + p->right->left = p->right->right = 
NULL ; + p->right->valid = validRead ; + p->right->cnt = 1 ; + p->right->NH = NH ; + p->right->flag = samFlag ; + return false ; + } + } +} + + +bool SearchContradictedReads( struct _readTree *p, char *id, int pos ) +{ + if ( p == NULL ) + return false ; + int tmp = strcmp( p->id, id ) ; + if ( tmp == 0 && p->pos == pos ) + return true ; + else if ( tmp <= 0 ) + return SearchContradictedReads( p->left, id, pos ) ; + else + return SearchContradictedReads( p->right, id, pos ) ; +} + +bool InsertContradictedReads( struct _readTree *p, char *id, int pos ) +{ + if ( p == NULL ) + { + contradictedReads = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; + strcpy( contradictedReads->id, id ) ; + contradictedReads->pos = pos ; + contradictedReads->left = contradictedReads->right = NULL ; + return false ; + } + int tmp = strcmp( p->id, id ) ; + if ( tmp == 0 && p->pos == pos ) + return true ; + else if ( tmp <= 0 ) + { + if ( p->left ) + return InsertContradictedReads( p->left, id, pos ) ; + else + { + p->left = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; + strcpy( p->left->id, id ) ; + p->left->pos = pos ; + p->left->left = p->left->right = NULL ; + return false ; + } + } + else + { + if ( p->right ) + return InsertContradictedReads( p->right, id, pos ) ; + else + { + p->right = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; + strcpy( p->right->id, id ) ; + p->right->pos = pos ; + p->right->left = p->right->right = NULL ; + return false ; + } + } +} + +struct _readTree *GetReadTreeNode( struct _readTree *p, char *id ) +{ + if ( p == NULL ) + return NULL ; + int tmp = strcmp( p->id, id ) ; + if ( tmp == 0 ) + return p ; + else if ( tmp < 0 ) + return GetReadTreeNode( p->left, id ) ; + else + return GetReadTreeNode( p->right, id ) ; +} + +// Insert the new junction into the queue, +// and make sure the queue is sorted. +// Assume each junction range is only on one strand. 
+// l, r is the left and right anchor from a read +void InsertQueue( int start, int end, int l, int r ) +{ + int i, j ; + i = qTail ; + + + while ( i != qHead ) + { + j = i - 1 ; + if ( j < 0 ) + j = QUEUE_SIZE - 1 ; + + if ( junctionQueue[j].start < start + || ( junctionQueue[j].start == start && junctionQueue[j].end < end ) ) + break ; + + junctionQueue[i] = junctionQueue[j] ; + --i ; + + if ( i < 0 ) + i = QUEUE_SIZE - 1 ; + } + + junctionQueue[i].start = start ; + junctionQueue[i].end = end ; + /*if ( !secondary ) + { + junctionQueue[i].readCnt = 1 ; + junctionQueue[i].secReadCnt = 0 ; + } + else + { + junctionQueue[i].readCnt = 0 ; + junctionQueue[i].secReadCnt = 1 ; + }*/ + junctionQueue[i].strand = strand ; + junctionQueue[i].leftAnchor = l ; + junctionQueue[i].rightAnchor = r ; + + strcpy( junctionQueue[i].head.id, col[0] ) ; + junctionQueue[i].head.valid = validRead ; + junctionQueue[i].head.leftAnchor = l ; + junctionQueue[i].head.rightAnchor = r ; + junctionQueue[i].head.left = junctionQueue[i].head.right = NULL ; + //junctionQueue[i].head.secondary = secondary ; + junctionQueue[i].head.NH = NH ; + junctionQueue[i].head.cnt = 1 ; + junctionQueue[i].head.editDistance = editDistance ; + + ++qTail ; + if ( qTail >= QUEUE_SIZE ) + qTail = 0 ; +} + +// Search the queue, and remove the heads if its end(start) is smaller than prune.(Because it is impossible to have that junction again) +// Add the junction to the queue, if it is not in it. +// Return true, if it finds the junction. Otherwise, return false. +bool SearchQueue( int start, int end, int prune, int l, int r ) +{ + int i ; + + // Test whether this read might be a false alignment. 
+ i = qHead ; + while ( i != qTail ) + { + struct _readTree *rt ; + if ( ( junctionQueue[i].start == start && junctionQueue[i].end < end ) || + ( junctionQueue[i].start > start && junctionQueue[i].end == end ) ) + { + // This alignment is false ; + rt = GetReadTreeNode( &junctionQueue[i].head, col[0] ) ; + // the commented out logic because it is handled by contradicted reads + if ( rt != NULL ) //&& ( rt->flag & 0x40 ) != ( samFlag & 0x40 ) ) + { + if ( rt->leftAnchor <= flank || rt->rightAnchor <= flank )//|| rt->secondary ) + { + if ( l > flank && r > flank )//&& !secondary ) + rt->valid = false ; + else + validRead = false ; + } + else + { + // Ignore this read + //return true ; + validRead = false ; + } + } + } + else if ( ( junctionQueue[i].start == start && junctionQueue[i].end > end ) || + ( junctionQueue[i].start < start && junctionQueue[i].end == end ) ) + { + // This other alignment is false ; + rt = GetReadTreeNode( &junctionQueue[i].head, col[0] ) ; + //if ( rt != NULL ) + if ( rt != NULL ) //&& ( rt->flag & 0x40 ) != ( samFlag & 0x40 ) ) + { + if ( l <= flank || r <= flank )//|| secondary ) + { + if ( rt->leftAnchor > flank && rt->rightAnchor > flank )//&& !rt->secondary ) + validRead = false ; + else + rt->valid = false ; + } + else + rt->valid = false ; + } + } + ++i ; + if ( i >= QUEUE_SIZE ) + i = 0 ; + } + + //if ( start == 8077108 && end == 8078439 ) + // exit( 1 ) ; + i = qHead ; + + while ( i != qTail ) + { + if ( junctionQueue[i].start == start && + junctionQueue[i].end == end ) + { + if (junctionQueue[i].strand == '?' 
&& junctionQueue[i].strand != strand ) + { + junctionQueue[i].strand = strand ; + } + InsertReadTree( &junctionQueue[i].head, col[0], l, r ) ; + return true ; + } + + if ( junctionQueue[i].end < prune && i == qHead ) + { + // pop + PrintJunction( col[2], junctionQueue[i] ) ; + ClearReadTree( junctionQueue[i].head.left ) ; + ClearReadTree( junctionQueue[i].head.right ) ; + ++qHead ; + if ( qHead >= QUEUE_SIZE ) + qHead = 0 ; + } + ++i ; + if ( i >= QUEUE_SIZE ) + i = 0 ; + } + + InsertQueue( start, end, l, r ) ; + return false ; +} + +// Compute the junctions based on the CIGAR +bool CompareJunctions( int startLocation, char *cigar ) +{ + int currentLocation = startLocation ; // Current location on the reference genome + int i, j ; + int num ; + int newJuncCnt = 0 ; // The # of junctions in the read, and the # of new junctions among them. + + struct _cigarSeg cigarSeg[1000] ; // A segment of the cigar. + int ccnt = 0 ; // cigarSeg cnt + + j = 0 ; + num = 0 ; + validRead = true ; + + for ( i = 0 ; cigar[i] ; ++i ) + { + if ( cigar[i] >= '0' && cigar[i] <= '9' ) + { + num = num * 10 + cigar[i] - '0' ; + } + else + { + cigarSeg[ccnt].len = num ; + cigarSeg[ccnt].type = cigar[i] ; + ++ccnt ; + num = 0 ; + } + } + + // Filter low complex alignment. + // Only applies this to alignments does not have a strand information. + if ( strand == '?' 
) + { + if ( noncanonStrandInfo != -1 ) + { + if ( ( noncanonStrandInfo & filterYS ) != 0 ) + validRead = false ; + } + else + { + int softStart = -1 ; + int softEnd = 0 ; + if ( cigarSeg[0].type == 'S' ) + softStart = cigarSeg[0].len ; + if ( cigarSeg[ ccnt - 1 ].type == 'S' ) + softEnd = cigarSeg[ ccnt - 1 ].len ; + int readLen = strlen( col[9] ) ; + int count[5] = { 0, 0, 0, 0, 0 } ; + + int pos = 0 ; + for ( i = 0 ; i < ccnt ; ++i ) + { + switch ( cigarSeg[i].type ) + { + case 'S': + pos += cigarSeg[i].len ; + case 'M': + case 'I': + { + for ( j = 0 ; j < cigarSeg[i].len ; ++j ) + ++count[ nucToNum[ col[9][pos + j] - 'A' ] ] ; + pos += j ; + } break ; + case 'N': + { + int max = 0 ; + int sum = 0 ; + for ( j = 0 ; j < 5 ; ++j ) + { + if ( count[j] > max ) + max = count[j] ; + sum += count[j] ; + } + if ( max > 0.8 * sum ) + validRead = false ; + count[0] = count[1] = count[2] = count[3] = count[4] = 0 ; + } break ; + case 'H': + case 'P': + case 'D': + default: break ; + } + } + + int max = 0 ; + int sum = 0 ; + for ( j = 0 ; j < 5 ; ++j ) + { + if ( count[j] > max ) + max = count[j] ; + sum += count[j] ; + } + if ( max > 0.8 * sum ) + validRead = false ; + /* count[0] = count[1] = count[2] = count[3] = count[4] = 0 ; + + + for ( i = softStart + 1 ; i < readLen - softEnd ; ++i ) + { + switch ( col[9][i] ) + { + case 'A': ++count[0] ; break ; + case 'C': ++count[1] ; break ; + case 'G': ++count[2] ; break ; + case 'T': ++count[3] ; break ; + default: ++count[4] ; + } + } + int max = 0 ; + for ( j = 0 ; j < 5 ; ++j ) + if ( count[j] > max ) + max = count[j] ; + if ( max > 0.6 * ( readLen - softEnd - softStart - 1 ) ) + validRead = false ;*/ + } + } + + // Test whether contradict with mate pair + if ( col[6][0] == '=' ) + { + currentLocation = startLocation ; + for ( i = 0 ; i < ccnt ; ++i ) + { + if ( cigarSeg[i].type == 'I' || cigarSeg[i].type == 'S' || cigarSeg[i].type == 'H' + || cigarSeg[i].type == 'P' ) + continue ; + else if ( cigarSeg[i].type == 'N' ) + { 
+ if ( mateStart >= currentLocation && mateStart <= currentLocation + cigarSeg[i].len - 1 ) + { + /*if ( cigarSeg[i].len == 91486 ) + { + printf( "%s %d %d\n", currentLocation, mateStart ) ; + exit( 1 ) ; + }*/ + // ignore this read + //return false ; + InsertContradictedReads( contradictedReads, col[0], mateStart ) ; + validRead = false ; + break ; + } + } + currentLocation += cigarSeg[i].len ; + } + + i = qHead ; + // Search if it falls in the splices junction created by its mate. + while ( i != qTail ) + { + if ( mateStart < junctionQueue[i].start && junctionQueue[i].start <= startLocation + && startLocation <= junctionQueue[i].end + && SearchContradictedReads( contradictedReads, col[0], startLocation ) ) + { + validRead = false ; + break ; + } + ++i ; + if ( i >= QUEUE_SIZE ) + i = 0 ; + } + } + + currentLocation = startLocation ; + for ( i = 0 ; i < ccnt ; ++i ) + { + if ( cigarSeg[i].type == 'I' || cigarSeg[i].type == 'S' || cigarSeg[i].type == 'H' + || cigarSeg[i].type == 'P' ) + continue ; + else if ( cigarSeg[i].type == 'N' ) + { + int left, right ; + int tmp ; + tmp = i ; + while ( i > 0 && ( cigarSeg[i - 1].type == 'I' || cigarSeg[i - 1].type == 'D' ) ) + --i ; + if ( i > 0 && cigarSeg[i - 1].type == 'M' ) + { + left = cigarSeg[i - 1].len ; + if ( i >= 2 && cigarSeg[i - 2].type == 'N' && left <= flank ) + { + left = flank + 1 ; + } + } + else + left = 0 ; + + i = tmp ; + while ( i < ccnt && ( cigarSeg[i + 1].type == 'I' || cigarSeg[i + 1].type == 'D' ) ) + ++i ; + if ( i < ccnt && cigarSeg[i + 1].type == 'M' ) + { + right = cigarSeg[i + 1].len ; + if ( i + 2 < ccnt && cigarSeg[i + 2].type == 'N' && right <= flank ) + { + right = flank + 1 ; + } + } + else + right = 0 ; + i = tmp ; + if ( !SearchQueue( currentLocation, currentLocation + cigarSeg[i].len - 1, startLocation, left, right ) ) + { + ++newJuncCnt ; + } + } + currentLocation += cigarSeg[i].len ; + } + /*else if ( cigar[i] == 'I' ) + { + num = 0 ; + } + else if ( cigar[i] == 'N' ) + { + if ( 
!SearchQueue( currentLocation, currentLocation + num - 1, startLocation ) ) + { + ++newJuncCnt ; + //if ( flagPrintJunction ) + // printf( "%s %d %d\n", col[2], currentLocation - 2, currentLocation + len - 1 ) ; + } + currentLocation += num ; + num = 0 ; + } + else // Other operations, like M, D,..., + { + currentLocation += num ; + num = 0 ; + }*/ + + junctionCnt += newJuncCnt ; + + if ( newJuncCnt ) + return true ; + else + return false ; +} + +void cigar2string( bam1_core_t *c, uint32_t *in_cigar, char *out_cigar ) +{ + int k, op, l ; + char opcode ; + + *out_cigar = '\0' ; + for ( k = 0 ; k < c->n_cigar ; ++k ) + { + op = in_cigar[k] & BAM_CIGAR_MASK ; + l = in_cigar[k] >> BAM_CIGAR_SHIFT ; + switch (op) + { + case BAM_CMATCH: opcode = 'M' ; break ; + case BAM_CINS: opcode = 'I' ; break ; + case BAM_CDEL: opcode = 'D' ; break ; + case BAM_CREF_SKIP: opcode = 'N' ; break ; + case BAM_CSOFT_CLIP: opcode = 'S' ; break ; + case BAM_CHARD_CLIP: opcode = 'H' ; break ; + case BAM_CPAD: opcode = 'P' ; break ; + } + sprintf( out_cigar + strlen( out_cigar ), "%d%c", l, opcode ) ; + } +} + + + +int main( int argc, char *argv[] ) +{ + FILE *fp ; + samfile_t *fpsam ; + bam1_t *b = NULL ; + bool useSam = true ; + + int i, len ; + int startLocation ; + bool flagRemove = false ; + char prevChrome[103] ; + + anchorBoth = false ; + + flagPrintJunction = false ; + flagPrintAll = false ; + flagStrict = false ; + junctionCnt = 0 ; + bool hasMateReadIdSuffix = false ; + + strcpy( prevChrome, "" ) ; + flagRemove = true ; + flagPrintJunction = true ; + flank = 8 ; + filterYS = 4 ; + + contradictedReads = NULL ; + + // processing the argument list + for ( i = 1 ; i < argc ; ++i ) + { + if ( !strcmp( argv[i], "-h" ) ) + { + PrintHelp() ; + return 0 ; + } + else if ( !strcmp( argv[i], "-r" ) ) + { + flagRemove = true ; + } + else if ( !strcmp( argv[i], "-j" ) ) + { + strcpy( prevChrome, "" ) ; + flagRemove = true ; + flagPrintJunction = true ; + flank = atoi( argv[i + 1] ) ; + if ( i + 2 
< argc && !strcmp( argv[i+2], "-B" ) ) + { + anchorBoth = true ; + ++i ; + } + ++i ; + } + else if ( !strcmp( argv[i], "-a" ) ) + { + flagPrintAll = true ; + } + else if ( !strcmp( argv[i], "--strict" ) ) + { + flagStrict = true ; + } + else if ( !strcmp( argv[i], "-y" ) ) + { + filterYS = atoi( argv[i + 1] ) ; + ++i ; + } + else if ( !strcmp( argv[i], "--hasMateIdSuffix" ) ) + { + hasMateReadIdSuffix = true ; + ++i ; + } + else if ( i > 1 ) + { + printf( "Unknown option %s\n", argv[i] ) ; + exit( 1 ) ; + } + } + if ( argc == 1 ) + { + PrintHelp() ; + return 0 ; + } + + len = strlen( argv[1] ) ; + if ( argv[1][len-3] == 'b' || argv[1][len-3] == 'B' ) + { + if ( !( fpsam = samopen( argv[1], "rb", 0 ) ) ) + return 0 ; + + if ( !fpsam->header ) + { + //samclose( fpsam ) ; + //fpsam = samopen( argv[1], "r", 0 ) ; + //if ( !fpsam->header ) + //{ + useSam = false ; + fp = fopen( argv[1], "r" ) ; + //} + } + } + else + { + useSam = false ; + fp = NULL ; + if ( !strcmp( argv[1], "-" ) ) + fp = stdin ; + else + fp = fopen( argv[1], "r" ) ; + if ( fp == NULL ) + { + printf( "Could not open file %s\n", argv[1] ) ; + return 0 ; + } + } + + while ( 1 ) + { + int flag = 0 ; + if ( useSam ) + { + if ( b ) + bam_destroy1( b ) ; + b = bam_init1() ; + if ( samread( fpsam, b ) <= 0 ) + break ; + if ( b->core.tid >= 0 ) + strcpy( col[2], fpsam->header->target_name[b->core.tid] ) ; + else + strcpy( col[2], "-1" ) ; + cigar2string( &(b->core), bam1_cigar( b ), col[5] ) ; + strcpy( col[0], bam1_qname( b ) ) ; + flag = b->core.flag ; + if ( bam_aux_get( b, "NH" ) ) + { + /*if ( bam_aux2i( bam_aux_get(b, "NH" ) ) >= 2 ) + { + secondary = true ; + } + else + secondary = false ;*/ + + NH = bam_aux2i( bam_aux_get( b, "NH" ) ) ; + } + else + { + //secondary = false ; + NH = 1 ; + } + + if ( bam_aux_get( b, "NM" ) ) + { + editDistance = bam_aux2i( bam_aux_get( b, "NM" ) ) ; + } + else if ( bam_aux_get( b, "nM" ) ) + { + editDistance = bam_aux2i( bam_aux_get( b, "nM" ) ) ; + } + else + 
editDistance = 0 ; + + mateStart = b->core.mpos + 1 ; + if ( b->core.mtid == b->core.tid ) + col[6][0] = '=' ; + else + col[6][0] = '*' ; + + if ( b->core.l_qseq < 20 ) + continue ; + + for ( i = 0 ; i < b->core.l_qseq ; ++i ) + { + int bit = bam1_seqi( bam1_seq( b ), i ) ; + switch ( bit ) + { + case 1: col[9][i] = 'A' ; break ; + case 2: col[9][i] = 'C' ; break ; + case 4: col[9][i] = 'G' ; break ; + case 8: col[9][i] = 'T' ; break ; + case 15: col[9][i] = 'N' ; break ; + default: col[9][i] = 'A' ; break ; + } + } + col[9][i] = '\0' ; + + /*if ( flag & 0x100 ) + secondary = true ; + else + secondary = false ;*/ + } + else + { + char *p ; + if ( !fgets( line, LINE_SIZE, fp ) ) + break ; + if ( line[0] == '\0' || line[0] == '@' ) + continue ; + sscanf( line, "%s%s%s%s%s%s%s%s%s%s%s", col, col + 1, col + 2, col + 3, col + 4, + col + 5, col + 6, col + 7, col + 8, col + 9, col + 10 ) ; + + flag = atoi( col[1] ) ; + if ( p = strstr( line, "NH" ) ) + { + int k = 0 ; + p += 5 ; + while ( *p != ' ' && *p != '\t' && *p != '\0') + { + k = k * 10 + *p - '0' ; + ++p ; + } + /*if ( k >= 2 ) + { + secondary = true ; + } + else + secondary = false ;*/ + NH = k ; + } + else + { + //secondary = false ; + NH = 1 ; + } + if ( ( p = strstr( line, "NM" ) ) || ( p = strstr( line, "nM" ) ) ) + { + int k = 0 ; + p += 5 ; + while ( *p != ' ' && *p != '\t' && *p != '\0') + { + k = k * 10 + *p - '0' ; + ++p ; + } + editDistance = k ; + } + else + editDistance = 0 ; + + mateStart = atoi( col[7] ) ; + /*if ( flag & 0x100 ) + secondary = true ; + else + secondary = false ;*/ + + } + samFlag = flag ; + for ( i = 0 ; col[5][i] ; ++i ) + if ( col[5][i] == 'N' ) + break ; + + if ( !col[5][i] ) + continue ; + + // remove .1, .2 or /1, /2 suffix + if ( hasMateReadIdSuffix ) + { + char *s = col[0] ; + int len = strlen( s ) ; + if ( len >= 2 && ( s[len - 1] == '1' || s[len - 1] == '2' ) + && ( s[len - 2] == '.' 
|| s[len - 2] == '/' ) ) + { + s[len - 2] = '\0' ; + } + } + + if ( useSam ) + { + if ( bam_aux_get( b, "XS" ) ) + { + strand = bam_aux2A( bam_aux_get( b, "XS" ) ) ; + if ( bam_aux_get( b, "YS" ) ) + { + noncanonStrandInfo = bam_aux2i( bam_aux_get( b, "YS" ) ) ; + } + else + { + noncanonStrandInfo = -1 ; + } + } + else + { + strand = '?' ; + noncanonStrandInfo = -1 ; + } + startLocation = b->core.pos + 1 ; + } + else + { + if ( strstr( line, "XS:A:-" ) ) // on negative strand + strand = '-' ; + else if ( strstr( line, "XS:A:+" ) ) + strand = '+' ; + else + { + strand = '?' ; + char *p = strstr( line, "YS:i:" ) ; + if ( p != NULL ) + { + p += 5 ; + noncanonStrandInfo = atoi( p ) ; + } + else + noncanonStrandInfo = -1 ; + } + startLocation = atoi( col[3] ) ; + } + + // Found the junctions from the read. + if ( strcmp( prevChrome, col[2] ) ) + { + if ( flagPrintJunction ) + { + // Print the remaining elements in the queue + i = qHead ; + while ( i != qTail ) + { + PrintJunction( prevChrome, junctionQueue[i] ) ; + ++i ; + if ( i >= QUEUE_SIZE ) + i = 0 ; + } + } + + // new chromosome + ClearReadTree( contradictedReads ) ; + contradictedReads = NULL ; + qHead = qTail = 0 ; + strcpy( prevChrome, col[2] ) ; + } + + if ( flagRemove ) + { + if ( CompareJunctions( startLocation, col[5] ) ) + { + // Test whether this read has new junctions + //++junctionCnt ; + if ( !flagPrintJunction ) + printf( "%s", line ) ; + } + } + else + { + ++junctionCnt ; + printf( "%s", line ) ; + } + //printf( "hi2 %s\n", col[0] ) ; + } + + if ( flagPrintJunction ) + { + // Print the remaining elements in the queue + i = qHead ; + while ( i != qTail ) + { + PrintJunction( prevChrome, junctionQueue[i] ) ; + ++i ; + if ( i >= QUEUE_SIZE ) + i = 0 ; + } + } + + //fprintf( stderr, "The number of junctions: %d\n", junctionCnt ) ; + return 0 ; +} + diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/GetTrustedSplice.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/PsiCLASS-1.0.2/GetTrustedSplice.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,429 @@ +#include +#include +#include +#include + +#include "alignments.hpp" + +#define MAX(x, y) (((x)<(y))?(y):(x)) +#define MIN(x, y) (((x)<(y))?(x):(y)) + +char usage[] = "Usage: ./trust-splice splice_file_list one_bam_file [OPTIONS]\n" + "Options:\n" + "\t-a FLOAT: average number of supported reads from the samples (default: 0.5)\n" ; + +struct _intron +{ + int chrId ; + int start, end ; + double support ; + int uniqSupport ; + int secSupport ; + char strand ; + int editDist ; + + int sampleSupport ; +} ; + +struct _site +{ + int chrId ; + int pos ; + double support ; + + char strand ; + int associatedIntronCnt ; +} ; + +bool CompIntrons( struct _intron a, struct _intron b ) +{ + if ( a.chrId != b.chrId ) + return a.chrId < b.chrId ; + else if ( a.start != b.start ) + return a.start < b.start ; + else if ( a.end != b.end ) + return a.end < b.end ; + return false ; +} + +bool CompSites( struct _site a, struct _site b ) +{ + if ( a.chrId != b.chrId ) + return a.chrId < b.chrId ; + else if ( a.pos != b.pos ) + return a.pos < b.pos ; + return false ; +} + +void CoalesceIntrons( std::vector &introns ) +{ + std::sort( introns.begin(), introns.end(), CompIntrons ) ; + int size = introns.size() ; + int i, k ; + k = 0 ; + for ( i = 1 ; i < size ; ++i ) + { + if ( introns[i].chrId == introns[k].chrId && introns[i].start == introns[k].start + && introns[i].end == introns[k].end ) + { + introns[k].support += introns[i].support ; + introns[k].uniqSupport += introns[i].uniqSupport ; + introns[k].secSupport += introns[i].secSupport ; + introns[k].sampleSupport += introns[i].sampleSupport ; + + if ( introns[k].strand == '?' 
) + introns[k].strand = introns[i].strand ; + } + else + { + ++k ; + introns[k] = introns[i] ; + } + } + introns.resize( k + 1 ) ; +} + +void CoalesceSites( std::vector &sites ) +{ + std::sort( sites.begin(), sites.end(), CompSites ) ; + int size = sites.size() ; + int i, k ; + k = 0 ; + for ( i = 1 ; i < size ; ++i ) + { + if ( sites[i].chrId == sites[k].chrId && sites[i].pos == sites[k].pos ) + { + sites[k].support += sites[i].support ; + sites[k].associatedIntronCnt += sites[i].associatedIntronCnt ; + } + else + { + ++k ; + sites[k] = sites[i] ; + } + } + sites.resize( k + 1 ) ; +} + +int main( int argc, char *argv[] ) +{ + int i, j, k ; + FILE *fpList ; + FILE *fp ; + char spliceFile[2048] ; + char chrName[1024], strand[3] ; + int start, end, uniqSupport, secSupport, uniqEditDistance, secEditDistance ; + double support ; + Alignments alignments ; + std::vector introns ; + std::vector sites ; + double averageSupportThreshold = 0.5 ; + + if ( argc <= 1 ) + { + printf( "%s", usage ) ; + exit( 1 ) ; + } + + for ( i = 3 ; i < argc ; ++i ) + { + if ( !strcmp( argv[ i ], "-a" ) ) + { + averageSupportThreshold = atof( argv[i + 1] ) ; + ++i ; + } + else + { + printf( "Unknown option: %s", argv[i] ) ; + exit( 1 ) ; + } + } + + alignments.Open( argv[2] ) ; + + // Get the number of samples. + fpList = fopen( argv[1], "r" ) ; + int sampleCnt = 0 ; + while ( fscanf( fpList, "%s", spliceFile ) != EOF ) + { + ++sampleCnt ; + } + fclose( fpList ) ; + + // Get all the introns. 
+ fpList = fopen( argv[1], "r" ) ; + if ( fpList == NULL ) + { + fprintf( stderr, "Could not open file %s\n", argv[1] ) ; + exit( 1 ) ; + } + // Field widths below keep the tokens within spliceFile[2048], chrName[1024] and strand[3]. + while ( fscanf( fpList, "%2047s", spliceFile ) != EOF ) + { + fp = fopen( spliceFile, "r" ) ; + if ( fp == NULL ) + { + fprintf( stderr, "Could not open file %s\n", spliceFile ) ; + exit( 1 ) ; + } + while ( fscanf( fp, "%1023s %d %d %lf %2s %d %d %d %d", chrName, &start, &end, &support, + strand, &uniqSupport, &secSupport, &uniqEditDistance, &secEditDistance ) != EOF ) + { + + if ( support <= 0 ) + support = 0.1 ; + else if ( support == 1 && sampleCnt > 5 ) + support = 0.75 ; + + struct _intron ni ; + ni.chrId = alignments.GetChromIdFromName( chrName ) ; + ni.start = start ; + ni.end = end ; + ni.support = support ; + ni.strand = strand[0] ; + ni.uniqSupport = uniqSupport ; + ni.secSupport = secSupport ; + ni.sampleSupport = 1 ; + ni.editDist = uniqEditDistance + secEditDistance ; + introns.push_back( ni ) ; + } + fclose( fp ) ; + + CoalesceIntrons( introns ) ; + } + fclose( fpList ) ; + + // Obtain the split sites. + int intronCnt = introns.size() ; + for ( i = 0 ; i < intronCnt ; ++i ) + { + struct _site ns ; + ns.chrId = introns[i].chrId ; + ns.associatedIntronCnt = 1 ; + ns.support = introns[i].support ; + ns.strand = introns[i].strand ; + + ns.pos = introns[i].start ; + sites.push_back( ns ) ; + ns.pos = introns[i].end ; + sites.push_back( ns ) ; + } + CoalesceSites( sites ) ; + + // Get the chromosomes with too many split sites. + int siteCnt = sites.size() ; + std::vector<bool> badChrom ; + + badChrom.resize( alignments.GetChromCount() ) ; + int size = alignments.GetChromCount() ; + for ( i = 0 ; i < size ; ++i ) + badChrom[i] = false ; + + for ( i = 0 ; i < siteCnt ; ) + { + // j stops at the end of the array as well as at a chromosome change. + for ( j = i + 1 ; j < siteCnt && sites[j].chrId == sites[i].chrId ; ++j ) + ; + //printf( "%s %d %d:\n", alignments.GetChromName( sites[i].chrId ), alignments.GetChromLength( sites[i].chrId ), j - i ) ; + if ( ( j - i ) * 20 > alignments.GetChromLength( sites[i].chrId ) ) + badChrom[ sites[i].chrId ] = true ; + i = j ; + } + + // Output the valid introns. 
+ k = 0 ; + // NOTE(review): sampleCnt / 50 is an integer division, so unit is a whole number - confirm this is intended. + double unit = sampleCnt / 50 ; + if ( unit < 1 ) + unit = 1 ; + + int longIntronSize ; + std::vector<int> intronSizes ; + for (i = 0 ; i < intronCnt ; ++i) + intronSizes.push_back( introns[i].end - introns[i].start + 1 ) ; + std::sort( intronSizes.begin(), intronSizes.end() ) ; + // 99th percentile of the intron sizes; with no introns there is nothing to index. + longIntronSize = intronSizes.empty() ? 0 : intronSizes[ int(intronCnt * 0.99) ] ; + if ( longIntronSize > 100000 ) + longIntronSize = 100000 ; + for ( i = 0 ; i < intronCnt ; ++i ) + { + if ( introns[i].support / sampleCnt < averageSupportThreshold ) + continue ; + + if ( badChrom[ introns[i].chrId ] ) + { + if ( introns[i].sampleSupport <= sampleCnt / 2 ) + continue ; + } + + if ( introns[i].uniqSupport <= 2 && ( introns[i].support / sampleCnt < 1 || introns[i].sampleSupport < MIN( 2, sampleCnt ) ) ) + continue ; + + //Locate the two split sites. + while ( sites[k].chrId < introns[i].chrId || ( sites[k].chrId == introns[i].chrId && sites[k].pos < introns[i].start ) ) + { + ++k ; + } + int a, b ; + a = k ; + for ( b = a + 1 ; b < siteCnt ; ++b ) + { + if ( sites[b].chrId == introns[i].chrId && sites[b].pos == introns[i].end ) + break ; + } + // No separate end site found (e.g. start equals end): fall back to a instead of reading past the array. + if ( b >= siteCnt ) + b = a ; + double siteSupport = MAX( sites[a].support, sites[b].support ) ; + //if ( introns[i].start == 100233371 && introns[i].end == 100236850 ) + // printf( "test: %lf %lf\n", introns[i].support, siteSupport) ; + if ( introns[i].support < siteSupport / 10.0 ) + { + double needSample = MIN( ( -log( introns[i].support / siteSupport ) / log( 10.0 ) + 1 ) * unit, sampleCnt ) ; + if ( introns[i].sampleSupport < needSample ) + continue ; + } + + if ( sampleCnt >= 100 ) //&& introns[i].end - introns[i].start + 1 >= 50000 ) + { + if ( introns[i].sampleSupport <= sampleCnt * 0.01 ) + continue ; + } + /*if ( sampleCnt >= 50 ) + { + // just some randomly intron. 
+ if ( introns[i].sampleSupport == 1 + && ( sites[a].associatedIntronCnt == 1 || sites[b].associatedIntronCnt == 1 ) ) + continue ; + }*/ + + /*if ( introns[i].end - introns[i].start + 1 < 50 ) + { + int needSample = MIN( ( 5 - ( introns[i].end - introns[i].start + 1 ) / 10 ) * unit, sampleCnt ) ; + int flag = 0 ; + + if ( introns[i].sampleSupport < needSample ) + flag = 1 ; + if ( flag == 1 ) + continue ; + }*/ + if ( introns[i].strand == '?' && sampleCnt == 1 && + ( introns[i].support < 5 || introns[i].uniqSupport == 0 || introns[i].support < 2 * introns[i].editDist ) ) + continue ; + + // Since the strand is uncertain, the alinger may make different decision sample from sample. + // To keep this intron, one of its splice sites must be more supported than adjacent splice sites. + if ( introns[i].strand == '?' && introns[i].sampleSupport <= 0.5 * sampleCnt ) + { + int s, e ; + int l ; + int cnt = 0 ; + for ( l = 0 ; l < 2 ; ++l ) + { + int ind = ( l == 0 ) ? a : b ; + double max = sites[ind].support ; + int maxTag = ind ; + for ( s = ind - 1 ; s >= 0 && sites[s].chrId == sites[ind].chrId ; --s ) + { + if ( sites[s].pos + 7 < sites[s + 1].pos ) + break ; + if ( sites[s].support >= max ) + { + max = sites[s].support ; + maxTag = s ; + } + } + + for ( e = ind + 1 ; e < siteCnt && sites[e].chrId == sites[ind].chrId ; ++e ) + { + if ( sites[e].pos - 7 > sites[e - 1].pos ) + break ; + if ( sites[e].support >= max ) + { + max = sites[e].support ; + maxTag = e ; + } + } + + if ( maxTag == ind ) + ++cnt ; + } + if ( cnt == 0 ) + continue ; + } + + // Test whether this a intron coming from a wrong strand + /*if ( b - a + 1 >= 10 && introns[i].strand != '?' && introns[i].sampleSupport <= 0.5 * sampleCnt ) + { + int plusStrand = 0 ; + int minusStrand = 0 ; + int l ; + int s, e ; + + if ( introns[i].strand == '+' ) + plusStrand = 2 ; + else + minusStrand = 2 ; + + for ( l = 0 ; l < 2 ; ++l ) + { + int ind = ( l == 0 ) ? 
a : b ; + for ( s = ind - 1 ; s >= 0 && sites[s].chrId == sites[ind].chrId ; --s ) + { + if ( sites[s].pos + 10000 < sites[s + 1].pos ) + break ; + + if ( sites[s].strand == '+' ) + ++plusStrand ; + else if ( sites[s].strand == '-' ) + ++minusStrand ; + } + + for ( e = ind + 1 ; e < siteCnt && sites[e].chrId == sites[ind].chrId ; ++e ) + { + if ( sites[e].pos - 10000 > sites[e - 1].pos ) + break ; + + if ( sites[e].strand == '+' ) + ++plusStrand ; + else if ( sites[e].strand == '-' ) + ++minusStrand ; + } + } + + if ( introns[i].start == 161517978 ) + printf( "capture: %d %d %d %d\n", a, b, minusStrand, plusStrand) ; + + if ( introns[i].strand == '+' && minusStrand >= 20 && plusStrand == 2 ) + continue ; + else if ( introns[i].strand == '-' && plusStrand >= 20 && minusStrand == 2 ) + continue ; + + }*/ + + // Filter a intron if one of its splice site associated with too many introns. + /*if ( sites[a].associatedIntronCnt >= 10 ) + { + int needSample = MIN( sites[a].associatedIntronCnt / 100 * sampleCnt, sampleCnt ) ; + if ( introns[i].support < sites[a].support / 100 && + introns[i].sampleSupport < needSample ) + { + continue ; + } + } + if ( sites[b].associatedIntronCnt >= 10 ) + { + int needSample = MIN( sites[b].associatedIntronCnt / 100 * sampleCnt, sampleCnt ) ; + if ( introns[i].support < sites[b].support / 100 && + introns[i].sampleSupport < needSample ) + { + continue ; + } + }*/ + + + // Test for long intron + if ( introns[i].end - introns[i].start + 1 >= longIntronSize ) + { + int needSample = MIN( ( ( introns[i].end - introns[i].start + 1 ) / longIntronSize + 1 ) * unit, sampleCnt ) ; + int flag = 0 ; + if ( (double)introns[i].uniqSupport / ( introns[i].uniqSupport + introns[i].secSupport ) < 0.1 + || introns[i].uniqSupport / sampleCnt < 1 + || introns[i].sampleSupport < needSample ) + flag = 1 ; + if ( flag == 1 && introns[i].end - introns[i].start + 1 >= 3 * longIntronSize ) + continue ; + if ( flag == 1 && ( sites[a].associatedIntronCnt > 1 || 
sites[b].associatedIntronCnt > 1 || introns[i].sampleSupport <= 1 ) ) // an intron may connect two genes + continue ; + } + + + printf( "%s %d %d 10 %c 10 0 0 0\n", alignments.GetChromName( introns[i].chrId ), introns[i].start, introns[i].end, introns[i].strand ) ; + } + + return 0 ; +} diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/GetTrustedSplice.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/GetTrustedSplice.pl Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,125 @@ +#!/bin/perl + +use strict ; +use warnings ; +use List::Util qw[min max]; + +die "usage: a.pl path_to_list_of_splice_file > trusted.splice\n" if ( @ARGV == 0 ) ; + +my %spliceSupport ; +my %spliceSampleSupport ; +my %spliceUniqSupport ; +my %spliceSecSupport ; +my %uniqSpliceSites ; +my %spliceSiteSupport ; + +my $sampleCnt = 0 ; +open FP1, $ARGV[0] ; +while ( ) +{ + ++$sampleCnt ; +} +close FP1 ; + +open FP1, $ARGV[0] ; +while ( ) +{ + chomp ; + open FP2, $_ ; + while ( ) + { + chomp ; + my $line = $_ ; + my @cols = split /\s+/, $line ; + my $key = $cols[0]." ".$cols[1]." ".$cols[2]." ".$cols[4] ; + if ( $cols[3] <= 0 ) + { + $cols[3] = 0.1 ; + } + elsif ( $cols[3] == 1 && $sampleCnt > 5 ) + { + $cols[3] = 0.75 ; + } + + if ( ! defined $spliceSupport{$key} ) + { + $spliceSupport{ $key } = $cols[3] ; + $spliceSampleSupport{ $key } = 1 ; + $spliceUniqSupport{ $key } = $cols[5] ; + $spliceSecSupport{ $key } = $cols[6] ; + } + else + { + $spliceSupport{ $key } += $cols[3] ; + $spliceSampleSupport{ $key } += 1 ; + $spliceUniqSupport{ $key } += $cols[5] ; + $spliceSecSupport{ $key } += $cols[6] ; + } + + for ( my $i = 1 ; $i <=2 ; ++$i ) + { + $key = $cols[0]." 
".$cols[$i] ; + if ( defined $spliceSiteSupport{ $key } ) + { + $spliceSiteSupport{ $key } += $cols[3] ; + } + else + { + $spliceSiteSupport{ $key } = $cols[3] ; + } + + if ( defined $uniqSpliceSites{ $key } && $uniqSpliceSites{ $key } != $cols[2 - $i + 1] ) + { + $uniqSpliceSites{ $key } = -1 ; + } + else + { + $uniqSpliceSites{ $key } = $cols[2 - $i + 1] ; + } + } + } + close FP2 ; +} +close FP1 ; + +foreach my $key (keys %spliceSupport) +{ + next if ( $spliceSupport{ $key } / $sampleCnt < 0.5 ) ; + #next if ( $spliceUniqSupport{$key} / ( $spliceSecSupport{$key} + $spliceUniqSupport{$key} ) < 0.01 ) ; + next if ( $spliceUniqSupport{$key} <= 2 && ( $spliceSupport{ $key } / $sampleCnt < 1 || $spliceSampleSupport{$key} < min( 2, $sampleCnt ) ) ) ; + + my @cols = split /\s+/, $key ; + my $flag = 0 ; + #if ( $cols[2] - $cols[1] + 1 >= 10000 ) + #{ + # $flag = 1 if ( $spliceSupport{ $key } / $sampleCnt < 1 ) ; + #} + my $siteSupport = max( $spliceSiteSupport{ $cols[0]." ".$cols[1] }, $spliceSiteSupport{ $cols[0]." 
".$cols[2] } ) ; + + if ( $spliceSupport{ $key } < $siteSupport / 10.0 ) + { + #print $spliceSupport{ $key } / $siteSupport, " ", -log( $spliceSupport{ $key } / $siteSupport ) / log( 10.0 ), "\n" ; + #if ( $cols[1] == 73518141 && $cols[2] == 73518206 ) + #{ + # print "test: ", $spliceSupport{$key}, " $siteSupport ", -log( $spliceSupport{ $key } / $siteSupport ), "\n"; + #} + my $needSample = min( -log( $spliceSupport{ $key } / $siteSupport ) / log( 10.0 ) + 1, $sampleCnt ) ; + next if ( $spliceSampleSupport{ $key } < $needSample ) ; + } + + if ( $cols[2] - $cols[1] + 1 >= 100000 ) + { + my $needSample = int( ( $cols[2] - $cols[1] + 1 ) / 100000 ) + 1 ; + $needSample = $sampleCnt if ( $needSample > $sampleCnt ) ; + $flag = 1 if ( $spliceUniqSupport{$key} / ( $spliceSecSupport{$key} + $spliceUniqSupport{$key} ) < 0.1 + || ( $spliceUniqSupport{ $key } / $sampleCnt < 1 ) + || $spliceSampleSupport{ $key } < $needSample ) ; + next if ( $flag == 1 && $cols[2] - $cols[1] + 1 >= 300000 ) ; + } + if ( $flag == 1 && ( ( $uniqSpliceSites{ $cols[0]." ".$cols[1] } == -1 || $uniqSpliceSites{ $cols[0]." ".$cols[2] } == -1 ) + || $spliceSampleSupport{ $key } <= 1 ) ) + { + next ; + } + print $cols[0], " ", $cols[1], " ", $cols[2], " 10 ", $cols[3], " 10 0 0 0\n" ; +} diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/Makefile Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,70 @@ +CXX = g++ +CXXFLAGS= -Wall -O3 #-g #-std=c++11 #-Wall #-g +#CXXFLAGS= -Wall -g #-std=c++11 #-Wall #-g +LINKPATH= -I./samtools-0.1.19 -L./samtools-0.1.19 +LINKFLAGS = -lbam -lz -lm -lpthread +DEBUG= +OBJECTS = stats.o subexon-graph.o + +all: subexon-info combine-subexons classes vote-transcripts junc grader trust-splice add-genename addXS + +subexon-info: subexon-info.o $(OBJECTS) + if [ ! 
-f ./samtools-0.1.19/libbam.a ] ; \ + then \ + cd samtools-0.1.19 ; make ;\ + fi ; + $(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) $(OBJECTS) subexon-info.o $(LINKFLAGS) + +combine-subexons: combine-subexons.o $(OBJECTS) + $(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) $(OBJECTS) combine-subexons.o $(LINKFLAGS) + +classes: classes.o constraints.o transcript-decider.o $(OBJECTS) + $(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) $(OBJECTS) constraints.o transcript-decider.o classes.o $(LINKFLAGS) + +trust-splice: trust-splice.o + $(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) $(OBJECTS) trust-splice.o $(LINKFLAGS) + +vote-transcripts: vote-transcripts.o + $(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) $(OBJECTS) vote-transcripts.o $(LINKFLAGS) + +junc: junc.o + $(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) junc.o $(LINKFLAGS) + +grader: grader.o + $(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) grader.o $(LINKFLAGS) + +addXS: addXS.o + $(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) addXS.o $(LINKFLAGS) + +add-genename: add-genename.o + $(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) add-genename.o $(LINKFLAGS) + +subexon-info.o: SubexonInfo.cpp alignments.hpp blocks.hpp support.hpp defs.h stats.hpp + $(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $< $(LINKFLAGS) +combine-subexons.o: CombineSubexons.cpp alignments.hpp blocks.hpp support.hpp defs.h stats.hpp SubexonGraph.hpp + $(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $< $(LINKFLAGS) +stats.o: stats.cpp stats.hpp + $(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $< $(LINKFLAGS) +subexon-graph.o: SubexonGraph.cpp SubexonGraph.hpp + $(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $< $(LINKFLAGS) +constraints.o: Constraints.cpp Constraints.hpp SubexonGraph.hpp alignments.hpp BitTable.hpp + $(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $< $(LINKFLAGS) +transcript-decider.o: TranscriptDecider.cpp TranscriptDecider.hpp Constraints.hpp BitTable.hpp + $(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $< $(LINKFLAGS) +classes.o: classes.cpp SubexonGraph.hpp SubexonCorrelation.hpp BitTable.hpp Constraints.hpp alignments.hpp TranscriptDecider.hpp + $(CXX) -c 
-o $@ $(LINKPATH) $(CXXFLAGS) $< $(LINKFLAGS) +trust-splice.o: GetTrustedSplice.cpp alignments.hpp + $(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $< $(LINKFLAGS) +vote-transcripts.o: Vote.cpp TranscriptDecider.hpp + $(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $< $(LINKFLAGS) +junc.o: FindJunction.cpp + $(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $< $(LINKFLAGS) +grader.o: grader.cpp + $(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $< $(LINKFLAGS) +addXS.o: AddXS.cpp + $(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $< $(LINKFLAGS) +add-genename.o: AddGeneName.cpp + $(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $< $(LINKFLAGS) + +clean: + rm -f *.o *.gch subexon-info combine-subexons classes trust-splice vote-transcripts junc grader add-genename addXS diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/ManipulateIntronFile.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/ManipulateIntronFile.pl Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,71 @@ +#!/usr/bin/env perl + +use strict ; +use warnings ; + +# Prints the introns of file A that also appear in file B, in file A's chromosome order. +die "usage: a.pl intronA intronB [op]\n" if (@ARGV < 2 ) ; + +my %intronInfo ; +my %chromRank ; + +# Sort comparator for "chrom start end" keys: chromosome rank, then start, then end. +sub sortIntron +{ + my @cols1 = split /\s+/, $a ; + my @cols2 = split /\s+/, $b ; + + if ( $cols1[0] ne $cols2[0] ) + { + # Ranks are integers, so compare them numerically (cmp would put 10 before 2). + $chromRank{ $cols1[0] } <=> $chromRank{ $cols2[0] } ; + } + elsif ( $cols1[1] != $cols2[1] ) + { + $cols1[1] <=> $cols2[1] ; + } + else + { + $cols1[2] <=> $cols2[2] ; + } +} + +open FP1, $ARGV[0] or die "Could not open $ARGV[0]: $!\n" ; +my $cnt = 0 ; +while ( <FP1> ) +{ + chomp ; + my $line = $_ ; + my @cols = split /\s+/ ; + push @cols, 1 ; + @{ $intronInfo{ $cols[0]." ".$cols[1]." ".$cols[2] } } = @cols ; + if ( !defined $chromRank{ $cols[0]} ) + { + $chromRank{ $cols[0] } = $cnt ; + ++$cnt ; + } +} +close FP1 ; + +open FP1, $ARGV[1] or die "Could not open $ARGV[1]: $!\n" ; +while ( <FP1> ) +{ + chomp ; + my $line = $_ ; + my @cols = split /\s+/ ; + my $key = $cols[0]." ".$cols[1]." 
".$cols[2] ; + next if ( !defined $intronInfo{ $key } ) ; + my @infoCols = @{ $intronInfo{ $key } } ; + # Take the strand from the second file only when the first file's strand is neither "+" nor "-". + $infoCols[4] = $cols[4] if ( $infoCols[4] ne "+" && $infoCols[4] ne "-" ) ; + $infoCols[9] |= 2 ; + + @{ $intronInfo{ $key } } = @infoCols ; +} +close FP1 ; + +foreach my $key (sort sortIntron keys %intronInfo ) +{ + my @infoCols = @{ $intronInfo{ $key } } ; + #print join( " ", @infoCols ), "\n" ; + + # The last column is a flag: bit 1 = seen in the first file, bit 2 = seen in the second. + next if ( $infoCols[9] != 3 ) ; + pop @infoCols ; + print join( " ", @infoCols ), "\n" ; +} diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/README.md --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/README.md Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,118 @@ +PsiCLASS +======= + +Described in: + +Song, L., Sabunciyan, S., Yang, G. and Florea, L. [A multi-sample approach increases the accuracy of transcript assembly](https://www.nature.com/articles/s41467-019-12990-0). *Nat Commun* 10, 5000 (2019) + + Copyright (C) 2018- and GNU GPL by Li Song, Liliana Florea + +Includes portions copyright from: + + samtools - Copyright (C) 2008-, Genome Research Ltd, Heng Li + +Commands, scripts and supporting data for the paper can be found [here](https://github.com/splicebox/PsiCLASS_paper/). + +### What is PsiCLASS? + +PsiCLASS is a reference-based transcriptome assembler for single or multiple RNA-seq samples. Unlike conventional methods that analyze each sample separately and then merge the outcomes to create a unified set of meta-annotations, PsiCLASS takes a multi-sample approach, simultaneously analyzing all RNA-seq data sets in an experiment. PsiCLASS is both a transcript assembler and a meta-assembler, producing separate transcript sets for the individual samples and a unified set of meta-annotations. 
The algorithmic underpinnings of PsiCLASS include using a global subexon splice graph, statistical cross-sample feature (intron, subexon) selection methods, and an efficient dynamic programming algorithm to select a subset of transcripts from among those encoded in the graph, based on the read support in each sample. Lastly, the set of meta-annotations is selected from among the transcripts generated for individual samples by voting. While PsiCLASS is highly accurate and efficient for medium-to-large collections of RNA-seq data, its accuracy is equally high for small RNA-seq data sets (2-10 samples) and is competitive with reference methods for single samples. Additionally, its performance is robust to the aggregation method used, including the built-in voting and assembly-based approaches such as StringTie-merge and TACO. Therefore, it can be effectively used as a multi-sample and as a single-sample assembler, as well as in conventional assemble-and-merge protocols. + +### Install + +1. Clone the [GitHub repo](https://github.com/splicebox/psiclass), e.g. with `git clone https://github.com/splicebox/psiclass.git` +2. Run `make` in the repo directory + +You will find the executable files in the downloaded directory. If you want to run PsiCLASS without specifying the directory, you can either add the directory of PsiCLASS to the environment variable PATH or create a soft link ("ln -s") of the file "psiclass" to a directory in PATH. + +PsiCLASS depends on [pthreads](http://en.wikipedia.org/wiki/POSIX_Threads) and samtools depends on [zlib](http://en.wikipedia.org/wiki/Zlib). 
+ + +### Usage + + Usage: ./psiclass [OPTIONS] + Required: + -b STRING: paths to the alignment BAM files; use comma to separate multiple BAM files + or + --lb STRING: path to the file listing the alignment BAM files + Optional: + -s STRING: path to the trusted splice file (default: not used) + -o STRING: prefix of output files (default: ./psiclass) + -p INT: number of threads (default: 1) + -c FLOAT: only use the subexons with classifier score <= than the given number. (default: 0.05) + --sa FLOAT: the minimum average number of supported read for retained introns (default: 0.5) + --vd FLOAT : the minimum average coverage depth of a transcript to be reported in voting (defaults: 1.0) + --maxDpConstraintSize: the number of subexons a constraint can cover in DP. (default: 7. -1 for inf) + --primaryParalog: use primary alignment to retain paralog genes (default: use unique alignments) + --version: print version and exit + --stage INT: (default: 0) + 0-start from the beginning - building the splice site file for each sample + 1-start from building the subexon file for each samples + 2-start from combining the subexon files across samples + 3-start from assembling the transcripts for each sample + 4-start from voting the consensus transcripts across samples + +### Practical notes + +*Alignment compatibility.* PsiCLASS has been tuned to run on alignments generated with the tools [HISAT](https://ccb.jhu.edu/software/hisat/index.shtml) and [STAR](https://github.com/alexdobin/STAR). + +When running PsiCLASS with STAR alignments, run STAR with the option `--outSAMstrandField intronMotif`, which will include the XS field indicating the strand in the BAM alignments. 
Further, when including alignments with *non-canonical splice sites*, use the provided `addXS` executable to add the XS field: + + samtools view -h in.bam | ./addXS reference_genome.fa | samtools view -bS - > out.bam + +*Trusted introns from other sources.* By default, PsiCLASS determines a set of trusted introns from the input spliced alignments, to use in building the global subexon graph. Alternatively, the user can supply an external set of trusted introns, for instance extracted from the GENCODE gene annotations or judiciously selected from the input data using a tool like [JULiP](https://github.com/Guangyu-Yang/JULiP). This file must contain three columns: + + chr_name start_site end_site + +*Voting optimization.* The default parameters for voting have been calibrated and perform near-optimally for a wide variety of data, including with varying levels of coverage and different library construction protocols. However, if further optimization is desired, to determine a better cutoff value one can run the voting stage (see [Usage](#usage) above) with different parameter values, and assess the performance against a reference set of gene annotations, such as [GENCODE](https://www.gencodegenes.org). The program 'grader', included in the package, can be used for this purpose. Note that the per-sample sets of transcripts will remain unchanged. + +*Add gene name.* For many applications, it would be desirable to associate the known (annotated) gene name with each transcript. PsiCLASS provides the program "add-genename" for this purpose. "add-genename" takes as input a GTF file containing a reference set of gene annotations and a file listing the raw GTF files, and generates a new GTF file for each input raw GTF file by appending the annotated gene names. If a gene is not found in the annotation, "add-genename" will use "novel_INT" to represent its gene name. 
The program can be run as: + + ./add-genename annotation.gtf gtflist + +### Input/Output + +The primary input to PsiCLASS is a set of BAM alignment files, one for each RNA-seq sample in the analysis. The program calculates a set of subexon files and a set of splice (intron) files, for the individual samples. (Optionally, one may specify a path to an external file of trusted introns as explained [above](#practical-notes).) The output consists of one GTF file of transcripts for each sample, and the GTF file of meta-annotations produced by voting, stored in the output directory: + + Sample-wise GTF files: (psiclass)_sample_{0,1,...,n-1}.gtf + Meta-assembly GTF file: (psiclass)_vote.gtf + +where indices 0,1,...,n-1 match the order of the input BAM files. + +Subexon and splice (intron) files, and other auxiliary files, are in the subdirectories: + + Intron files: splice/* + Subexon graph files: subexon/* + Log file: (psiclass)_classes.log + +### Example + +The directory './example' in this distribution contains two BAM files, along with an example of a BAM list file. Run PsiCLASS with: + + ./psiclass -b example/s1.bam,example/s2.bam + +or + + ./psiclass --lb example/slist + +The run will generate the files 'psiclass_sample_0.gtf' for 's1.bam', 'psiclass_sample_1.gtf' for 's2.bam', and the file 'psiclass_vote.gtf' containing the meta-assemblies. + +### Terms of use + +This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
+ +You should have received (LICENSE.txt) a copy of the GNU General +Public License along with this program; if not, you can obtain one from +http://www.gnu.org/licenses/gpl.txt or by writing to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +### Support + +Create a [GitHub issue](https://github.com/splicebox/PsiCLASS/issues). diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/SubexonCorrelation.hpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/SubexonCorrelation.hpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,270 @@ +#ifndef _MOURISL_CLASSES_SUBEXONCORRELATION_HEADER +#define _MOURISL_CLASSES_SUBEXONCORRELATION_HEADER + +#include "SubexonGraph.hpp" + +#include +#include + +class SubexonCorrelation +{ +private: + struct _subexon *lastSubexons ; + std::vector fileList ; + int offset ; + int prevSeCnt ; + int seCnt ; + + double **correlation ; + + int OverlapSize( int s0, int e0, int s1, int e1 ) + { + int s = -1, e = -1 ; + if ( e0 < s1 || s0 > e1 ) + return 0 ; + s = s0 > s1 ? s0 : s1 ; + e = e0 < e1 ? e0 : e1 ; + return e - s + 1 ; + } +public: + SubexonCorrelation() + { + offset = 0 ; + lastSubexons = NULL ; + correlation = NULL ; + prevSeCnt = 0 ; + } + + ~SubexonCorrelation() + { + int cnt = fileList.size() ; + int i ; + for ( i = 0 ; i < cnt ; ++i ) + fclose( fileList[i] ) ; + delete[] lastSubexons ; + } + + void Initialize( char *f ) + { + FILE *fpSl ; + fpSl = fopen( f, "r" ) ; + char buffer[1024] ; + while ( fgets( buffer, sizeof( buffer ), fpSl ) != NULL ) + { + int len = strlen( buffer ) ; + if ( buffer[len - 1] == '\n' ) + { + buffer[len - 1] = '\0' ; + --len ; + + } + FILE *fp = fopen( buffer, "r" ) ; + fileList.push_back( fp ) ; + } + + lastSubexons = new struct _subexon[ fileList.size() ] ; + int i, cnt ; + cnt = fileList.size() ; + for ( i = 0 ; i < cnt ; ++i ) + lastSubexons[i].chrId = -1 ; + + } + + // We assume the subexons only contains the subexons we are interested in. 
+ // And we assume that each time we call this function, the subexons we are interested are + // sorted in order. + void ComputeCorrelation( struct _subexon *subexons, int cnt, Alignments &alignments ) + { + int sampleCnt = fileList.size() ; + if ( sampleCnt <= 1 ) + return ; + int i, j, k ; + seCnt = cnt ; + + // Obtain the depth matrix. + double **depth ; + depth = new double* [sampleCnt] ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + depth[i] = new double[ cnt ] ; + memset( depth[i], 0, sizeof( double ) * cnt ) ; + } + + char buffer[2048] ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + struct _subexon se ; + bool useLastSubexon = false ; + if ( lastSubexons[i].chrId != -1 ) + { + se = lastSubexons[i] ; + useLastSubexon = true ; + } + else + se.chrId = -1 ; + + // Locate the subexon + while ( 1 ) + { + if ( se.chrId < subexons[0].chrId + || ( se.chrId == subexons[0].chrId && se.end < subexons[0].start ) ) + { + if ( fgets( buffer, sizeof( buffer ), fileList[i] ) == NULL ) + { + buffer[0] = '\0' ; + break ; + } + if ( buffer[0] == '#' ) + continue ; + + SubexonGraph::InputSubexon( buffer, alignments, se ) ; + --se.start ; --se.end ; + useLastSubexon = false ; + continue ; + } + break ; + } + if ( buffer[0] == '\0' ) + { + lastSubexons[i] = se ; + continue ; + } + + // Process the subexon. 
+ int tag = 0 ; + // NOTE(review): useLastSubexon is not reset inside this loop, so when it starts as true the lines + // read by fgets below are never parsed into se - confirm this is intended. + while ( 1 ) + { + lastSubexons[i] = se ; + + if ( useLastSubexon == false ) + { + SubexonGraph::InputSubexon( buffer, alignments, se ) ; + --se.start ; --se.end ; + } + + if ( se.chrId > subexons[cnt - 1].chrId || se.start > subexons[cnt - 1].end ) + break ; + + // Advance tag to the first subexon of interest ending at or after se.start. + while ( 1 ) + { + // tag == cnt is already past the last subexon, so stop before indexing it. + if ( tag >= cnt || subexons[tag].end >= se.start ) + break ; + ++tag ; + } + + for ( j = tag ; j < cnt && subexons[j].start <= se.end ; ++j ) + { + int overlap = OverlapSize( se.start, se.end, subexons[j].start, subexons[j].end ) ; + depth[i][j] += overlap * se.avgDepth ; + } + + if ( fgets( buffer, sizeof( buffer ), fileList[i] ) == NULL ) + break ; + } + } + // Normalize the depth + for ( i = 0 ; i < sampleCnt ; ++i ) + { + for ( j = 0 ; j < cnt ; ++j ) + depth[i][j] /= ( subexons[j].end - subexons[j].start + 1 ) ; + } + + // Compute the correlation. + double *avg = new double[cnt] ; + double *var = new double[cnt] ; + if ( correlation != NULL ) + { + for ( i = 0 ; i < prevSeCnt ; ++i ) + delete[] correlation[i] ; + delete[] correlation ; + } + + correlation = new double*[cnt] ; + for ( i = 0 ; i < cnt ; ++i ) + correlation[i] = new double[cnt] ; + + memset( avg, 0, sizeof( double ) * cnt ) ; + memset( var, 0, sizeof( double ) * cnt ) ; + for ( i = 0 ; i < cnt ; ++i ) + { + //avg[i] = 0 ; + //var[i] = 0 ; + memset( correlation[i], 0, sizeof( double ) * cnt ) ; + correlation[i][i] = 1 ; + } + + for ( i = 0 ; i < sampleCnt ; ++i ) + for ( j = 0 ; j < cnt ; ++j ) + { + avg[j] += depth[i][j] ; + var[j] += depth[i][j] * depth[i][j] ; + } + for ( i = 0 ; i < cnt ; ++i ) + { + avg[i] /= sampleCnt ; + var[i] = var[i] / sampleCnt - avg[i] * avg[i] ; + } + + for ( i = 0 ; i < sampleCnt ; ++i ) + for ( j = 0 ; j < cnt ; ++j ) + for ( k = j + 1 ; k < cnt ; ++k ) + { + //if ( subexons[j].geneId == subexons[k].geneId ) + correlation[j][k] += depth[i][j] * depth[i][k] ; + } + for ( j = 0 ; j < cnt ; ++j ) + for ( k = j + 1 ; k < cnt ; ++k ) + { + if ( var[j] > 1e-6 && var[k] > 1e-6 ) + 
{ + correlation[j][k] = ( correlation[j][k] / sampleCnt - avg[j] * avg[k] ) / sqrt( var[j] * var[k] ) ; + } + else + correlation[j][k] = 0 ; + //printf( "%d %d %d\n", j, k, cnt ) ; + correlation[k][j] = correlation[j][k] ; + } + //printf( "%lf\n", correlation[0][1] ) ; + // Release the memory. + for ( i = 0 ; i < sampleCnt ; ++i ) + delete[] depth[i] ; + delete[] depth ; + delete[] avg ; + delete[] var ; + + prevSeCnt = cnt ; + } + + double Query( int i, int j ) + { + if ( fileList.size() > 1 ) + return correlation[i][j] ; + else + return 0 ; + } + + void Assign( const SubexonCorrelation &c ) + { + int i ; + fileList = c.fileList ; + if ( fileList.size() <= 1 ) + return ; + + if ( correlation != NULL ) + { + for ( i = 0 ; i < seCnt ; ++i ) + delete[] correlation[i] ; + delete[] correlation ; + } + + seCnt = c.seCnt ; + correlation = new double*[seCnt] ; + for ( i = 0 ; i < seCnt ; ++i ) + { + correlation[i] = new double[seCnt] ; + memcpy( correlation[i], c.correlation[i], sizeof( double ) * seCnt ) ; + } + } +} ; + +#endif diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/SubexonGraph.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/SubexonGraph.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,241 @@ +#include "SubexonGraph.hpp" + +void SubexonGraph::GetGeneBoundary( int tag, int &boundary, int timeStamp ) +{ + if ( visit[tag] == timeStamp ) + return ; + //printf( "%d %d\n", tag, timeStamp ) ; + visit[tag] = timeStamp ; + if ( subexons[tag].end > boundary ) + boundary = subexons[tag].end ; + //if ( subexons[tag].start == 2858011 ) + // printf( "%d: %d %d\n", tag, subexons[tag].nextCnt, subexons[tag].prevCnt) ; + int i ; + int cnt = subexons[tag].nextCnt ; + for ( i = 0 ; i < cnt ; ++i ) + { + //printf( "next of %d: %d %d\n", tag, i, subexons[tag].next[i] ) ; + GetGeneBoundary( subexons[tag].next[i], boundary, timeStamp ) ; + } +} + +int SubexonGraph::GetGeneIntervalIdx( int startIdx, int &endIdx, int timeStamp ) +{ + int i ; + int seCnt = 
subexons.size() ; + if ( startIdx >= seCnt ) + return -1 ; + int farthest = -1 ; + GetGeneBoundary( startIdx, farthest, timeStamp ) ; + + for ( i = startIdx + 1 ; i < seCnt ; ++i ) + { + if ( subexons[i].start > farthest || subexons[i].chrId != subexons[ startIdx ].chrId ) + break ; + + GetGeneBoundary( i, farthest, timeStamp ) ; + } + endIdx = i - 1 ; + + return endIdx ; +} + +int SubexonGraph::ComputeGeneIntervals() +{ + int i, cnt ; + int seCnt = subexons.size() ; + visit = new int[seCnt] ; + memset( visit, -1, sizeof( int ) * seCnt ) ; + int tag = 0 ; + cnt = 0 ; + while ( 1 ) + { + struct _geneInterval ngi ; + //printf( "%d %d %d\n", tag, subexons[tag].start + 1, subexons[tag].end + 1 ) ; + if ( GetGeneIntervalIdx( tag, ngi.endIdx, cnt ) == -1 ) + break ; + ++cnt ; + ngi.startIdx = tag ; + ngi.start = subexons[ ngi.startIdx ].start ; + ngi.end = subexons[ ngi.endIdx ].end ; + + tag = ngi.endIdx + 1 ; + // Adjust the extent + // Adjust the start + if ( subexons[ ngi.startIdx ].leftStrand != 0 + && subexons[ngi.startIdx].leftStrand != subexons[ngi.startIdx ].rightStrand ) + // We should make sure that rightstrand is non-zero whenever left-strand is non-zero for the startIdx. + { + for ( i = ngi.startIdx ; i >= 0 ; --i ) + { + if ( ( subexons[i].leftType == 1 && subexons[i].leftClassifier < classifierThreshold ) // an end within the subexon + || ( subexons[i].leftType == 0 ) // probably a overhang subexon. It should be a subset of the criterion following. + || ( i > 0 && subexons[i - 1].end + 1 < subexons[i].start ) ) // a gap. + break ; + } + ngi.start = subexons[i].start ; + } + + // Adjust the end. + // And here, we also need to decide wether we need to adjust "tag" or not, + // because the next interval might be overlap with current interval by the last subexon. + // We solve the overlap genes now, so we DON'T need to adjust tag. 
+ if ( subexons[ ngi.endIdx ].rightStrand != 0 + && subexons[ngi.endIdx].leftStrand != subexons[ngi.endIdx ].rightStrand ) + { + for ( i = ngi.endIdx ; i < seCnt ; ++i ) + { + if ( ( subexons[i].rightType == 2 && subexons[i].rightClassifier < classifierThreshold ) // an end within the subexon + || ( subexons[i].rightType == 0 ) // probably a overhang subexon. + || ( i < seCnt - 1 && subexons[i].end + 1 < subexons[i + 1].start ) ) // a gap + break ; + } + ngi.end = subexons[i].end ; + + /*if ( subexons[ ngi.endIdx ].rightType == 2 ) + { + for ( i = ngi.endIdx ; i >= ngi.startIdx ; --i ) + { + if ( subexons[i].leftType == 1 ) + break ; + } + // The last region overlapps. + if ( i >= ngi.startIdx && subexons[i].leftStrand != subexons[ ngi.endIdx ].rightStrand ) + --tag ; + }*/ + } + geneIntervals.push_back( ngi ) ; + } + delete[] visit ; + + return cnt ; +} + +int SubexonGraph::ExtractSubexons( int startIdx, int endIdx, struct _subexon *retList ) +{ + int i, j, k ; + int cnt = endIdx - startIdx + 1 ; + //printf( "%s: %d %d %d\n", __func__, startIdx, endIdx, cnt ) ; + for ( i = 0 ; i < cnt ; ++i ) + { + retList[i] = subexons[i + startIdx] ; + retList[i].geneId = -1 ; + retList[i].prev = new int[ retList[i].prevCnt ] ; + retList[i].next = new int[ retList[i].nextCnt ] ; + + for ( j = 0 ; j < retList[i].prevCnt ; ++j ) + retList[i].prev[j] = subexons[i + startIdx].prev[j] - startIdx ; + for ( j = 0 ; j < retList[i].nextCnt ; ++j ) + retList[i].next[j] = subexons[i + startIdx].next[j] - startIdx ; + + for ( j = 0, k = 0 ; j < retList[i].prevCnt ; ++j ) + if ( retList[i].prev[j] >= 0 && retList[i].prev[j] < cnt ) + { + retList[i].prev[k] = retList[i].prev[j] ; + ++k ; + } + retList[i].prevCnt = k ; + + for ( j = 0, k = 0 ; j < retList[i].nextCnt ; ++j ) + if ( retList[i].next[j] >= 0 && retList[i].next[j] < cnt ) + { + retList[i].next[k] = retList[i].next[j] ; + ++k ; + } + retList[i].nextCnt = k ; + } + UpdateGeneId( retList, cnt ) ; + return cnt ; +} + +void 
SubexonGraph::SetGeneId( int tag, int strand, struct _subexon *subexons, int seCnt, int id ) +{ + if ( subexons[tag].geneId != -1 && subexons[tag].geneId != -2 ) + { + if ( subexons[tag].geneId != id ) // a subexon may belong to more than one gene. + { + //printf( "Set -2, %d: %d %d %d %d\n", id, tag, subexons[tag].geneId, subexons[tag].start + 1, strand ) ; + subexons[tag].geneId = -2 ; + } + else + return ; + // There is no need to terminate at the ambiguous exon, the strand will prevent + // us from overwriting previous gene ids. + //return ; + } + else if ( subexons[tag].geneId == -2 ) + return ; + //printf( "%d: %d %d %d %d\n", id, tag, subexons[tag].geneId, subexons[tag].start + 1, strand ) ; + int i ; + if ( subexons[tag].geneId != -2 ) + subexons[ tag ].geneId = id ; + int cnt = subexons[tag].nextCnt ; + // Set through the introns. + if ( IsSameStrand( strand, subexons[tag].rightStrand ) ) + { + for ( i = 0 ; i < cnt ; ++i ) + if ( subexons[ subexons[tag].next[i] ].start > subexons[tag].end + 1 ) + SetGeneId( subexons[tag].next[i], strand, subexons, seCnt, id ) ; + } + + cnt = subexons[tag].prevCnt ; + if ( IsSameStrand( strand, subexons[tag].leftStrand ) ) + { + for ( i = 0 ; i < cnt ; ++i ) + if ( subexons[ subexons[tag].prev[i] ].end < subexons[tag].start - 1 ) + SetGeneId( subexons[tag].prev[i], strand, subexons, seCnt, id ) ; + } + + // Set through the adjacent subexons. 
+ if ( tag < seCnt - 1 && subexons[tag + 1].start == subexons[tag].end + 1 ) + { + SetGeneId( tag + 1, strand, subexons, seCnt, id ) ; + } + + if ( tag > 0 && subexons[tag].start - 1 == subexons[tag - 1].end ) + { + SetGeneId( tag - 1, strand, subexons, seCnt, id ) ; + } + +} + +void SubexonGraph::UpdateGeneId( struct _subexon *subexons, int seCnt ) +{ + int i ; + baseGeneId = usedGeneId ; + int lastMinusStrandGeneId = -1 ; + for ( int strand = -1 ; strand <= 1 ; strand +=2 ) + { + for ( i = 0 ; i < seCnt ; ++i ) + { + //printf( "%d (%d %d) %d.\n", i, subexons[i].start + 1, subexons[i].end + 1, subexons[i].geneId ) ; + if ( ( subexons[i].geneId == -1 && ( ( strand == 1 && subexons[i].rightStrand == 0 ) || subexons[i].rightStrand == strand ) ) + || ( strand == 1 && baseGeneId <= subexons[i].geneId && subexons[i].geneId <= lastMinusStrandGeneId && subexons[i].rightStrand == strand ) ) + { + SetGeneId( i, strand, subexons, seCnt, usedGeneId ) ; + if ( strand == -1 ) + lastMinusStrandGeneId = usedGeneId ; + ++usedGeneId ; + } + } + } + + for ( i = 0 ; i < seCnt ; ++i ) + if ( subexons[i].leftType == 0 && subexons[i].rightType == 0 ) + { + subexons[i].geneId = usedGeneId ; + ++usedGeneId ; + } + // Put base and usedGeneId in lcCnt, rcCnt field. 
+ for ( i = 0 ; i < seCnt ; ++i ) + { + subexons[i].lcCnt = baseGeneId ; + subexons[i].rcCnt = usedGeneId ; + } + + /*for ( i = 0 ; i < seCnt ; ++i ) + { + printf( "geneId %d: %d-%d %d\n", i, subexons[i].start + 1, subexons[i].end + 1, subexons[i].geneId ) ; + } + printf("%d %d\n", baseGeneId, usedGeneId ) ;*/ +} diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/SubexonGraph.hpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/SubexonGraph.hpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,348 @@ +#ifndef _MOURISL_CLASSES_SUBEXONGRAPH_HEADER +#define _MOURISL_CLASSES_SUBEXONGRAPH_HEADER + +#include "alignments.hpp" +#include "blocks.hpp" + +struct _subexon +{ + int chrId ; + int geneId ; + int start, end ; + int leftType, rightType ; + double avgDepth ; + //double ratio, classifier ; + double leftRatio, rightRatio ; + double leftClassifier, rightClassifier ; + int lcCnt, rcCnt ; + int leftStrand, rightStrand ; + + int nextCnt, prevCnt ; + int *next, *prev ; + + bool canBeStart, canBeEnd ; +} ; + +struct _geneInterval +{ + int startIdx, endIdx ; + int start, end ; // The start and end of a gene interval might be adjusted, so it does not + // need to be match with the corresponding subexons +} ; + +class SubexonGraph +{ +private: + int *visit ; + double classifierThreshold ; + + int usedGeneId ; + int baseGeneId ; + + // The function to assign gene ids to subexons. 
+ void SetGeneId( int tag, int strand, struct _subexon *subexons, int seCnt, int id ) ; + void GetGeneBoundary( int tag, int &boundary, int timeStamp ) ; + void UpdateGeneId( struct _subexon *subexons, int seCnt ) ; +public: + std::vector subexons ; + std::vector geneIntervals ; + + ~SubexonGraph() + { + int i ; + int size = subexons.size() ; + for ( i = 0 ; i < size ; ++i ) + { + if ( subexons[i].next ) + delete[] subexons[i].next ; + if ( subexons[i].prev ) + delete[] subexons[i].prev ; + } + } + + SubexonGraph( double classifierThreshold, Alignments &bam, FILE *fpSubexon ) + { + // Read in the subexons + rewind( fpSubexon ) ; + char buffer[2048] ; + int subexonCnt ; + int i, j, k ; + while ( fgets( buffer, sizeof( buffer ), fpSubexon ) != NULL ) + { + if ( buffer[0] == '#' ) + continue ; + + struct _subexon se ; + InputSubexon( buffer, bam, se, true ) ; + + // filter. + if ( ( se.leftType == 0 && se.rightType == 0 ) + || ( se.leftType == 0 && se.rightType == 1 ) // overhang + || ( se.leftType == 2 && se.rightType == 0 ) // overhang + || ( se.leftType == 2 && se.rightType == 1 ) ) // ir + { + if ( ( se.leftType == 0 && se.rightType == 1 ) + || ( se.leftType == 2 && se.rightType == 0 ) ) // if the overhang is too small + { + if ( se.end - se.start + 1 <= 7 ) + { + if ( se.next ) + delete[] se.next ; + if ( se.prev ) + delete[] se.prev ; + continue ; + } + } + + if ( se.leftClassifier >= classifierThreshold || se.leftClassifier < 0 ) + { + if ( se.next ) + delete[] se.next ; + if ( se.prev ) + delete[] se.prev ; + continue ; + } + } + + // Adjust the coordinate. + subexons.push_back( se ) ; + } + + // Convert the coordinate to index + // Note that each coordinate can only associate with one subexon. 
+ subexonCnt = subexons.size() ; + for ( i = 0 ; i < subexonCnt ; ++i ) + { + struct _subexon &se = subexons[i] ; + //printf( "hi1 %d: %d %d\n", i, se.prevCnt, se.prev[0] ) ; + int cnt = 0 ; + + // due to filter, we may not fully match the coordinate and the subexon + int bound = 0 ; + if ( se.prevCnt > 0 ) + bound = se.prev[0] ; + for ( j = i - 1, k = 0 ; k < se.prevCnt && j >= 0 && subexons[j].end >= bound ; --j ) + { + //printf( " %d %d: %d %d\n", j, k, se.prev[ se.prevCnt - 1 - k], subexons[j].end ) ; + if ( subexons[j].end == se.prev[se.prevCnt - 1 - k] ) // notice the order is reversed + { + se.prev[se.prevCnt - 1 - cnt] = j ; + ++k ; + ++cnt ; + } + else if ( subexons[j].end < se.prev[ se.prevCnt - 1 - k ] ) // the corresponding subexon gets filtered. + { + ++k ; + ++j ; // counter the --j in the loop + } + } + //printf( "hi2 %d : %d\n", i, se.prevCnt ) ; + // shft the list + for ( j = 0, k = se.prevCnt - cnt ; j < cnt ; ++j, ++k ) + { + se.prev[j] = se.prev[k] ; + } + se.prevCnt = cnt ; + cnt = 0 ; + if ( se.nextCnt > 0 ) + bound = se.next[ se.nextCnt - 1] ; + for ( j = i + 1, k = 0 ; k < se.nextCnt && j < subexonCnt && subexons[j].start <= bound ; ++j ) + { + if ( subexons[j].start == se.next[k] ) + { + se.next[cnt] = j ; // cnt is always less than k, so we don't need to worry about overwrite. + ++k ; + ++cnt ; + } + else if ( subexons[j].start > se.next[k] ) + { + ++k ; + --j ; + } + } + se.nextCnt = cnt ; + } + + // Adjust the coordinate + int seCnt = subexons.size() ; + for ( i = 0 ; i < seCnt ; ++i ) + { + --subexons[i].start ; + --subexons[i].end ; + } + rewind( fpSubexon ) ; + + // Adjust the classifier for hard boundary, if there is a overhang attached to that region. 
+ for ( i = 0 ; i < seCnt ; ++i ) + { + if ( subexons[i].leftType == 1 && subexons[i].leftClassifier < 1 ) + { + for ( j = i - 1 ; j >= 0 ; --j ) + if ( subexons[j].end < subexons[j + 1].start - 1 ) + break ; + if ( subexons[j + 1].leftType == 0 ) + subexons[i].leftClassifier = 1 ; + } + if ( subexons[i].rightType == 2 && subexons[i].rightClassifier < 1 ) + { + for ( j = i + 1 ; j < seCnt ; ++j ) + if ( subexons[j].start > subexons[j - 1].end + 1 ) + break ; + if ( subexons[j - 1].rightType == 0 ) + subexons[i].rightClassifier = 1 ; + } + } + + // For the region of mixture of plus and minus strand subexons, if there is + // no overhang attached to it, we need to let the hard boundary be a candidate terminal sites. + for ( i = 0 ; i < seCnt ; ) + { + // [i,j) is a region + int support[2] = {0, 0} ; // the index, 0 is for minus strand, 1 is for plus strand + for ( j = i + 1 ; j < seCnt ; ++j ) + { + if ( subexons[j].start > subexons[j - 1].end + 1 ) + break ; + } + + for ( k = i ; k < j ; ++k ) + { + if ( subexons[k].leftStrand != 0 ) + ++support[ ( subexons[k].leftStrand + 1 ) / 2 ] ; + if ( subexons[k].rightStrand != 0 ) + ++support[ ( subexons[k].rightStrand + 1 ) / 2 ] ; + } + if ( support[0] == 0 || support[1] == 0 ) + { + i = j ; + continue ; + } + // a mixture region. + // We force a terminal site if we have only coming-in and no going-out introns. 
+ int leftSupport[2] = {0, 0}, rightSupport[2] = {0, 0}; + int l ; + for ( k = i ; k < j ; ++k ) + { + int cnt = subexons[k].prevCnt ; + if ( subexons[k].leftStrand != 0 ) + for ( l = 0 ; l < cnt ; ++l ) + if ( subexons[k].prev[l] < i ) + { + ++leftSupport[ ( subexons[k].leftStrand + 1 ) / 2 ] ; + break ; + } + cnt = subexons[k].nextCnt ; + if ( subexons[k].rightStrand != 0 ) + for ( l = 0 ; l < cnt ; ++l ) + if ( subexons[k].next[l] >= j ) + { + ++rightSupport[ ( subexons[k].rightStrand + 1 ) / 2 ] ; + break ; + } + } + + if ( ( ( leftSupport[0] > 0 && rightSupport[0] == 0 ) || + ( leftSupport[1] > 0 && rightSupport[1] == 0 ) ) && + subexons[j - 1].rightType != 0 ) + { + subexons[j - 1].rightClassifier = 0 ; + } + + if ( ( ( leftSupport[0] == 0 && rightSupport[0] > 0 ) || + ( leftSupport[1] == 0 && rightSupport[1] > 0 ) ) && + subexons[j - 1].leftType != 0 ) + { + subexons[j - 1].leftClassifier = 0 ; + } + + i = j ; + } + + this->classifierThreshold = classifierThreshold ; + + usedGeneId = baseGeneId = 0 ; + } + + static bool IsSameStrand( int a, int b ) + { + if ( a == 0 || b == 0 ) + return true ; + if ( a != b ) + return false ; + return true ; + } + // Parse the input line + static int InputSubexon( char *in, Alignments &alignments, struct _subexon &se, bool needPrevNext = false ) + { + int i ; + char chrName[50] ; + char ls[3], rs[3] ; + sscanf( in, "%s %d %d %d %d %s %s %lf %lf %lf %lf %lf", chrName, &se.start, &se.end, &se.leftType, &se.rightType, ls, rs, + &se.avgDepth, &se.leftRatio, &se.rightRatio, + &se.leftClassifier, &se.rightClassifier ) ; + se.chrId = alignments.GetChromIdFromName( chrName ) ; + se.nextCnt = se.prevCnt = 0 ; + se.next = se.prev = NULL ; + se.lcCnt = se.rcCnt = 0 ; + + if ( ls[0] == '+' ) + se.leftStrand = 1 ; + else if ( ls[0] == '-' ) + se.leftStrand = -1 ; + else + se.leftStrand = 0 ; + + if ( rs[0] == '+' ) + se.rightStrand = 1 ; + else if ( rs[0] == '-' ) + se.rightStrand = -1 ; + else + se.rightStrand = 0 ; + + if ( 
needPrevNext ) + { + char *p = in ; + // Locate the offset for prevCnt + for ( i = 0 ; i <= 11 ; ++i ) + { + p = strchr( p, ' ' ) ; + ++p ; + } + + sscanf( p, "%d", &se.prevCnt ) ; + p = strchr( p, ' ' ) ; + ++p ; + se.prev = new int[ se.prevCnt ] ; + for ( i = 0 ; i < se.prevCnt ; ++i ) + { + sscanf( p, "%d", &se.prev[i] ) ; + p = strchr( p, ' ' ) ; + ++p ; + } + + sscanf( p, "%d", &se.nextCnt ) ; + p = strchr( p, ' ' ) ; + ++p ; + se.next = new int[ se.nextCnt ] ; + for ( i = 0 ; i < se.nextCnt ; ++i ) + { + sscanf( p, "%d", &se.next[i] ) ; + p = strchr( p, ' ' ) ; + ++p ; + } + + } + return 1 ; + } + + int GetGeneIntervalIdx( int startIdx, int &endIdx, int timeStamp ) ; + + //@return: the number of intervals found + int ComputeGeneIntervals() ; + + // Return a list of subexons in that interval and in retList the id of subexon + // should be adjusted to start from 0. + int ExtractSubexons( int startIdx, int endIdx, struct _subexon *retList ) ; +} ; + +#endif diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/SubexonInfo.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/SubexonInfo.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,1214 @@ +// report the total depth for the segment +// Format: ./a.out alignments.bam splice_site +#include +#include +#include + +#define __STDC_FORMAT_MACROS +#include + +#include +#include +#include + +#include "alignments.hpp" +#include "blocks.hpp" +#include "stats.hpp" + +#define ABS(x) ((x)<0?-(x):(x)) + +char usage[] = "./subexon-info alignment.bam intron.splice [options]\n" + "options:\n" + "\t--minDepth INT: the minimum coverage depth considered as part of a subexon (default: 2)\n" + "\t--noStats: do not compute the statistical scores (default: not used)\n" ; +char buffer[4096] ; + +int gMinDepth ; + +bool CompSplitSite( struct _splitSite a, struct _splitSite b ) +{ + if ( a.chrId < b.chrId ) + return true ; + else if ( a.chrId > b.chrId ) + return false ; + else if ( a.pos != b.pos ) + return a.pos < b.pos ; + 
else if ( a.type != b.type ) + return a.type < b.type ; // We want the start of exons comes first, since we are scanning the genome from left to right, + // so we can terminate the extension early and create a single-base subexon later. + else + return a.oppositePos < b.oppositePos ; +} + +bool CompBlocksByAvgDepth( struct _block a, struct _block b ) +{ + double avgA = a.depthSum / (double)( a.end - a.start + 1 ) ; + double avgB = b.depthSum / (double)( b.end - b.start + 1 ) ; + return avgA < avgB ; +} + +bool CompBlocksByRatio( struct _block a, struct _block b ) +{ + return a.ratio < b.ratio ; +} + +int CompDouble( const void *p1, const void *p2 ) +{ + double a = *(double *)p1 ; + double b = *(double *)p2 ; + if ( a > b ) + return 1 ; + else if ( a < b ) + return -1 ; + else + return 0 ; +} + +// Clean up the split sites; +void FilterAndSortSplitSites( std::vector &sites ) +{ + std::sort( sites.begin(), sites.end(), CompSplitSite ) ; + int i, j, k, l ; + int size = sites.size() ; + + for ( i = 0 ; i < size ; ) + { + for ( j = i + 1 ; j < size ; ++j ) + if ( sites[j].chrId != sites[i].chrId || sites[j].pos != sites[i].pos || sites[j].type != sites[i].type ) + break ; + int maxSupport = 0 ; + for ( k = i ; k < j ; ++k ) + if ( sites[k].support > maxSupport ) + maxSupport = sites[k].support ; + + int strandCnt[2] = {0, 0} ; + char strand = sites[i].strand ; + for ( k = i ; k < j ; ++k ) + { + if ( sites[k].strand == '-' ) + strandCnt[0] += sites[k].support ; + else if ( sites[k].strand == '+' ) + strandCnt[1] += sites[k].support ; + + } + if ( strandCnt[0] > strandCnt[1] ) + strand = '-' ; + else if ( strandCnt[1] > strandCnt[0] ) + strand = '+' ; + + bool allOneExceptMax = false ; + if ( maxSupport >= 20 ) + { + allOneExceptMax = true ; + for ( k = i ; k < j ; ++k ) + { + if ( sites[k].support == maxSupport ) + continue ; + if ( sites[k].support > 1 ) + { + allOneExceptMax = false ; + break ; + } + } + } + + for ( k = i ; k < j ; ++k ) + { + if ( ( sites[k].support 
< 0.01 * maxSupport && sites[k].support <= 3 ) + || sites[k].support < 0.001 * maxSupport || sites[k].strand != strand // The introns from the different strand are filtered. + || ( sites[k].support < 0.02 * maxSupport && sites[k].mismatchSum >= 2 * sites[k].support ) + || ( allOneExceptMax && sites[k].support == 1 ) + || ( sites[k].support <= 2 && sites[k].mismatchSum >= 2 * sites[k].support ) + || ( maxSupport >= 2 && sites[k].support == 1 && ( ABS( sites[k].oppositePos - sites[k].pos - 1 ) >= 10000 || sites[k].mismatchSum != 0 ) ) ) + { + for ( l = i - 1 ; l >= 0 && sites[l].chrId == sites[i].chrId && sites[l].pos >= sites[k].oppositePos ; --l ) + { + if ( sites[l].pos == sites[k].oppositePos && sites[l].oppositePos == sites[k].pos ) + { + sites[l].support = -1 ; + sites[k].support = -1 ; + break ; + } + } + + for ( l = j ; l < size && sites[l].chrId == sites[i].chrId && sites[l].pos <= sites[k].oppositePos ; ++l ) + { + if ( sites[l].pos == sites[k].oppositePos && sites[l].oppositePos == sites[k].pos ) + { + sites[l].support = -1 ; + sites[k].support = -1 ; + break ; + } + } + } + /*else if ( sites[k].support <= 1 && sites[k].oppositePos - sites[k].pos + 1 >= 30000 ) + { + for ( l = j ; l < size && sites[l].chrId == sites[i].chrId && sites[l].pos <= sites[k].oppositePos ; ++l ) + { + if ( sites[l].pos == sites[k].oppositePos && sites[l].oppositePos == sites[k].pos ) + { + if ( l - j >= 20 ) + { + sites[l].support = -1 ; + sites[k].support = -1 ; + break ; + } + } + } + + }*/ + } + i = j ; + } + + k = 0 ; + for ( i = 0 ; i < size ; ++i ) + { + if ( sites[i].support > 0 ) + { + sites[k] = sites[i] ; + if ( sites[k].strand == '?' ) + sites[k].strand = '.' ; + ++k ; + } + } + sites.resize( k ) ; +} + +// Remove the same sites. 
+void KeepUniqSplitSites( std::vector< struct _splitSite> &sites ) +{ + int i, j ; + int size = sites.size() ; + int k = 0 ; + for ( i = 0 ; i < size ; ) + { + for ( j = i + 1 ; j < size ; ++j ) + if ( sites[j].chrId != sites[i].chrId || sites[j].pos != sites[i].pos || sites[j].type != sites[i].type ) + break ; + sites[k] = sites[i] ; + ++k ; + /*else + { + if ( sites[i].type != sites[k-1].type ) + { + printf( "%d\n", sites[i].pos ) ; + } + }*/ + i = j ; + } + sites.resize( k ) ; + + // For the sites that corresponds to the start of an exon, we remove the adjust to it and + /*for ( i = 1 ; i < size ; ++i ) + { + if ( sites[i].pos == sites[i - 1].pos && sites[i - 1].type == 2 ) + { + if ( sites[i].type == 1 ) + ++sites[i].pos ; + } + }*/ +} + +// Filter split sites that are extremely close to each other but on different strand. +void FilterNearSplitSites( std::vector< struct _splitSite> &sites ) +{ + int i, j ; + int size = sites.size() ; + int k = 0 ; + for ( i = 0 ; i < size - 1 ; ++i ) + { + if ( sites[i].support < 0 || sites[i].type != sites[i + 1].type || sites[i].chrId != sites[i + 1].chrId ) + continue ; + if ( sites[i + 1].pos - sites[i].pos <= 7 && + ( sites[i + 1].strand != sites[i].strand || + sites[i].strand == '?' 
) ) + { + int tag = i ; + if ( sites[i + 1].support < sites[i].support ) + tag = i + 1 ; + sites[tag].support = -1 ; + int direction ; + if ( sites[tag].oppositePos < sites[tag].pos ) + direction = -1 ; + else + direction = 1 ; + + for ( j = tag ; j >= 0 && j < size ; j += direction ) + if ( sites[j].pos == sites[tag].oppositePos && sites[j].oppositePos == sites[tag].pos ) + { + sites[j].support = -1 ; + break ; + } + } + } + + for ( i = 0 ; i < size ; ++i ) + { + if ( sites[i].support > 0 ) + { + sites[k] = sites[i] ; + ++k ; + } + } + sites.resize( k ) ; +} + +void FilterRepeatSplitSites( std::vector &sites ) +{ + int i, j ; + int size = sites.size() ; + int k = 0 ; + for ( i = 0 ; i < size ; ) + { + for ( j = i + 1 ; j < size ; ++j ) + { + if ( sites[j].pos != sites[i].pos || sites[j].type != sites[i].type || sites[i].chrId != sites[j].chrId ) + break ; + } + int max = -1 ; + int maxtag = 0 ; + for ( k = i ; k < j ; ++k ) + { + if ( sites[k].uniqSupport > max ) + { + max = sites[k].uniqSupport ; + maxtag = k ; + } + else if ( sites[k].uniqSupport == max && sites[k].support > sites[maxtag].support ) + { + maxtag = k ; + } + } + + if ( max > -1 ) + { + if ( sites[maxtag].uniqSupport > sites[maxtag].support * 0.1 ) + { + for ( k = i ; k < j ; ++k ) + if ( sites[k].uniqSupport < 0.05 * sites[k].support ) + { + sites[k].support = -1 ; + + int direction ; + if ( sites[k].oppositePos < sites[k].pos ) + direction = -1 ; + else + direction = 1 ; + int l ; + for ( l = k ; l >= 0 && l < size ; l += direction ) + if ( sites[l].pos == sites[k].oppositePos && sites[l].oppositePos == sites[k].pos ) + { + sites[l].support = -1 ; + break ; + } + } + } + else + { + for ( k = i ; k < j ; ++k ) + { + if ( sites[k].support <= 10 ) + { + sites[k].support = -1 ; + + int direction ; + if ( sites[k].oppositePos < sites[k].pos ) + direction = -1 ; + else + direction = 1 ; + int l ; + for ( l = k ; l >= 0 && l < size ; l += direction ) + if ( sites[l].pos == sites[k].oppositePos && 
sites[l].oppositePos == sites[k].pos ) + { + sites[l].support = -1 ; + break ; + } + } + } + } + } + + i = j ; + } + + k = 0 ; + for ( i = 0 ; i < size ; ++i ) + { + if ( sites[i].support > 0 ) + { + sites[k] = sites[i] ; + ++k ; + } + } + sites.resize( k ) ; + +} + + +// When k=1, the gamma distribution becomes exponential distribution, and can be optimized analytically.. +// Maximize: sum_i z_i( log( 1/theta e^{-x_i / theta} ) +/*double ThetaOfExponentialDistribution( double *x, double *z, int n ) +{ + double sumZ = 0; + double sumZX = 0 ; + for ( i = 0 ; i < n ; ++i ) + { + sumZ += z[i] ; + sumZX += z[i] * x[i] ; + } +}*/ + +// for boundK, if it is positive, it represent the upper bound. If it is negative, -boundK will be the lower bound for k. +// if boundK==0, there is no extra bound. +// The same logic for boundProduct, which bounds k*theta +void GradientDescentGammaDistribution( double &k, double &theta, double initK, double initTheta, double lowerBoundK, double upperBoundK, + double lowerBoundMean, double upperBoundMean, double *x, double *z, int n ) +{ + int i ; + k = initK ; + theta = initTheta ; + double c = 0.5 ; + int iterCnt = 1 ; + + double sumZ = 0 ; + double sumZX = 0 ; + double sumZLogX = 0 ; + double Hessian[2][2] ; // 0 for k, 1 for theta + double inverseHessian[2][2] ; + int tmp ; + + double positiveKRecord = -5, positiveThetaRecord = -5 ; // record the value of k, theta when theta, k becomes non-positive. 
+ + for ( i = 0 ; i < n ; ++i ) + { + sumZ += z[i] ; + sumZX += z[i] * x[i] ; + sumZLogX += z[i] * log( x[i] ) ; + } + + while ( 1 ) + { + double gradK = 0 ; + double gradTheta = 0 ; + + double prevK = k ; + double prevTheta = theta ; + double digammaK = digammal( k ) ; + + gradK = sumZ * ( -log( theta ) - digammaK ) + sumZLogX ; + gradTheta = -sumZ * ( k / theta ) + sumZX / ( theta * theta ) ; + + Hessian[0][0] = -sumZ * trigamma( k, &tmp ) ; + Hessian[0][1] = -sumZ / theta ; // \partial l / ( \partial k \partial theta) + Hessian[1][0] = -sumZ / theta ; + Hessian[1][1] = sumZ * k / ( theta * theta ) - 2 * sumZX / ( theta * theta * theta ) ; + + double det = Hessian[0][0] * Hessian[1][1] - Hessian[0][1] * Hessian[1][0] ; + /*printf( "%s iter %d:\n", __func__, iterCnt ) ; + printf( "%lf %lf %lf %lf\n", sumZ, k, theta, sumZX ) ; + printf( "%lf %lf %lf\n", gradK, gradTheta, det ) ; + printf( "%lf %lf %lf %lf\n", Hessian[0][0], Hessian[0][1], Hessian[1][0], Hessian[1][1] ) ;*/ + if ( det <= 1e-4 && det >=-1e-4 ) + { + k = k + c / iterCnt * gradK ; + theta = theta + c / iterCnt * gradTheta ; + } + else + { + inverseHessian[0][0] = Hessian[1][1] / det ; + inverseHessian[0][1] = -Hessian[0][1] / det ; + inverseHessian[1][0] = -Hessian[1][0] / det ; + inverseHessian[1][1] = Hessian[0][0] / det ; + //printf( "%lf %lf %lf %lf: %lf\n=====\n", inverseHessian[0][0], inverseHessian[0][1], inverseHessian[1][0], inverseHessian[1][1], + // Hessian[1][0] * inverseHessian[0][1] + Hessian[1][1] * inverseHessian[1][1] ) ; + double step = 0.5 ; + k = k - step * ( inverseHessian[0][0] * gradK + inverseHessian[0][1] * gradTheta ) ; + theta = theta - step * ( inverseHessian[1][0] * gradK + inverseHessian[1][1] * gradTheta ) ; + + bool flag = false ; + if ( k <= 1e-6 ) + { + step = ( prevK - 1e-6 ) / ( inverseHessian[0][0] * gradK + inverseHessian[0][1] * gradTheta ) ; + + if ( ABS( theta - positiveThetaRecord ) < 1e-5 ) + flag = true ; + positiveThetaRecord = theta ; + } + if ( theta <= 
1e-6 ) + { + double tmp = ( prevTheta - 1e-6 ) / ( inverseHessian[1][0] * gradK + inverseHessian[1][1] * gradTheta ) ; + if ( tmp < step ) + step = tmp ; + + if ( ABS( k - positiveKRecord ) < 1e-5 ) + flag = true ; + positiveKRecord = k ; + } + + if ( step != 0.5 ) + { + k = prevK - step * ( inverseHessian[0][0] * gradK + inverseHessian[0][1] * gradTheta ) ; + theta = prevTheta - step * ( inverseHessian[1][0] * gradK + inverseHessian[1][1] * gradTheta ) ; + } + + /*if ( flag ) + { + k = prevK ; + theta = prevTheta ; + break ; + }*/ + } + + if ( upperBoundK > 0 && k > upperBoundK ) + k = upperBoundK ; + else if ( lowerBoundK > 0 && k < lowerBoundK ) + k = lowerBoundK ; + + if ( upperBoundMean > 0 && k * theta > upperBoundMean ) + { + theta = upperBoundMean / k ; + } + else if ( lowerBoundMean > 0 && k * theta < lowerBoundMean ) + { + theta = lowerBoundMean / k ; + } + + if ( k <= 1e-6 ) + { + k = 1e-6 ; + } + + if ( theta <= 1e-6 ) + { + theta = 1e-6 ; + } + + double diff = ABS( prevK - k ) + ABS( prevTheta - theta ) ; + if ( diff < 1e-5 ) + break ; + + ++iterCnt ; + //if ( det <= 1e-4 && det >=-1e-4 && k >= 5000 ) //&& diff < 1 ) + if ( k >= 10000 ) //&& diff < 1 ) + { + k = prevK ; + theta = prevTheta ; + break ; + } + if ( iterCnt == 1000 ) + break ; + } +} + +double MixtureGammaEM( double *x, int n, double &pi, double *k, double *theta, int tries, double meanBound[2], int iter = 1000 ) +{ + int i ; + double *z = new double[n] ; // the expectation that it assigned to model 0. 
+ double *oneMinusZ = new double[n] ; + int t = 0 ; + double history[5] = {-1, -1, -1, -1, -1} ; + double maxX = -1 ; + double sumX = 0 ; + if ( n <= 0 ) + return 0 ; + + for ( i = 0 ; i < n ; ++i ) + { + sumX += x[i] ; + if ( x[i] > maxX ) + maxX = x[i] ; + } + if ( maxX > meanBound[1] && meanBound[1] >= 0 ) + maxX = meanBound[1] ; + + /*if ( meanBound[1] == -1 ) + { + // The EM for coverage + maxX = 10.0 ; + }*/ + + while ( 1 ) + { + double npi, nk[2], ntheta[2] ; + double sum = 0 ; + for ( i = 0 ; i < n ; ++i ) + { + //double lf0 = -k[0] * log( theta[0] ) + ( k[0] - 1 ) * log( cov[i]) - cov[i] / theta[0] - lgamma( k[0] ); + //double lf1 = -k[1] * log( theta[1] ) + ( k[1] - 1 ) * log( cov[i]) - cov[i] / theta[1] - lgamma( k[1] ); + //z[i] = exp( lf0 + log( pi ) ) / ( exp( lf0 + log( pi ) ) + exp( lf1 + log( 1 - pi ) ) ) ; + if ( pi != 0 ) + z[i] = MixtureGammaAssignment( x[i], pi, k, theta ) ; + else + z[i] = 0 ; + /*if ( isnan( z[i] ) ) + { + printf( "nan: %lf %lf %lf %lf\n", x[i], pi, k, theta ) ; + }*/ + oneMinusZ[i] = 1 - z[i] ; + sum += z[i] ; + } + + // compute new pi. + npi = sum / n ; + + // Use gradient descent to compute new k and theta. + if ( 1 ) //pi > 0 ) + { + double bound ; + if ( meanBound[1] != -1 ) // the EM for ratio + { + bound = ( theta[1] * k[1] > 1 ) ? 1 : ( theta[1] * k[1] ) / ( 1 + tries ); + GradientDescentGammaDistribution( nk[0], ntheta[0], k[0], theta[0], k[1], -1, -1, bound, x, z, n ) ; // It seems setting an upper bound 1 for k[0] is not a good idea. + } + else + { + bound = ( theta[1] * k[1] > 1 ) ? 1 : ( theta[1] * k[1] ) / ( 1 + tries ) ; + GradientDescentGammaDistribution( nk[0], ntheta[0], k[0], theta[0], k[1], -1, meanBound[0], bound, x, z, n ) ; // It seems setting an upper bound 1 for k[0] is not a good idea. 
+ } + GradientDescentGammaDistribution( nk[1], ntheta[1], k[1], theta[1], -1, k[0], theta[0] * k[0], maxX, x, oneMinusZ, n ) ; + } + else + { + GradientDescentGammaDistribution( nk[1], ntheta[1], k[1], theta[1], 0, 0, 0, 0, x, oneMinusZ, n ) ; + } + + double diff ; + if ( isnan( npi ) || isnan( nk[0] ) || isnan( nk[1] ) || isnan( ntheta[0] ) || isnan( ntheta[1] ) ) + { + delete[] z ; + delete[] oneMinusZ ; + return -1 ; + } + diff = ABS( nk[0] - k[0] ) + ABS( nk[1] - k[1] ) + + ABS( ntheta[0] - theta[0] ) + ABS( ntheta[1] - theta[1] ) ; // pi is fully determined by these 4 parameters. + if ( diff < 1e-4 ) + break ; + diff = ABS( nk[0] - history[1] ) + ABS( nk[1] - history[2] ) + + ABS( ntheta[0] - history[3] ) + ABS( ntheta[1] - history[4] ) ; // pi is fully determined by these 4 parameters. + if ( diff < 1e-4 ) + break ; + + history[0] = pi ; + history[1] = k[0] ; + history[2] = k[1] ; + history[3] = theta[0] ; + history[4] = theta[1] ; + + pi = npi ; + k[0] = nk[0] ; + k[1] = nk[1] ; + theta[0] = ntheta[0] ; + theta[1] = ntheta[1] ; + + /*double logLikelihood = 0 ; + for ( i = 0 ; i < n ; ++i ) + logLikelihood += log( pi * exp( LogGammaDensity( x[i], k[0], theta[0]) ) + + (1 - pi ) * exp( LogGammaDensity( x[i], k[1], theta[1] ) ) ) ;*/ + + //printf( "%d: %lf %lf %lf %lf %lf\n", t, pi, k[0], theta[0], k[1], theta[1] ) ; + + ++t ; + if ( iter != -1 && t >= iter ) + break ; + } + delete[] z ; + delete[] oneMinusZ ; + return 0 ; +} + +bool IsParametersTheSame( double *k, double *theta ) +{ + if ( ABS( k[0] - k[1] ) < 1e-2 && ABS( theta[0] - theta[1] ) < 1e-2 ) + return true ; + return false ; +} + +int RatioAndCovEM( double *covRatio, double *cov, int n, double &piRatio, double kRatio[2], + double thetaRatio[2], double &piCov, double kCov[2], double thetaCov[2] ) +{ + int i ; + piRatio = 0.6 ; // mixture coefficient for model 0 and 1 + kRatio[0] = 0.9 ; + kRatio[1] = 0.45 ; + thetaRatio[0] = 0.05 ; + thetaRatio[1] = 1 ; + double meanBound[2] = {-1, 1} ; // [0] is for 
the lower bound of the noise model, [1] is for the upper bound of the true model + + /*double *filteredCovRatio = new double[n] ;// ignore the ratio that is greater than 5. + int m = 0 ; + for ( i = 0 ; i < n ; ++i ) + if ( covRatio[i] < 1.0 ) + { + filteredCovRatio[m] = covRatio[i] ; + ++m ; + }*/ + srand( 17 ) ; + int maxTries = 10 ; + int t = 0 ; + double *buffer = new double[n] ; + for ( i = 0 ; i < n ; ++i ) + buffer[i] = covRatio[i] ; + qsort( buffer, n, sizeof( double ), CompDouble ) ; + //covRatio = buffer ; + while ( 1 ) + { + //printf( "EM\n" ) ; + MixtureGammaEM( covRatio, n, piRatio, kRatio, thetaRatio, t, meanBound ) ; + //printf( "%lf %lf %lf %lf %lf\n", piRatio, kRatio[0], kRatio[1], thetaRatio[0], thetaRatio[1] ) ; + if ( piRatio > 0.999 || piRatio < 0.001 || IsParametersTheSame( kRatio, thetaRatio ) ) + { + ++t ; + if ( t > maxTries ) + break ; + piRatio = 0.6 ; + kRatio[0] += ( ( rand() * 0.5 - RAND_MAX ) / (double)RAND_MAX * 0.1 ) ; + if ( kRatio[0] <= 0 ) + kRatio[0] = 0.9 ; + kRatio[1] += ( ( rand() * 0.5 - RAND_MAX ) / (double)RAND_MAX * 0.1 ) ; + if ( kRatio[1] <= 0 ) + kRatio[1] = 0.45 ; + thetaRatio[0] += ( ( rand() * 0.5 - RAND_MAX ) / (double)RAND_MAX * 0.1 ) ; + if ( thetaRatio[0] <= 0 ) + thetaRatio[0] = 0.05 ; + thetaRatio[1] += ( ( rand() * 0.5 - RAND_MAX ) / (double)RAND_MAX * 0.1 ) ; + if ( thetaRatio[1] <= 0 ) + thetaRatio[1] = 1 ; + if ( kRatio[0] < kRatio[1] ) + { + if ( rand() & 1 ) + kRatio[0] = kRatio[1] ; + else + kRatio[1] = kRatio[0] ; + } + if ( kRatio[0] * thetaRatio[0] > kRatio[1] * thetaRatio[1] ) + { + thetaRatio[0] = kRatio[1] * thetaRatio[1] / kRatio[0] ; + } + //printf( "%lf %lf %lf %lf %lf\n", piRatio, kRatio[0], kRatio[1], thetaRatio[0], thetaRatio[1] ) ; + + continue ; + } + + break ; + } + //delete[] filteredCovRatio ; + if ( t > maxTries && piRatio > 0.999 ) + { + /*piRatio = 0.6 ; // mixture coefficient for model 0 and 1 + kRatio[0] = 0.9 ; + kRatio[1] = 0.45 ; + thetaRatio[0] = 0.05 ; + thetaRatio[1] = 1 ;*/ 
+ piRatio = 0.999 ; + } + if ( IsParametersTheSame( kRatio, thetaRatio ) || piRatio <= 1e-3 ) + piRatio = 1e-3 ; + + piCov = piRatio ; // mixture coefficient for model 0 and 1 + kCov[0] = 0.9 ; + kCov[1] = 0.45 ; + thetaCov[0] = 3 ; + thetaCov[1] = 12 ; + + // only do one iteration of EM, so that pi does not change? + // But it seems it still better to run full EM. + meanBound[0] = 1.01 ; + meanBound[1] = -1 ; + + //printf( "for coverage:\n" ) ; + //piCov = 0.001000 ; + MixtureGammaEM( cov, n, piCov, kCov, thetaCov, 0, meanBound ) ; + //printf( "for coverage done\n" ) ; + piCov = piRatio ; + + delete []buffer ; + + return 0 ; +} + +double GetPValue( double x, double *k, double *theta ) +{ + int fault ; + double p ; + p = 1 - gammad( x / theta[0], k[0], &fault ) ; + return p ; +} + +// if x's value is less than the average of (k0-1)*theta0, then we force x=(k0-1)*theta0, +// the mode of the model 0. Of course, it does not affect when k0<=1 already. +double MixtureGammaAssignmentAdjust( double x, double pi, double* k, double *theta ) +{ + if ( x < ( k[0] - 1 ) * theta[0] ) + { + x = ( k[0] - 1 ) * theta[0] ; + } + return MixtureGammaAssignment( x, pi, k, theta ) ; +} + + +// Transform the cov number for better fitting +double TransformCov( double c ) +{ + double ret ; + // original it is c-1. + // Use -2 instead of -1 is that many region covered to 1 reads will be filtered when + // build the subexons. 
+ // + //ret = sqrt( c ) - 1 ; + if ( c <= 2 + 1e-6 ) + ret = 1e-6 ; + else + ret = c - 2 ; + return ret ; + //return log( c ) / log( 2.0 ) ; +} + +int main( int argc, char *argv[] ) +{ + int i, j ; + bool noStats = false ; + if ( argc < 3 ) + { + fprintf( stderr, usage ) ; + exit( 1 ) ; + } + + gMinDepth = 2 ; + + for ( i = 3 ; i < argc ; ++i ) + { + if ( !strcmp( argv[i], "--noStats" ) ) + { + noStats = true ; + continue ; + } + else if ( !strcmp( argv[i], "--minDepth" ) ) + { + gMinDepth = atoi( argv[i + 1] ) ; + ++i ; + continue ; + } + else + { + fprintf( stderr, "Unknown argument: %s\n", argv[i] ) ; + return 0 ; + } + } + + Alignments alignments ; + alignments.Open( argv[1] ) ; + std::vector splitSites ; // only compromised the + std::vector allSplitSites ; + + // read in the splice site + FILE *fp ; + fp = fopen( argv[2], "r" ) ; + char chrom[50] ; + int64_t start, end ; + int support ; + char strand[3] ; + int uniqSupport, secondarySupport, uniqEditDistance, secondaryEditDistance ; + while ( fscanf( fp, "%s %" PRId64 " %" PRId64 " %d %s %d %d %d %d", chrom, &start, &end, &support, strand, + &uniqSupport, &secondarySupport, &uniqEditDistance, &secondaryEditDistance ) != EOF ) + { + if ( support <= 0 ) + continue ; + //if ( !( uniqSupport >= 1 + // || secondarySupport > 10 ) ) + //if ( uniqSupport <= 0.01 * ( uniqSupport + secondarySupport ) || ( uniqSupport == 0 && secondarySupport < 20 ) ) + //if ( uniqSupport == 0 && secondarySupport <= 10 ) + // continue ; + int chrId = alignments.GetChromIdFromName( chrom ) ; + struct _splitSite ss ; + --start ; + --end ; + ss.pos = start ; + ss.chrId = chrId ; + ss.type = 2 ; + ss.oppositePos = end ; + ss.strand = strand[0] ; + ss.support = support ; + ss.uniqSupport = uniqSupport ; + ss.mismatchSum = uniqEditDistance + secondaryEditDistance ; + splitSites.push_back( ss ) ; + + ss.pos = end ; + ss.type = 1 ; + ss.oppositePos = start ; + ss.strand = strand[0] ; + ss.support = support ; + ss.uniqSupport = uniqSupport ; + 
ss.mismatchSum = uniqEditDistance + secondaryEditDistance ; + splitSites.push_back( ss ) ; + } + fclose( fp ) ; + //printf( "ss:%d\n", splitSites.size() ) ; + + //printf( "ss:%d\n", splitSites.size() ) ; + + alignments.GetGeneralInfo( true ) ; + // Build the blocks + Blocks regions ; + alignments.Rewind() ; + regions.BuildExonBlocks( alignments ) ; + //printf( "%d\n", regions.exonBlocks.size() ) ; + + FilterAndSortSplitSites( splitSites ) ; + FilterNearSplitSites( splitSites ) ; + FilterRepeatSplitSites( splitSites ) ; + regions.FilterSplitSitesInRegions( splitSites ) ; + regions.FilterGeneMergeSplitSites( splitSites ) ; + + + allSplitSites = splitSites ; + KeepUniqSplitSites( splitSites ) ; + + //for ( i = 0 ; i < splitSites.size() ; ++i ) + // printf( "%d %d\n", splitSites[i].pos + 1, splitSites[i].oppositePos + 1 ) ; + // Split the blocks using split site + regions.SplitBlocks( alignments, splitSites ) ; + //printf( "%d\n", regions.exonBlocks.size() ) ; + /*for ( i = 0 ; i < regions.exonBlocks.size() ; ++i ) + { + struct _block &e = regions.exonBlocks[i] ; + printf( "%s %" PRId64 " %" PRId64 " %d %d\n", alignments.GetChromName( e.chrId ), e.start + 1, e.end + 1, e.leftType, e.rightType ) ; + } + return 0 ;*/ + // Recompute the coverage for each block. + alignments.Rewind() ; + //printf( "Before computeDepth: %d\n", regions.exonBlocks.size() ) ; + + regions.ComputeDepth( alignments ) ; + //printf( "After computeDepth: %d\n", regions.exonBlocks.size() ) ; + + // Merge blocks that may have a hollow coverage by accident. + regions.MergeNearBlocks() ; + //printf( "After merge: %d\n", regions.exonBlocks.size() ) ; + + // Put the intron informations + regions.AddIntronInformation( allSplitSites, alignments ) ; + //printf( "After add information.\n" ) ; + + // Compute the average ratio against the left and right connected subexons. 
+ regions.ComputeRatios() ; + //printf( "After compute ratios.\n" ) ; + + //printf( "Finish building regions.\n" ) ; + if ( noStats ) + { + // just output the subexons. + if ( realpath( argv[1], buffer ) == NULL ) + { + strcpy( buffer, argv[1] ) ; + } + printf( "#%s\n", buffer ) ; + printf( "#fitted_ir_parameter_ratio: pi: -1 k0: -1 theta0: -1 k1: -1 theta1: -1\n" ) ; + printf( "#fitted_ir_parameter_cov: pi: -1 k0: -1 theta0: -1 k1: -1 theta1: -1\n" ) ; + + int blockCnt = regions.exonBlocks.size() ; + for ( int i = 0 ; i < blockCnt ; ++i ) + { + struct _block &e = regions.exonBlocks[i] ; + double avgDepth = (double)e.depthSum / ( e.end - e.start + 1 ) ; + printf( "%s %" PRId64 " %" PRId64 " %d %d %lf -1 -1 -1 -1 ", alignments.GetChromName( e.chrId ), e.start + 1, e.end + 1, e.leftType, e.rightType, avgDepth ) ; + int prevCnt = e.prevCnt ; + if ( i > 0 && e.start == regions.exonBlocks[i - 1].end + 1 && + e.leftType == regions.exonBlocks[i - 1].rightType ) + { + printf( "%d ", prevCnt + 1 ) ; + for ( j = 0 ; j < prevCnt ; ++j ) + printf( "%" PRId64 " ", regions.exonBlocks[ e.prev[j] ].end + 1 ) ; + printf( "%" PRId64 " ", regions.exonBlocks[i - 1].end + 1 ) ; + } + else + { + printf( "%d ", prevCnt ) ; + for ( j = 0 ; j < prevCnt ; ++j ) + printf( "%" PRId64 " ", regions.exonBlocks[ e.prev[j] ].end + 1 ) ; + } + + int nextCnt = e.nextCnt ; + if ( i < blockCnt - 1 && e.end == regions.exonBlocks[i + 1].start - 1 && + e.rightType == regions.exonBlocks[i + 1].leftType ) + { + printf( "%d %" PRId64 " ", nextCnt + 1, regions.exonBlocks[i + 1].start + 1 ) ; + } + else + printf( "%d ", nextCnt ) ; + for ( j = 0 ; j < nextCnt ; ++j ) + printf( "%" PRId64 " ", regions.exonBlocks[ e.next[j] ].start + 1 ) ; + printf( "\n" ) ; + + } + return 0 ; + } + + // Extract the blocks for different events. + int blockCnt = regions.exonBlocks.size() ; + std::vector irBlocks ; // The regions corresponds to intron retention events. 
+ double *leftClassifier = new double[ blockCnt ] ; + double *rightClassifier = new double[ blockCnt ] ; + std::vector overhangBlocks ; //blocks like (...[ or ]...) + std::vector islandBlocks ; // blocks like (....) + + for ( i = 0 ; i < blockCnt ; ++i ) + { + int ltype = regions.exonBlocks[i].leftType ; + int rtype = regions.exonBlocks[i].rightType ; + leftClassifier[i] = -1 ; + rightClassifier[i] = -1 ; + + //double avgDepth = (double)regions.exonBlocks[i].depthSum / ( regions.exonBlocks[i].end - regions.exonBlocks[i].start + 1 ) ; + + if ( ltype == 2 && rtype == 1 ) + { + // candidate intron retention. + // Note that when I compute the ratio, it is already made sure that the avgDepth>1. + double ratio = regions.PickLeftAndRightRatio( regions.exonBlocks[i] ) ; + + //printf( "%lf %lf\n", regions.exonBlocks[i].leftRatio, regions.exonBlocks[i].rightRatio ) ; + if ( ratio > 0 ) + { + regions.exonBlocks[i].ratio = ratio ; + irBlocks.push_back( regions.exonBlocks[i] ) ; + irBlocks[ irBlocks.size() - 1 ].contigId = i ; + } + } + else if ( ( ltype == 0 && rtype == 1 ) || ( ltype == 2 && rtype == 0 ) ) + { + // subexons like (...[ or ]...) + double ratio ; + if ( ltype == 0 ) + { + ratio = regions.exonBlocks[i].rightRatio ; + } + else if ( ltype == 2 ) + { + ratio = regions.exonBlocks[i].leftRatio ; + } + if ( ratio > 0 ) + { + regions.exonBlocks[i].ratio = ratio ; + overhangBlocks.push_back( regions.exonBlocks[i] ) ; + overhangBlocks[ overhangBlocks.size() - 1].contigId = i ; + } + } + else if ( ltype == 0 && rtype == 0 ) + { + islandBlocks.push_back( regions.exonBlocks[i] ) ; + islandBlocks[ islandBlocks.size() - 1].contigId = i ; + } + } + + // Compute the histogram for each intron. 
+ int irBlockCnt = irBlocks.size() ; + double *cov = new double[irBlockCnt] ; + double *covRatio = new double[ irBlockCnt ] ; + double piRatio, kRatio[2], thetaRatio[2] ; + double piCov, kCov[2], thetaCov[2] ; + for ( i = 0 ; i < irBlockCnt ; ++i ) + { + double avgDepth = regions.GetAvgDepth( irBlocks[i] ) ; + //cov[i] = ( avgDepth - 1 ) / ( flankingAvg - 1 ) ; + cov[i] = TransformCov( avgDepth ) ; + + covRatio[i] = regions.PickLeftAndRightRatio( irBlocks[i] ) ; + //cov[i] = avgDepth / anchorAvg ; + //printf( "%"PRId64" %d %d: %lf %lf\n", irBlocks[i].depthSum, irBlocks[i].start, irBlocks[i].end, avgDepth, cov[i] ) ; + } + + /*fp = fopen( "ratio.out", "r" ) ; + int irBlockCnt = 0 ; + double *cov = new double[10000] ; + while ( 1 ) + { + double r ; + if ( fscanf( fp, "%lf", &r ) == EOF ) + break ; + cov[ irBlockCnt ] = r ; + ++irBlockCnt ; + } + fclose( fp ) ;*/ + RatioAndCovEM( covRatio, cov, irBlockCnt, piRatio, kRatio, thetaRatio, piCov, kCov, thetaCov ) ; + + for ( i = 0 ; i < irBlockCnt ; ++i ) + { + //double lf0 = -k[0] * log( theta[0] ) + ( k[0] - 1 ) * log( cov[i]) - cov[i] / theta[0] - lgamma( k[0] ) ; + //double lf1 = -k[1] * log( theta[1] ) + ( k[1] - 1 ) * log( cov[i]) - cov[i] / theta[1] - lgamma( k[1] ) ; + + double p1, p2, p ; + + p1 = MixtureGammaAssignmentAdjust( covRatio[i], piRatio, kRatio, thetaRatio ) ; + p2 = MixtureGammaAssignmentAdjust( cov[i], piCov, kCov, thetaCov ) ; + + /*p1 = GetPValue( covRatio[i], kRatio, thetaRatio ) ; //1 - gammad( covRatio[i] / thetaRatio[0], kRatio[0], &fault ) ; + if ( piRatio != 0 ) + p2 = GetPValue( cov[i], kCov, thetaCov ) ;//1 - gammad( cov[i] / thetaCov[0], kCov[0], &fault ) ; + else + p2 = p1 ;*/ + //printf( "%lf %lf: %lf %lf\n", covRatio[i], cov[i], p1, p2 ) ; + p = p1 > p2 ? 
p1 : p2 ; + + + //printf( "%d %d: avg: %lf ratio: %lf p: %lf\n", irBlocks[i].start, irBlocks[i].end, irBlocks[i].depthSum / (double)( irBlocks[i].end - irBlocks[i].start + 1 ), covRatio[i], + // p ) ; + leftClassifier[ irBlocks[i].contigId ] = p ; + rightClassifier[ irBlocks[i].contigId ] = p ; + } + + // Process the classifier for overhang subexons and the subexons to see whether we need soft boundary + int overhangBlockCnt = overhangBlocks.size() ; + delete []cov ; + delete []covRatio ; + + cov = new double[ overhangBlockCnt ] ; + covRatio = new double[overhangBlockCnt] ; + double overhangPiRatio, overhangKRatio[2], overhangThetaRatio[2] ; + double overhangPiCov, overhangKCov[2], overhangThetaCov[2] ; + + for ( i = 0 ; i < overhangBlockCnt ; ++i ) + { + covRatio[i] = overhangBlocks[i].ratio ; + cov[i] = TransformCov( regions.GetAvgDepth( overhangBlocks[i] ) ) ; + } + RatioAndCovEM( covRatio, cov, overhangBlockCnt, overhangPiRatio, overhangKRatio, overhangThetaRatio, overhangPiCov, overhangKCov, overhangThetaCov ) ; + + for ( i = 0 ; i < overhangBlockCnt ; ++i ) + { + //double lf0 = -k[0] * log( theta[0] ) + ( k[0] - 1 ) * log( cov[i]) - cov[i] / theta[0] - lgamma( k[0] ) ; + //double lf1 = -k[1] * log( theta[1] ) + ( k[1] - 1 ) * log( cov[i]) - cov[i] / theta[1] - lgamma( k[1] ) ; + + double p1, p2, p ; + p1 = MixtureGammaAssignmentAdjust( covRatio[i], overhangPiRatio, overhangKRatio, overhangThetaRatio ) ; + p2 = MixtureGammaAssignmentAdjust( cov[i], overhangPiCov, overhangKCov, overhangThetaCov ) ; + + /*p1 = GetPValue( covRatio[i], overhangKRatio, overhangThetaRatio ) ; //1 - gammad( covRatio[i] / thetaRatio[0], kRatio[0], &fault ) ; + if ( overhangPiRatio != 0) + p2 = GetPValue( cov[i], overhangKCov, overhangThetaCov ) ;//1 - gammad( cov[i] / thetaCov[0], kCov[0], &fault ) ; + else + p2 = p1 ;*/ + + //p = p1 < p2 ? 
p1 : p2 ; + p = sqrt( p1 * p2 ) ; + + int idx = overhangBlocks[i].contigId ; + if ( regions.exonBlocks[idx].rightType == 0 ) + leftClassifier[ idx ] = rightClassifier[ idx ] = p ; + else + leftClassifier[ idx ] = rightClassifier[ idx ] = p ; + } + + for ( i = 0 ; i < blockCnt ; ++i ) + { + struct _block &e = regions.exonBlocks[i] ; + //if ( ( e.leftType == 0 && e.rightType == 1 ) || + // variance-stabailizing transformation of poisson distribution. But we are more conservative here. + // The multiply 2 before that is because we ignore the region below 0, so we need to somehow renormalize the distribution. + if ( e.leftType == 1 ) + { + if ( e.leftRatio >= 0 ) + leftClassifier[i] = 2 * alnorm( e.leftRatio * 2.0 , true ) ; + else + leftClassifier[i] = 1 ; + } + /*else if ( e.leftType == 0 ) + { + // If this region is a start of a gene, the other sample might introduce + // a new intron before it. So we want to test whether this region can still + // be a start of a gene even there is an intron before it. + for ( j = i + 1 ; j < blockCnt ; ++j ) + { + if ( regions.exonBlocks[j].contigId != regions.exonBlocks[i].contigId ) + break ; + } + + for ( k = i ; k < j ; ++k ) + if ( regions.exonBlocks[j].leftType == 1 ) + break ; + if ( k >= j ) + { + leftClassifier[i] = alnorm( ) + } + }*/ + + //if ( ( e.rightType == 0 && e.leftType == 2 ) || + if ( e.rightType == 2 ) + { + if ( e.rightRatio >= 0 ) + rightClassifier[i] = 2 * alnorm( e.rightRatio * 2.0, true ) ; + else + rightClassifier[i] = 1 ; + } + } + + // Process the result for subexons seems like single-exon transcript (...) 
+ int islandBlockCnt = islandBlocks.size() ; + //std::sort( islandBlocks.begin(), islandBlocks.end(), CompBlocksByAvgDepth ) ; + for ( i = 0, j = 0 ; i < islandBlockCnt ; ++i ) + { + /*if ( regions.GetAvgDepth( islandBlocks[i] ) != regions.GetAvgDepth( islandBlocks[j] ) ) + j = i ; + leftClassifier[ islandBlocks[i].contigId ] = 1 - (j + 1) / (double)( islandBlockCnt + 1 ) ; + rightClassifier[ islandBlocks[i].contigId ] = 1 - (j + 1) / (double)( islandBlockCnt + 1 ) ;*/ + double p = GetPValue( TransformCov( regions.GetAvgDepth( islandBlocks[i] ) ), kCov, thetaCov ) ; + leftClassifier[ islandBlocks[i].contigId ] = p ; + rightClassifier[ islandBlocks[i].contigId ] = p ; + } + + + // Output the result. + if ( realpath( argv[1], buffer ) == NULL ) + { + strcpy( buffer, argv[1] ) ; + } + printf( "#%s\n", buffer ) ; + // TODO: higher precision. + printf( "#fitted_ir_parameter_ratio: pi: %lf k0: %lf theta0: %lf k1: %lf theta1: %lf\n", piRatio, kRatio[0], thetaRatio[0], kRatio[1], thetaRatio[1] ) ; + printf( "#fitted_ir_parameter_cov: pi: %lf k0: %lf theta0: %lf k1: %lf theta1: %lf\n", piCov, kCov[0], thetaCov[0], kCov[1], thetaCov[1] ) ; + + printf( "#fitted_overhang_parameter_ratio: pi: %lf k0: %lf theta0: %lf k1: %lf theta1: %lf\n", overhangPiRatio, overhangKRatio[0], overhangThetaRatio[0], overhangKRatio[1], overhangThetaRatio[1] ) ; + printf( "#fitted_overhang_parameter_cov: pi: %lf k0: %lf theta0: %lf k1: %lf theta1: %lf\n", overhangPiCov, overhangKCov[0], overhangThetaCov[0], overhangKCov[1], overhangThetaCov[1] ) ; + + + for ( int i = 0 ; i < blockCnt ; ++i ) + { + struct _block &e = regions.exonBlocks[i] ; + double avgDepth = regions.GetAvgDepth( e ) ; + printf( "%s %" PRId64 " %" PRId64 " %d %d %c %c %lf %lf %lf %lf %lf ", alignments.GetChromName( e.chrId ), e.start + 1, e.end + 1, e.leftType, e.rightType, + e.leftStrand, e.rightStrand, avgDepth, + e.leftRatio, e.rightRatio, leftClassifier[i], rightClassifier[i] ) ; + int prevCnt = e.prevCnt ; + if ( i > 0 && 
e.start == regions.exonBlocks[i - 1].end + 1 ) + //&& e.leftType == regions.exonBlocks[i - 1].rightType ) + { + printf( "%d ", prevCnt + 1 ) ; + for ( j = 0 ; j < prevCnt ; ++j ) + printf( "%" PRId64 " ", regions.exonBlocks[ e.prev[j] ].end + 1 ) ; + printf( "%" PRId64 " ", regions.exonBlocks[i - 1].end + 1 ) ; + } + else + { + printf( "%d ", prevCnt ) ; + for ( j = 0 ; j < prevCnt ; ++j ) + printf( "%" PRId64 " ", regions.exonBlocks[ e.prev[j] ].end + 1 ) ; + } + + int nextCnt = e.nextCnt ; + if ( i < blockCnt - 1 && e.end == regions.exonBlocks[i + 1].start - 1 ) + //&& e.rightType == regions.exonBlocks[i + 1].leftType ) + { + printf( "%d %" PRId64 " ", nextCnt + 1, regions.exonBlocks[i + 1].start + 1 ) ; + } + else + printf( "%d ", nextCnt ) ; + for ( j = 0 ; j < nextCnt ; ++j ) + printf( "%" PRId64 " ", regions.exonBlocks[ e.next[j] ].start + 1 ) ; + printf( "\n" ) ; + } + + delete[] cov ; + delete[] covRatio ; + delete[] leftClassifier ; + delete[] rightClassifier ; +} diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/TranscriptDecider.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/TranscriptDecider.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,3492 @@ +#include "TranscriptDecider.hpp" + +void TranscriptDecider::OutputTranscript( int sampleId, struct _subexon *subexons, struct _transcript &transcript ) +{ + int i, j ; + // determine the strand + std::vector subexonInd ; + transcript.seVector.GetOnesIndices( subexonInd ) ; + + // Determine the strand + char strand[2] = "." ; + int size = subexonInd.size() ; + if ( size > 1 ) + { + // locate the intron showed up in this transcript. + for ( i = 0 ; i < size - 1 ; ++i ) + { + /*int nextCnt = subexons[ subexonInd[i] ].nextCnt ; + if ( nextCnt == 0 ) + continue ; + + for ( j = 0 ; j < nextCnt ; ++j ) + { + int a = subexons[ subexonInd[i] ].next[j] ; + if ( subexonInd[i + 1] == a + && subexons[ subexonInd[i] ].end + 1 < subexons[a].start ) // avoid the case like ..(...[... 
+ { + break ; + } + } + if ( j < nextCnt )*/ + + if ( subexons[ subexonInd[i] ].end + 1 < subexons[ subexonInd[i + 1] ].start ) + { + if ( subexons[ subexonInd[i] ].rightStrand == 1 ) + strand[0] = '+' ; + else if ( subexons[ subexonInd[i] ].rightStrand == -1 ) + strand[0] = '-' ; + break ; + } + } + } + + // TODO: transcript_id + char *chrom = alignments.GetChromName( subexons[0].chrId ) ; + char prefix[10] = "" ; + struct _subexon *catSubexons = new struct _subexon[ size + 1 ] ; + // Concatenate adjacent subexons + catSubexons[0] = subexons[ subexonInd[0] ] ; + j = 1 ; + for ( i = 1 ; i < size ; ++i ) + { + if ( subexons[ subexonInd[i] ].start == catSubexons[j - 1].end + 1 ) + { + catSubexons[j - 1].end = subexons[ subexonInd[i] ].end ; + } + else + { + catSubexons[j] = subexons[ subexonInd[i] ] ; + ++j ; + } + } + size = j ; + + int gid = GetTranscriptGeneId( subexonInd, subexons ) ; + if ( 0 ) //numThreads <= 1 ) + { + fprintf( outputFPs[sampleId], "%s\tCLASSES\ttranscript\t%d\t%d\t1000\t%s\t.\tgene_id \"%s%s.%d\"; transcript_id \"%s%s.%d.%d\"; Abundance \"%.6lf\";\n", + chrom, catSubexons[0].start + 1, catSubexons[size - 1].end + 1, strand, + prefix, chrom, gid, + prefix, chrom, gid, transcriptId[ gid - baseGeneId ], transcript.FPKM ) ; + for ( i = 0 ; i < size ; ++i ) + { + fprintf( outputFPs[ sampleId ], "%s\tCLASSES\texon\t%d\t%d\t1000\t%s\t.\tgene_id \"%s%s.%d\"; " + "transcript_id \"%s%s.%d.%d\"; exon_number \"%d\"; Abundance \"%.6lf\"\n", + chrom, catSubexons[i].start + 1, catSubexons[i].end + 1, strand, + prefix, chrom, gid, + prefix, chrom, gid, transcriptId[ gid - baseGeneId ], + i + 1, transcript.FPKM ) ; + } + } + else + { + struct _outputTranscript t ; + int len = 0 ; + t.chrId = subexons[0].chrId ; + t.geneId = gid ; + t.transcriptId = transcriptId[ gid - baseGeneId ] ; + t.FPKM = transcript.FPKM ; + t.sampleId = sampleId ; + t.exons = new struct _pair32[size] ; + for ( i = 0 ; i < size ; ++i ) + { + t.exons[i].a = catSubexons[i].start + 1 ; + 
t.exons[i].b = catSubexons[i].end + 1 ; + len += t.exons[i].b - t.exons[i].a + 1 ; + } + t.cov = transcript.abundance * alignments.readLen / len ; + t.ecnt = size ; + t.strand = strand[0] ; + //printf( "%lf\n", transcript.correlationScore ) ; + + if ( numThreads > 1 ) + outputHandler->Add( t ) ; + else + outputHandler->Add_SingleThread( t ) ; + } + ++transcriptId[ gid - baseGeneId ] ; + + delete[] catSubexons ; +} + +int TranscriptDecider::GetFather( int f, int *father ) +{ + if ( father[f] != f ) + return father[f] = GetFather( father[f], father ) ; + return f ; +} + +int TranscriptDecider::GetTranscriptGeneId( std::vector &subexonInd, struct _subexon *subexons ) +{ + int i ; + int size = subexonInd.size() ; + + for ( i = 0 ; i < size ; ++i ) + if ( subexons[ subexonInd[i] ].geneId != -2 ) + return subexons[ subexonInd[i] ].geneId ; + + // Some extreme case, where all the regions are mixture regions. + for ( i = 0 ; i < size - 1 ; ++i ) + if ( subexons[ subexonInd[i] ].end + 1 < subexons[ subexonInd[i + 1] ].start ) + { + return defaultGeneId[ ( subexons[ subexonInd[i] ].rightStrand + 1 ) / 2 ] ; + } + return defaultGeneId[0] ; +} + +int TranscriptDecider::GetTranscriptGeneId( struct _transcript &t, struct _subexon *subexons ) +{ + if ( subexons[ t.first ].geneId != -2 ) + return subexons[ t.first ].geneId ; + if ( subexons[ t.last ].geneId != -2 ) + return subexons[ t.last ].geneId ; + std::vector subexonInd ; + t.seVector.GetOnesIndices( subexonInd ) ; + return GetTranscriptGeneId( subexonInd, subexons ) ; +} + +void TranscriptDecider::InitTranscriptId() +{ + int i ; + for ( i = 0 ; i < usedGeneId - baseGeneId ; ++i ) + transcriptId[i] = 0 ; +} + +bool TranscriptDecider::IsStartOfMixtureStrandRegion( int tag, struct _subexon *subexons, int seCnt ) +{ + int j, k ; + int leftStrandCnt[2] = {0, 0}, rightStrandCnt[2] = {0, 0}; + for ( j = tag + 1 ; j < seCnt ; ++j ) + if ( subexons[j].start > subexons[j - 1].end + 1 ) + break ; + + for ( k = tag ; k < j ; ++k ) + { 
+ if ( subexons[k].leftStrand != 0 ) + ++leftStrandCnt[ ( subexons[k].leftStrand + 1 ) / 2 ] ; + if ( subexons[k].rightStrand != 0 ) + ++rightStrandCnt[ ( subexons[k].rightStrand + 1 ) / 2 ] ; + } + + if ( rightStrandCnt[0] > 0 && leftStrandCnt[0] == 0 && leftStrandCnt[1] > 0 ) + return true ; + if ( rightStrandCnt[1] > 0 && leftStrandCnt[1] == 0 && leftStrandCnt[0] > 0 ) + return true ; + return false ; +} + +// Return 0 - uncompatible or does not overlap at all. 1 - fully compatible. 2 - Head of the constraints compatible with the tail of the transcript +// the partial compatible case (return 2) mostly likely happen in DP where we have partial transcript. +int TranscriptDecider::IsConstraintInTranscript( struct _transcript transcript, struct _constraint &c ) +{ + //printf( "%d %d, %d %d\n", c.first, c.last, transcript.first, transcript.last ) ; + if ( c.first < transcript.first || c.first > transcript.last + || !transcript.seVector.Test( c.first ) + || ( !transcript.partial && !transcript.seVector.Test( c.last ) ) ) // no overlap or starts too early or some chosen subexons does not compatible + return 0 ; + + // Extract the subexons we should focus on. + int s, e ; + s = c.first ; + e = c.last ; + bool returnPartial = false ; + if ( e > transcript.last ) // constraints ends after the transcript. 
+ { + if ( transcript.partial ) + { + e = transcript.last ; + returnPartial = true ; + } + else + return 0 ; + } + /*printf( "%s: %d %d: (%d %d) (%d %d)\n", __func__, s, e, + transcript.seVector.Test(0), transcript.seVector.Test(1), + c.vector.Test(0), c.vector.Test(1) ) ;*/ + + compatibleTestVectorT.Assign( transcript.seVector ) ; + //compatibleTestVectorT.MaskRegionOutsideInRange( s, e, transcript.first, transcript.last ) ; + compatibleTestVectorT.MaskRegionOutside( s, e ) ; + + compatibleTestVectorC.Assign( c.vector ) ; + if ( c.last > transcript.last ) + { + //compatibleTestVectorC.MaskRegionOutsideInRange( s, e, c.first, c.last ) ; + //compatibleTestVectorC.MaskRegionOutside( s, e ) ; + compatibleTestVectorC.MaskRegionOutside( 0, e ) ; // Because the bits before s are already all 0s in C. + } + /*printf( "after masking %d %d. %d %d %d %d:\n", s, e, transcript.first, transcript.last, c.first, c.last ) ; + compatibleTestVectorT.Print() ; + compatibleTestVectorC.Print() ; */ + // Test compatible. + int ret = 0 ; + if ( compatibleTestVectorT.IsEqual( compatibleTestVectorC ) ) + { + if ( returnPartial ) + ret = 2 ; + else + ret = 1 ; + } + + return ret ; +} + +int TranscriptDecider::IsConstraintInTranscriptDebug( struct _transcript transcript, struct _constraint &c ) +{ + //printf( "%d %d, %d %d\n", c.first, c.last, transcript.first, transcript.last ) ; + if ( c.first < transcript.first || c.first > transcript.last ) // no overlap or starts too early. + return 0 ; + printf( "hi\n" ) ; + // Extract the subexons we should focus on. + int s, e ; + s = c.first ; + e = c.last ; + bool returnPartial = false ; + if ( e > transcript.last ) // constraints ends after the transcript. 
+ { + if ( transcript.partial ) + { + e = transcript.last ; + returnPartial = true ; + } + else + return 0 ; + } + /*printf( "%s: %d %d: (%d %d) (%d %d)\n", __func__, s, e, + transcript.seVector.Test(0), transcript.seVector.Test(1), + c.vector.Test(0), c.vector.Test(1) ) ;*/ + + compatibleTestVectorT.Assign( transcript.seVector ) ; + compatibleTestVectorT.MaskRegionOutside( s, e ) ; + + compatibleTestVectorC.Assign( c.vector ) ; + if ( e > transcript.last ) + compatibleTestVectorC.MaskRegionOutside( s, e ) ; + /*printf( "after masking: (%d %d) (%d %d)\n", + compatibleTestVectorT.Test(0), compatibleTestVectorT.Test(1), + compatibleTestVectorC.Test(0), compatibleTestVectorC.Test(1) ) ;*/ + + // Test compatible. + int ret = 0 ; + if ( compatibleTestVectorT.IsEqual( compatibleTestVectorC ) ) + { + if ( returnPartial ) + ret = 2 ; + else + ret = 1 ; + } + compatibleTestVectorT.Print() ; + compatibleTestVectorC.Print() ; + printf( "ret=%d\n", ret ) ; + return ret ; +} +int TranscriptDecider::SubTranscriptCount( int tag, struct _subexon *subexons, int *f ) +{ + if ( f[tag] != -1 ) + return f[tag] ; + + int ret = 0 ; + int i ; + if ( subexons[tag].canBeEnd ) + ret = 1 ; + for ( i = 0 ; i < subexons[tag].nextCnt ; ++i ) + { + ret += SubTranscriptCount( subexons[tag].next[i], subexons, f ) ; + } + + if ( ret == 0 ) + ret = 1 ; + return f[tag] = ret ; +} + +void TranscriptDecider::CoalesceSameTranscripts( std::vector &t ) +{ + int i, k ; + if ( t.size() == 0 ) + return ; + + std::sort( t.begin(), t.end(), CompSortTranscripts ) ; + + int size = t.size() ; + k = 0 ; + for ( i = 1 ; i < size ; ++i ) + { + if ( t[k].seVector.IsEqual( t[i].seVector ) ) + { + t[k].abundance += t[i].abundance ; + t[i].seVector.Release() ; + } + else + { + ++k ; + if ( i != k ) + t[k] = t[i] ; + } + } + t.resize( k + 1 ) ; +} + +void TranscriptDecider::EnumerateTranscript( int tag, int strand, int visit[], int vcnt, struct _subexon *subexons, SubexonCorrelation &correlation, double correlationScore, 
std::vector &alltranscripts, int &atcnt ) +{ + int i ; + visit[ vcnt ] = tag ; + //printf( "%s: %d %d %d %d. %d %d\n", __func__, vcnt, tag, subexons[tag].nextCnt, strand, subexons[tag].start, subexons[tag].end ) ; + // Compute the correlation score + double minCor = correlationScore ; + for ( i = 0 ; i < vcnt ; ++i ) + { + double tmp = correlation.Query( visit[i], visit[vcnt] ) ; + if ( tmp < minCor ) + minCor = tmp ; + } + + if ( subexons[tag].canBeEnd ) + { + struct _transcript &txpt = alltranscripts[atcnt] ; + for ( i = 0 ; i <= vcnt ; ++i ) + txpt.seVector.Set( visit[i] ) ; + + txpt.first = visit[0] ; + txpt.last = visit[vcnt] ; + txpt.partial = false ; + txpt.correlationScore = minCor ; + + //printf( "%lf %d %d ", txpt.correlationScore, vcnt, visit[0] ) ; + //txpt.seVector.Print() ; + ++atcnt ; + } + + for ( i = 0 ; i < subexons[tag].nextCnt ; ++i ) + { + int a = subexons[tag].next[i] ; + if ( !SubexonGraph::IsSameStrand( subexons[tag].rightStrand, strand ) + && subexons[a].start > subexons[tag].end + 1 ) + continue ; + int backupStrand = strand ; + if ( subexons[a].start > subexons[tag].end + 1 && strand == 0 ) + strand = subexons[tag].rightStrand ; + EnumerateTranscript( subexons[tag].next[i], strand, visit, vcnt + 1, subexons, correlation, minCor, alltranscripts, atcnt ) ; + strand = backupStrand ; + } +} + +void TranscriptDecider::SearchSubTranscript( int tag, int strand, int parents[], int pcnt, struct _dp &pdp, int visit[], int vcnt, int extends[], int extendCnt, +std::vector &tc, int tcStartInd, struct _dpAttribute &attr ) +{ + int i ; + int size ; + double cover ; + bool keepSearch = true ; + bool belowMin = false ; + + struct _subexon *subexons = attr.subexons ; + + visit[vcnt] = tag ; + ++vcnt ; + struct _dp visitdp ; + + visitdp.cover = -1 ; + + struct _transcript &subTxpt = attr.bufferTxpt ; + subTxpt.seVector.Reset() ; + for ( i = 0 ; i < pcnt ; ++i ) + subTxpt.seVector.Set( parents[i] ) ; + subTxpt.first = parents[0] ; + subTxpt.last = parents[ 
pcnt - 1] ; + for ( i = 0 ; i < vcnt ; ++i ) + subTxpt.seVector.Set( visit[i] ) ; + subTxpt.last = visit[ vcnt - 1 ] ; + subTxpt.partial = true ; + + // Adjust the extendsCnt + /*printf( "%s: %d %d %d\n", __func__, vcnt , extendCnt, extends[ extendCnt - 1] ) ; + subTxpt.seVector.Print() ; + tc[extends[extendCnt - 1]].vector.Print() ; + printf( "Adjust extend:\n") ;*/ + for ( i = extendCnt - 1 ; i >= 0 ; --i ) + { + if ( tc[ extends[i] ].last <= tag || ( tc[ extends[i] ].vector.Test( tag ) && IsConstraintInTranscript( subTxpt, tc[ extends[i] ] ) != 0 ) ) + break ; + } + extendCnt = i + 1 ; + + // If the extension ends. + subTxpt.partial = false ; + if ( subexons[tag].nextCnt > 0 && ( extendCnt == 0 || tag >= tc[ extends[ extendCnt - 1 ] ].last ) ) + { + // Solve the subtranscript beginning with visit. + // Now we got the optimal transcript for visit. + visitdp = SolveSubTranscript( visit, vcnt, strand, tc, tcStartInd, attr ) ; + keepSearch = false ; + } + //printf( "%s %d %d: visitdp.cover=%lf\n", __func__, parents[0], tag, visitdp.cover ) ; + + // the constraints across the parents and visit. + size = tc.size() ; + if ( visitdp.cover >= 0 ) + { + cover = visitdp.cover ; + // Reset the subTxpt, since its content is modofitied in SolveSubTxpt called above. 
+ subTxpt.seVector.Reset() ; + for ( i = 0 ; i < pcnt ; ++i ) + subTxpt.seVector.Set( parents[i] ) ; + subTxpt.seVector.Or( visitdp.seVector ) ; + subTxpt.first = parents[0] ; + subTxpt.last = visitdp.last ; + subTxpt.partial = false ; + + if ( !attr.forAbundance && attr.minAbundance > 0 ) + { + for ( i = 0 ; i < pcnt - 1 ; ++i ) + { + if ( attr.uncoveredPair.find( parents[i] * attr.seCnt + parents[i + 1] ) != attr.uncoveredPair.end() ) + belowMin = true ; + } + for ( i = -1 ; i < vcnt - 1 ; ++i ) + { + if ( i == -1 && pcnt >= 1 ) + { + if ( attr.uncoveredPair.find( parents[pcnt - 1] * attr.seCnt + visit[0] ) != attr.uncoveredPair.end() ) + belowMin = true ; + } + else + { + if ( attr.uncoveredPair.find( visit[i] * attr.seCnt + visit[i + 1] ) != attr.uncoveredPair.end() ) + belowMin = true ; + } + } + if ( attr.forAbundance && belowMin ) + cover = 1e-6 ; + } + + for ( i = tcStartInd ; i < size ; ++i ) + { + if ( tc[i].first > parents[ pcnt - 1] ) + break ; + + if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 1 ) + { + if ( tc[i].normAbund <= attr.minAbundance ) + { + belowMin = true ; + cover = -2 ; + break ; + } + + if ( tc[i].abundance <= 0 ) + continue ; + + if ( attr.forAbundance ) + { + if ( tc[i].normAbund < cover || cover == 0 ) + cover = tc[i].normAbund ; + } + else + { + ++cover ; + } + } + } + if ( belowMin && pdp.cover == -1 ) + { + pdp.cover = -2 ; + pdp.seVector.Assign( subTxpt.seVector ) ; + pdp.first = subTxpt.first ; + pdp.last = subTxpt.last ; + pdp.strand = strand ; + } + else if ( cover > pdp.cover ) + { + pdp.cover = cover ; + pdp.seVector.Assign( subTxpt.seVector ) ; + pdp.first = subTxpt.first ; + pdp.last = subTxpt.last ; + pdp.strand = strand ; + } + } + else if ( visitdp.cover == -2 && pdp.cover == -1 ) // no valid extension from visit + { + subTxpt.seVector.Reset() ; + for ( i = 0 ; i < pcnt ; ++i ) + subTxpt.seVector.Set( parents[i] ) ; + subTxpt.seVector.Or( visitdp.seVector ) ; + subTxpt.first = parents[0] ; + subTxpt.last = 
visitdp.last ; + + pdp.cover = -2 ; + pdp.seVector.Assign( subTxpt.seVector ) ; + pdp.first = subTxpt.first ; + pdp.last = subTxpt.last ; + pdp.strand = strand ; + } + + if ( subexons[tag].canBeEnd && ( visitdp.cover < 0 || attr.forAbundance ) ) + // This works is because that the extension always covers more constraints. So we only go this branch if the extension does not work + // and it goes this branch if it violates minAbundance + // But we need to go here when we want to compute the maxAbundance transcript. + // This part also works as the exit point of the recurive function. + { + bool belowMin = false ; + subTxpt.seVector.Reset() ; + for ( i = 0 ; i < pcnt ; ++i ) + subTxpt.seVector.Set( parents[i] ) ; + for ( i = 0 ; i < vcnt ; ++i ) + subTxpt.seVector.Set( visit[i] ) ; + subTxpt.first = parents[0] ; + subTxpt.last = visit[ vcnt - 1] ; + subTxpt.partial = false ; + + cover = 0 ; + if ( attr.forAbundance || attr.minAbundance > 0 ) + { + for ( i = 0 ; i < pcnt - 1 ; ++i ) + { + if ( attr.uncoveredPair.find( parents[i] * attr.seCnt + parents[i + 1] ) != attr.uncoveredPair.end() ) + belowMin = true ; + } + for ( i = -1 ; i < vcnt - 1 ; ++i ) + { + if ( i == -1 && pcnt >= 1 ) + { + if ( attr.uncoveredPair.find( parents[pcnt - 1] * attr.seCnt + visit[0] ) != attr.uncoveredPair.end() ) + belowMin = true ; + } + else + { + if ( attr.uncoveredPair.find( visit[i] * attr.seCnt + visit[i + 1] ) != attr.uncoveredPair.end() ) + belowMin = true ; + } + } + + //if ( belowMin == true ) + // printf( "turned belowMin. %d. %d %d: %d %d %d\n", attr.uncoveredPair.size(), pcnt, vcnt, parents[0], visit[0], visit[ vcnt - 1] ) ; + + if ( attr.forAbundance && belowMin ) + cover = 1e-6 ; + } + + for ( i = tcStartInd ; i < size ; ++i ) + { + // note that the value is parents[ pcnt - 1], because + // in above the part of "visit" is computed in SolveSubTranscript( visit ). 
+ if ( tc[i].first > visit[ vcnt - 1] ) + break ; + if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 1 ) + { + if ( tc[i].normAbund <= attr.minAbundance ) + { + belowMin = true ; + cover = -2 ; + break ; + } + + if ( tc[i].abundance <= 0 ) + continue ; + if ( attr.forAbundance ) + { + if ( tc[i].normAbund < cover || cover == 0 ) + cover = tc[i].normAbund ; + } + else + { + ++cover ; + } + } + } + + if ( belowMin && pdp.cover == -1 ) + { + pdp.cover = -2 ; + pdp.seVector.Assign( subTxpt.seVector ) ; + pdp.first = subTxpt.first ; + pdp.last = subTxpt.last ; + pdp.strand = strand ; + } + else if ( cover > pdp.cover ) + { + pdp.cover = cover ; + pdp.seVector.Assign( subTxpt.seVector ) ; + pdp.first = subTxpt.first ; + pdp.last = subTxpt.last ; + pdp.strand = strand ; + } + } + //printf( "%s %d: pdp.cover=%lf\n", __func__, tag, pdp.cover ) ; + + // keep searching. + if ( keepSearch ) + { + for ( i = 0 ; i < subexons[tag].nextCnt ; ++i ) + { + int b = subexons[tag].next[i] ; + if ( ( SubexonGraph::IsSameStrand( subexons[tag].rightStrand, strand ) + && SubexonGraph::IsSameStrand( subexons[b].leftStrand, strand ) ) || + subexons[b].start == subexons[tag].end + 1 ) + { + int backupStrand = strand ; + if ( subexons[b].start > subexons[tag].end + 1 ) + strand = subexons[tag].rightStrand ; + + SearchSubTranscript( subexons[tag].next[i], strand, parents, pcnt, pdp, visit, vcnt, + extends, extendCnt, tc, tcStartInd, attr ) ; + strand = backupStrand ; + } + } + + } + + return ; +} + +struct _dp TranscriptDecider::SolveSubTranscript( int visit[], int vcnt, int strand, std::vector &tc, int tcStartInd, struct _dpAttribute &attr ) +{ + int i ; + int size ; + /*printf( "%s: ", __func__ ) ; + for ( i = 0 ; i < vcnt ; ++i ) + printf( "%d ", visit[i] ) ; + printf( ": %lf %d %d", attr.f1[ visit[0] ].cover, attr.f1[ visit[0] ].timeStamp, attr.timeStamp ) ; + printf( "\n" ) ;*/ + // Test whether it is stored in dp + if ( vcnt == 1 ) + { + if ( attr.f1[ visit[0] ].cover != -1 && attr.f1[ 
visit[0] ].strand == strand && ( attr.f1[ visit[0] ].timeStamp == attr.timeStamp || + ( attr.f1[ visit[0] ].minAbundance < attr.minAbundance && attr.f1[visit[0]].cover == -2 ) ) ) //even given lower minAbundance threshold, it fails + { + return attr.f1[ visit[0] ] ; + } + } + else if ( vcnt == 2 && attr.f2 ) + { + int a = visit[0] ; + int b = visit[1] ; + + if ( attr.f2[a][b].cover != -1 && attr.f2[a][b].strand == strand && ( attr.f2[a][b].timeStamp == attr.timeStamp || + ( attr.f2[a][b].minAbundance < attr.minAbundance && attr.f2[a][b].cover == -2 ) ) ) + { + return attr.f2[a][b] ; + } + } + else + { + int key = 0 ; + for ( i = 0 ; i < vcnt ; ++i ) + key = ( key * attr.seCnt + visit[i] ) % hashMax ; + if ( key < 0 ) + key += hashMax ; + + if ( attr.hash[key].cover != -1 && attr.hash[key].cnt == vcnt && attr.hash[key].strand == strand && + ( attr.hash[key].first == visit[0] ) && + ( attr.hash[key].timeStamp == attr.timeStamp || + ( attr.hash[key].minAbundance < attr.minAbundance && attr.hash[key].cover == -2 ) ) ) + { + struct _transcript subTxpt = attr.bufferTxpt ; + subTxpt.seVector.Reset() ; + for ( i = 0 ; i < vcnt ; ++i ) + subTxpt.seVector.Set( visit[i] ) ; + //subTxpt.seVector.Print() ; + //attr.hash[key].seVector.Print() ; + subTxpt.seVector.Xor( attr.hash[key].seVector ) ; + subTxpt.seVector.MaskRegionOutside( visit[0], visit[ vcnt - 1] ) ; + //printf( "hash test: %d %d\n", key, subTxpt.seVector.IsAllZero() ) ; + if ( subTxpt.seVector.IsAllZero() ) + { + return attr.hash[key] ; + } + + // Can't use the code below, because vcnt is the header of subexons. 
+ /*for ( i = 0 ; i < vcnt ; ++i ) + if ( !attr.hash[key].seVector.Test( visit[i] ) ) + break ; + if ( i >= vcnt ) + return attr.hash[key] ;*/ + + } + } + // adjust tcStartInd + size = tc.size() ; + for ( i = tcStartInd ; i < size ; ++i ) + if ( tc[i].first >= visit[0] ) + break ; + tcStartInd = i ; + + + struct _subexon *subexons = attr.subexons ; + struct _dp visitdp ; + visitdp.seVector.Init( attr.seCnt ) ; + visitdp.cover = -1 ; + + struct _transcript &subTxpt = attr.bufferTxpt ; + // This happens when it is called from PickTranscriptsByDP, the first subexon might be the end. + subTxpt.seVector.Reset() ; + for ( i = 0 ; i < vcnt ; ++i ) + subTxpt.seVector.Set( visit[i] ) ; + subTxpt.first = visit[0] ; + subTxpt.last = visit[vcnt - 1] ; + + if ( subexons[ visit[vcnt - 1] ].canBeEnd ) + { + subTxpt.partial = false ; + double cover = 0 ; + for ( i = tcStartInd ; i < size ; ++i ) + { + if ( tc[i].first > subTxpt.last ) + break ; + + if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 1 ) + { + if ( tc[i].normAbund <= attr.minAbundance ) + { + cover = -2 ; + break ; + } + + if ( tc[i].abundance <= 0 ) + continue ; + if ( attr.forAbundance ) + { + if ( tc[i].normAbund < cover || cover == 0 ) + cover = tc[i].normAbund ; + } + else + ++cover ; + } + } + + visitdp.seVector.Assign( subTxpt.seVector ) ; + visitdp.cover = cover ; + visitdp.first = subTxpt.first ; + visitdp.last = subTxpt.last ; + visitdp.strand = strand ; + } + + // Now we extend. + size = tc.size() ; + int *extends = new int[tc.size() - tcStartInd + 1] ; + int extendCnt = 0 ; + subTxpt.partial = true ; + for ( i = tcStartInd ; i < size ; ++i ) + { + if ( tc[i].first > subTxpt.last ) + break ; + if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 2 ) + { + extends[extendCnt] = i ; + ++extendCnt ; + } + } + + // Sort the extend by the index of the last subexon. 
+ if ( extendCnt > 0 ) + { + struct _pair32 *extendsPairs = new struct _pair32[extendCnt] ; + + for ( i = 0 ; i < extendCnt ; ++i ) + { + extendsPairs[i].a = extends[i] ; + extendsPairs[i].b = tc[ extends[i] ].last ; + } + qsort( extendsPairs, extendCnt, sizeof( struct _pair32 ), CompPairsByB ) ; + + for ( i = 0 ; i < extendCnt ; ++i ) + extends[i] = extendsPairs[i].a ; + + delete[] extendsPairs ; + } + + size = subexons[ visit[vcnt - 1] ].nextCnt ; + int nextvCnt = 1 ; + if ( extendCnt > 0 && tc[ extends[ extendCnt - 1 ] ].last - visit[ vcnt - 1 ] > 1 ) + nextvCnt = tc[ extends[ extendCnt - 1 ] ].last - visit[ vcnt - 1 ] ; + int *nextv = new int[ nextvCnt ] ; + for ( i = 0 ; i < size ; ++i ) + { + int a = visit[vcnt - 1] ; + int b = subexons[a].next[i] ; + if ( ( SubexonGraph::IsSameStrand( subexons[a].rightStrand, strand ) + && SubexonGraph::IsSameStrand( subexons[b].leftStrand, strand ) ) + || + subexons[b].start == subexons[a].end + 1 ) + { + int backupStrand = strand ; + if ( subexons[b].start > subexons[a].end + 1 ) + strand = subexons[a].rightStrand ; + SearchSubTranscript( subexons[ visit[vcnt - 1] ].next[i], strand, visit, vcnt, visitdp, nextv, 0, extends, extendCnt, tc, tcStartInd, attr ) ; + strand = backupStrand ; + + } + } + //printf( "%s %d(%d) %d %d %d: %lf\n", __func__, visit[0], subexons[ visit[vcnt - 1] ].canBeEnd, size, extendCnt, strand, visitdp.cover ) ; + delete[] nextv ; + delete[] extends ; + + // store the result in the dp structure. + // We return the structure stored in dp to simplify the memory access pattern. 
+ // In other words, we assume the structure returned from this function always uses the memory from attr.dp + if ( vcnt == 1 ) + { + SetDpContent( attr.f1[ visit[0] ], visitdp, attr ) ; + visitdp.seVector.Release() ; + return attr.f1[ visit[0] ] ; + } + else if ( vcnt == 2 && attr.f2 ) + { + SetDpContent( attr.f2[ visit[0] ][ visit[1] ], visitdp, attr ) ; + visitdp.seVector.Release() ; + return attr.f2[ visit[0] ][ visit[1] ] ; + } + else + { + int key = 0 ; + for ( i = 0 ; i < vcnt ; ++i ) + key = ( key * attr.seCnt + visit[i] ) % hashMax ; + if ( key < 0 ) + key += hashMax ; + + //static int hashUsed = 0 ; + //if ( attr.hash[key].cover == -1 ) + // ++hashUsed ; + //printf( "%d/%d\n", hashUsed, HASH_MAX) ; + //printf( "hash write: %d\n", key ) ; + SetDpContent( attr.hash[key], visitdp, attr ) ; + attr.hash[key].cnt = vcnt ; + visitdp.seVector.Release() ; + return attr.hash[key] ; + } +} + + +void TranscriptDecider::PickTranscriptsByDP( struct _subexon *subexons, int seCnt, int iterBound, Constraints &constraints, SubexonCorrelation &correlation, struct _dpAttribute &attr, std::vector &alltranscripts ) +{ + int i, j, k ; + + std::vector transcripts ; + std::vector &tc = constraints.constraints ; + int tcCnt = tc.size() ; + int coalesceThreshold = 1024 ; + + //printf( "tcCnt=%d\n", tcCnt ) ; + + attr.timeStamp = 1 ; + attr.bufferTxpt.seVector.Init( seCnt ) ; + attr.subexons = subexons ; + attr.seCnt = seCnt ; + + double maxAbundance = -1 ; + // Initialize the dp data structure + /*memset( attr.f1, -1, sizeof( struct _dp ) * seCnt ) ; + for ( i = 0 ; i < seCnt ; ++i ) + memset( attr.f2[i], -1, sizeof( struct _dp ) * seCnt ) ; + memset( attr.hash, -1, sizeof( struct _dp ) * HASH_MAX ) ;*/ + for ( i = 0 ; i < seCnt ; ++i ) + ResetDpContent( attr.f1[i] ) ; + for ( i = 0 ; i < seCnt && attr.f2 ; ++i ) + for ( j = i ; j < seCnt ; ++j ) + ResetDpContent( attr.f2[i][j] ) ; + for ( i = 0 ; i < hashMax ; ++i ) + ResetDpContent( attr.hash[i] ) ; + + // Set the uncovered pair 
+ attr.uncoveredPair.clear() ; + BitTable bufferTable( seCnt ) ; + k = 0 ; + for ( i = 0 ; i < seCnt ; ++i ) + { + for ( ; k < tcCnt ; ++k ) + { + if ( tc[k].last >= i ) + break ; + } + + if ( k >= tcCnt || tc[k].first > i ) + { + for ( j = 0 ; j < subexons[i].nextCnt ; ++j ) + { + attr.uncoveredPair[i * seCnt + subexons[i].next[j] ] = 1 ; + } + continue ; + } + + for ( j = 0 ; j < subexons[i].nextCnt ; ++j ) + { + bool covered = false ; + int l, n ; + + n = subexons[i].next[j] ; + for ( l = k ; l < tcCnt ; ++l ) + { + if ( tc[l].first > i ) + break ; + if ( tc[l].vector.Test( i ) && tc[l].vector.Test( n ) ) + { + if ( n == i + 1 ) + { + covered = true ; + break ; + } + else + { + bufferTable.Assign( tc[l].vector ) ; + bufferTable.MaskRegionOutside( i + 1, n - 1 ) ; + if ( bufferTable.IsAllZero() ) + { + covered = true ; + break ; + } + } + } + } + + if ( !covered ) + { + //printf( "set!: (%d: %d %d) (%d: %d %d)\n", i, subexons[i].start, subexons[i].end, n, subexons[n].start, subexons[n].end ) ; + attr.uncoveredPair[ i * seCnt + n ] = 1 ; + } + } + } + bufferTable.Release() ; + + + // Find the max abundance + attr.forAbundance = true ; + attr.minAbundance = 0 ; + for ( i = 0 ; i < seCnt ; ++i ) + { + if ( subexons[i].canBeStart ) + { + int visit[1] = {i} ; + struct _dp tmp ; + + tmp = SolveSubTranscript( visit, 1, 0, tc, 0, attr ) ; + + if ( tmp.cover > maxAbundance ) + maxAbundance = tmp.cover ; + } + } + //PrintLog( "maxAbundance=%lf", maxAbundance ) ; + //exit( 1 ) ; + + // Pick the transcripts. Quantative Set-Cover + // Notice that by the logic in SearchSubTxpt and SolveSubTxpt, we don't need to reinitialize the data structure. 
+ attr.forAbundance = false ; + int *coveredTc = new int[tcCnt] ; + int coveredTcCnt ; + struct _dp maxCoverDp ; + struct _dp bestDp ; + std::map cachedCoverResult ; + + maxCoverDp.seVector.Init( seCnt ) ; + bestDp.seVector.Init( seCnt ) ; + int iterCnt = 0 ; + + while ( 1 ) + { + double bestScore ; + + // iterately assign constraints + attr.minAbundance = 0 ; + + // Find the best candidate transcript. + bestDp.cover = -1 ; + bestScore = -1 ; + while ( 1 ) + { + // iterate the change of minAbundance + if ( cachedCoverResult.find( attr.minAbundance ) != cachedCoverResult.end() ) + { + struct _dp tmp = cachedCoverResult[ attr.minAbundance ] ; + SetDpContent( maxCoverDp, tmp, attr ) ; + } + else + { + maxCoverDp.cover = -1 ; + ++attr.timeStamp ; + for ( i = 0 ; i < seCnt ; ++i ) + { + if ( subexons[i].canBeStart == false ) + continue ; + int visit[1] = {i} ; + struct _dp tmp ; + tmp = SolveSubTranscript( visit, 1, 0, tc, 0, attr ) ; + + if ( tmp.cover > maxCoverDp.cover && tmp.cover > 0 ) + { + SetDpContent( maxCoverDp, tmp, attr ) ; + } + //if ( subexons[i].start == 6870264 || subexons[i].start == 6872237 ) + // printf( "%d: %lf\n", i, tmp.cover ) ; + } + + if ( maxCoverDp.cover == -1 ) + break ; + struct _dp ccr ; + ccr.seVector.Init( seCnt ) ; + SetDpContent( ccr, maxCoverDp, attr ) ; + cachedCoverResult[ attr.minAbundance ] = ccr ; + } + // the abundance for the max cover txpt. 
+ double min = -1 ; + struct _transcript &subTxpt = attr.bufferTxpt ; + subTxpt.seVector.Assign( maxCoverDp.seVector ) ; + subTxpt.first = maxCoverDp.first ; + subTxpt.last = maxCoverDp.last ; + + for ( i = 0 ; i < tcCnt ; ++i ) + { + if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 1 ) + { + if ( tc[i].normAbund < min || min == -1 ) + min = tc[i].normAbund ; + } + } + + if ( attr.minAbundance == 0 ) + { + std::vector subexonIdx ; + maxCoverDp.seVector.GetOnesIndices( subexonIdx ) ; + int size = subexonIdx.size() ; + for ( i = 0 ; i < size - 1 ; ++i ) + if ( attr.uncoveredPair.find( subexonIdx[i] * seCnt + subexonIdx[i + 1] ) != attr.uncoveredPair.end() ) + { + min = 1e-6 ; + break ; + } + } + + double score = ComputeScore( maxCoverDp.cover, 1.0, min, maxAbundance, 0 ) ; + if ( bestScore == -1 || score > bestScore ) + { + bestScore = score ; + SetDpContent( bestDp, maxCoverDp, attr ) ; + } + else if ( score < bestScore ) + { + if ( ComputeScore( maxCoverDp.cover, 1.0, maxAbundance, maxAbundance, 0 ) < bestScore ) + break ; + } + //PrintLog( "normAbund=%lf maxCoverDp.cover=%lf score=%lf timeStamp=%d", min, maxCoverDp.cover, score, attr.timeStamp ) ; + attr.minAbundance = min ; + } // end of iteration for minAbundance. + + if ( bestDp.cover == -1 ) + break ; + // Assign the constraints. 
+ coveredTcCnt = 0 ; + double update = -1 ; + struct _transcript &subTxpt = attr.bufferTxpt ; + subTxpt.seVector.Assign( bestDp.seVector ) ; + subTxpt.first = bestDp.first ; + subTxpt.last = bestDp.last ; + subTxpt.partial = false ; + for ( i = 0 ; i < tcCnt ; ++i ) + { + if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 1 ) + { + if ( tc[i].abundance > 0 && + ( tc[i].abundance < update || update == -1 ) ) + { + update = tc[i].abundance ; + } + coveredTc[ coveredTcCnt ] = i ; + ++coveredTcCnt ; + } + /*else + { + printf( "%d: ", i ) ; + tc[i].vector.Print() ; + if ( i == 127 ) + { + printf( "begin debug:\n" ) ; + IsConstraintInTranscriptDebug( subTxpt, tc[i] ) ; + } + }*/ + } + update *= ( 1 + iterCnt / 50 ) ;//* ( 1 + iterCnt / 50 ) ; + + //PrintLog( "%d: update=%lf %d %d. %d %d %d", iterCnt, update, coveredTcCnt, tcCnt, + // bestDp.first, bestDp.last, subexons[ bestDp.first ].start ) ; + //bestDp.seVector.Print() ; + + struct _transcript nt ; + nt.seVector.Duplicate( bestDp.seVector ) ; + nt.first = bestDp.first ; + nt.last = bestDp.last ; + nt.partial = false ; + nt.abundance = 0 ; + for ( i = 0 ; i < coveredTcCnt ; ++i ) + { + j = coveredTc[i] ; + if ( tc[j].abundance > 0 ) + { + double tmp = ( tc[j].abundance > update ? 
update : tc[j].abundance ) ; + tc[j].abundance -= tmp ; + double factor = 1 ; + + nt.abundance += ( tc[j].support * update / tc[j].normAbund * factor ) ; + + if ( tc[j].abundance <= 0 ) + { + std::vector removeKey ; + for ( std::map::iterator it = cachedCoverResult.begin() ; it != cachedCoverResult.end() ; ++it ) + { + subTxpt.seVector.Assign( it->second.seVector ) ; + subTxpt.first = it->second.first ; + subTxpt.last = it->second.last ; + subTxpt.partial = false ; + if ( IsConstraintInTranscript( subTxpt, tc[j] ) == 1 ) + { + it->second.seVector.Release() ; + removeKey.push_back( it->first ) ; + } + } + int size = removeKey.size() ; + int l ; + for ( l = 0 ; l < size ; ++l ) + cachedCoverResult.erase( removeKey[l] ) ; + } + } + + if ( tc[j].abundance < 0 ) + tc[j].abundance = 0 ; + } + + transcripts.push_back( nt ) ; + if ( transcripts.size() >= transcripts.capacity() && (int)transcripts.size() >= coalesceThreshold ) + { + CoalesceSameTranscripts( transcripts ) ; + if ( transcripts.size() >= transcripts.capacity() / 2 ) + coalesceThreshold *= 2 ; + } + ++iterCnt ; + + if ( iterCnt >= iterBound ) + break ; + } + CoalesceSameTranscripts( transcripts ) ; + int size = transcripts.size() ; + // Compute the correlation score + for ( i = 0 ; i < size ; ++i ) + { + std::vector subexonInd ; + transcripts[i].seVector.GetOnesIndices( subexonInd ) ; + double cor = 2.0 ; + int s = subexonInd.size() ; + for ( j = 0 ; j < s ; ++j ) + for ( k = j + 1 ; k < s ; ++k ) + { + double tmp = correlation.Query( subexonInd[j], subexonInd[k] ) ; + if ( tmp < cor ) + cor = tmp ; + } + if ( cor > 1 ) + cor = 0 ; + transcripts[i].correlationScore = cor ; + } + + // store the result + for ( i = 0 ; i < size ; ++i ) + alltranscripts.push_back( transcripts[i] ) ; + + // Release the memory + for ( std::map::iterator it = cachedCoverResult.begin() ; it != cachedCoverResult.end() ; ++it ) + { + it->second.seVector.Release() ; + } + attr.bufferTxpt.seVector.Release() ; + + delete[] coveredTc ; + 
maxCoverDp.seVector.Release() ; + bestDp.seVector.Release() ; +} + + +// Add the preifx/suffix of transcripts to the list +void TranscriptDecider::AugmentTranscripts( struct _subexon *subexons, std::vector &alltranscripts, int limit, bool extend ) +{ + int i, j, k ; + int size = alltranscripts.size() ; + if ( size >= limit ) + return ; + + // Augment suffix, prefix transcripts + for ( i = 0 ; i < size ; ++i ) + { + std::vector subexonIdx ; + alltranscripts[i].seVector.GetOnesIndices( subexonIdx ) ; + int seIdxCnt = subexonIdx.size() ; + // suffix + for ( j = 1 ; j < seIdxCnt ; ++j ) + { + if ( subexons[ subexonIdx[j] ].canBeStart ) + { + struct _transcript nt ; + nt.first = subexonIdx[j] ; + nt.last = alltranscripts[i].last ; + nt.seVector.Duplicate( alltranscripts[i].seVector ) ; + nt.seVector.MaskRegionOutside( nt.first, nt.last ) ; + nt.partial = false ; + nt.correlationScore = 0 ; + nt.abundance = 0 ; + nt.constraintsSupport = NULL ; + + alltranscripts.push_back( nt ) ; + if ( alltranscripts.size() >= limit ) + return ; + } + } + + // prefix + for ( j = 0 ; j < seIdxCnt - 1 ; ++j ) + { + if ( subexons[ subexonIdx[j] ].canBeEnd ) + { + struct _transcript nt ; + nt.first = alltranscripts[i].first ; + nt.last = subexonIdx[j] ; + nt.seVector.Duplicate( alltranscripts[i].seVector ) ; + nt.seVector.MaskRegionOutside( nt.first, nt.last ) ; + nt.partial = false ; + nt.correlationScore = 0 ; + nt.abundance = 0 ; + nt.constraintsSupport = NULL ; + + alltranscripts.push_back( nt ) ; + if ( alltranscripts.size() >= limit ) + return ; + } + } + + if ( extend ) + { + //Extentions right. 
+ for ( j = 0 ; j < seIdxCnt ; ++j ) + { + if ( subexons[ subexonIdx[j] ].nextCnt > 1 ) + { + for ( k = 0 ; k < subexons[ subexonIdx[j] ].nextCnt ; ++k ) + { + int idx = subexons[ subexonIdx[j] ].next[k] ; + + if ( alltranscripts[i].seVector.Test( idx ) ) + continue ; + int l ; + std::vector visited ; + while ( 1 ) + { + if ( subexons[idx].nextCnt > 1 || subexons[idx].prevCnt > 1 ) + { + break ; + } + + visited.push_back( idx ) ; + if ( subexons[idx].canBeEnd && subexons[idx].nextCnt == 0 ) + { + struct _transcript nt ; + nt.first = alltranscripts[i].first ; + nt.last = idx ; + nt.seVector.Duplicate( alltranscripts[i].seVector ) ; + nt.seVector.MaskRegionOutside( nt.first, subexonIdx[j] ) ; + int visitedSize = visited.size() ; + for ( l = 0 ; l < visitedSize ; ++l ) + nt.seVector.Set( visited[l] ) ; + nt.partial = false ; + nt.correlationScore = 0 ; + nt.abundance = 0 ; + nt.constraintsSupport = NULL ; + + alltranscripts.push_back( nt ) ; + if ( alltranscripts.size() >= limit ) + return ; + } + + if ( subexons[idx].nextCnt == 1 ) + idx = subexons[idx].next[0] ; + else + break ; + } + } + } + } + + // Extension towards left + for ( j = 0 ; j < seIdxCnt ; ++j ) + { + if ( subexons[ subexonIdx[j] ].prevCnt > 1 ) + { + for ( k = 0 ; k < subexons[ subexonIdx[j] ].prevCnt ; ++k ) + { + int idx = subexons[ subexonIdx[j] ].prev[k] ; + + if ( alltranscripts[i].seVector.Test( idx ) ) + continue ; + int l ; + std::vector visited ; + while ( 1 ) + { + if ( subexons[idx].nextCnt > 1 || subexons[idx].prevCnt > 1 ) + { + break ; + } + + visited.push_back( idx ) ; + if ( subexons[idx].canBeStart && subexons[idx].prevCnt == 0 ) + { + struct _transcript nt ; + nt.first = idx ; + nt.last = alltranscripts[i].last ; + nt.seVector.Duplicate( alltranscripts[i].seVector ) ; + nt.seVector.MaskRegionOutside( subexonIdx[j], nt.last ) ; + int visitedSize = visited.size() ; + for ( l = 0 ; l < visitedSize ; ++l ) + nt.seVector.Set( visited[l] ) ; + nt.partial = false ; + nt.correlationScore = 
0 ; + nt.abundance = 0 ; + nt.constraintsSupport = NULL ; + + alltranscripts.push_back( nt ) ; + if ( alltranscripts.size() >= limit ) + return ; + } + + if ( subexons[idx].prevCnt == 1 ) + idx = subexons[idx].prev[0] ; + else + break ; + } + } + } + } + } // for if-extend + } + + CoalesceSameTranscripts( alltranscripts ) ; +} + +// Pick the transcripts from given transcripts. +void TranscriptDecider::PickTranscripts( struct _subexon *subexons, std::vector &alltranscripts, Constraints &constraints, + SubexonCorrelation &seCorrelation, std::vector &transcripts ) +{ + int i, j, k ; + std::vector chosen ; + std::vector &tc = constraints.matePairs ; + int atcnt = alltranscripts.size() ; + int tcCnt = tc.size() ; // transcript constraints + int seCnt = 0 ; + + if ( tcCnt == 0 ) + return ; + if ( atcnt > 0 ) + seCnt = alltranscripts[0].seVector.GetSize() ; + else + return ; + + double inf = -1 ; // infinity + int coalesceThreshold = 1024 ; + int *transcriptSeCnt = new int[ atcnt ] ; + int *transcriptLength = new int[atcnt] ; + double *transcriptAbundance = new double[atcnt] ; // the roughly estimated abundance based on constraints. + double *avgTranscriptAbundance = new double[atcnt] ; // the average normAbund from the compatible constraints. + + BitTable *btable = new BitTable[ atcnt ] ; + //BitTable lowCovSubexon ; // force the abundance to 0 for the transcript contains the subexon. 
+ double *coveredPortion = new double[atcnt] ; + + memset( avgTranscriptAbundance, 0 ,sizeof( double ) * atcnt ) ; + for ( i = 0 ; i < atcnt ; ++i ) + btable[i].Init( tcCnt ) ; + for ( j = 0 ; j < tcCnt ; ++j ) + { + int a = constraints.matePairs[j].i ; + int b = constraints.matePairs[j].j ; + + if ( constraints.constraints[a].support > inf ) + inf = constraints.constraints[a].support ; + if ( constraints.constraints[b].support > inf ) + inf = constraints.constraints[b].support ; + + if ( tc[j].normAbund > inf ) + inf = tc[j].normAbund ; + + tc[j].abundance = tc[j].normAbund ; + } + ++inf ; + bool btableSet = false ; + for ( i = 0 ; i < atcnt ; ++i ) + { + //printf( "correlation %d: %lf\n", i, alltranscripts[i].correlationScore ) ; + /*for ( int l = 0 ; l < subexonInd.size() ; ++l ) + { + for ( int m = l ; m < subexonInd.size() ; ++m ) + printf( "%lf ", seCorrelation.Query( l, m ) ) ; + printf( "\n" ) ; + }*/ + + for ( j = 0 ; j < tcCnt ; ++j ) + { + int a = tc[j].i ; + int b = tc[j].j ; + + //printf( "try set btble[ %d ].Set( %d ): %d %d\n", i, j, a, b ) ; + //alltranscripts[i].seVector.Print() ; + //constraints.constraints[a].vector.Print() ; + //constraints.constraints[b].vector.Print() ; + if ( IsConstraintInTranscript( alltranscripts[i], constraints.constraints[a] ) == 1 + && IsConstraintInTranscript( alltranscripts[i], constraints.constraints[b] ) == 1 ) + { + //printf( "set btble[ %d ].Set( %d ): %d %d\n", i, j, a, b ) ; + btable[i].Set( j ) ; + btableSet = true ; + } + } + transcriptSeCnt[i] = alltranscripts[i].seVector.Count() ; + } + if ( btableSet == false ) + { + for ( i = 0 ; i < atcnt ; ++i ) + btable[i].Release() ; + delete[] btable ; + return ; + } + + double maxAbundance = -1 ; // The abundance of the most-abundant transcript + double *adjustScore = new double[atcnt] ; + memset( adjustScore, 0, sizeof( double ) * atcnt ) ; + if ( atcnt > 0 /*&& alltranscripts[0].abundance == -1*/ ) + { + struct _pair32 *chain = new struct _pair32[seCnt] ; + bool 
*covered = new bool[seCnt] ; + bool *usedConstraints = new bool[constraints.constraints.size() ] ; + std::vector togetherChain ; // those subexons is more likely to show up in the same transcript, like an IR with overhang, should be together to represent a 3'/5'-end + + /*lowCovSubexon.Init( seCnt ) ; + double *avgDepth = new double[seCnt ] ; + + memset( avgDepth, 0, sizeof( double ) * seCnt ) ; + int size = constraints.constraints.size() ; + for ( i = 0 ; i < size ; ++i ) + { + std::vector subexonIdx ; + constraints.constraints[i].GetOnesIndices( subexonIdx ) ; + int seIdxCnt = subexonidx.size() ; + for ( j = 0 ; j < seIdxCnt ; ++j ) + avgDepth[ subexonidx[j] ] += constraints.constraints[i].support ; + } + for ( i = 0 ; i < seCnt ; ++i ) + { + if ( avgDepth[i] * alignments.readLen / (double)( subexons[i].end - subexons[i].start + 1 ) < 1 ) + }*/ + + struct _pair32 firstRegion, lastRegion ; + + for ( i = 0 ; i < seCnt ; ) + { + for ( j = i + 1 ; j < seCnt ; ++j ) + { + if ( subexons[j].start > subexons[j - 1].end + 1 ) + break ; + } + + + int cnt = 0 ; + for ( k = i ; k < j ; ++k ) + { + if ( ( subexons[k].leftType == 2 && subexons[k].rightType == 1 ) + || ( subexons[k].leftType == 0 && subexons[k].rightType == 1 ) + || ( subexons[k].leftType == 2 && subexons[k].rightType == 0 ) ) + ++cnt ; + } + + if ( cnt <= 1 ) + { + i = j ; + continue ; + } + + BitTable tmpTable( seCnt ) ; + for ( k = i ; k < j ; ++k ) + { + if ( ( subexons[k].leftType == 2 && subexons[k].rightType == 1 ) + || ( subexons[k].leftType == 0 && subexons[k].rightType == 1 ) + || ( subexons[k].leftType == 2 && subexons[k].rightType == 0 ) ) + tmpTable.Set( k ) ; + } + togetherChain.push_back( tmpTable ) ; + i = j ; + } + + for ( i = 0 ; i < atcnt ; ++i ) + { + double value = inf ; + int tag = -1 ; + + alltranscripts[i].abundance = 0 ; + alltranscripts[i].constraintsSupport = new double[tcCnt] ; + + std::vector subexonIdx ; + alltranscripts[i].seVector.GetOnesIndices( subexonIdx ) ; + int seIdxCnt = 
subexonIdx.size() ; + transcriptLength[i] = 0 ; + + firstRegion.a = subexonIdx[0] ; + for ( j = 1 ; j < seIdxCnt ; ++j ) + { + if ( subexons[ subexonIdx[j] ].start > subexons[ subexonIdx[j - 1] ].end + 1 ) + break ; + } + firstRegion.b = subexonIdx[j - 1] ; + lastRegion.b = subexonIdx[ seIdxCnt - 1 ] ; + for ( j = seIdxCnt - 2 ; j >= 0 ; --j ) + { + if ( subexons[ subexonIdx[j] ].end < subexons[ subexonIdx[j + 1] ].start - 1 ) + break ; + } + lastRegion.a = subexonIdx[j + 1] ; + + for ( j = 0 ; j < seIdxCnt ; ++j ) + transcriptLength[i] += subexons[ subexonIdx[j] ].end - subexons[ subexonIdx[j] ].start + 1 ; + + //for ( j = firstRegion.b ; j < lastRegion.a ; ++j ) + for ( j = 0 ; j < seIdxCnt - 1 ; ++j ) + { + chain[j].a = subexonIdx[j] ; + chain[j].b = subexonIdx[j + 1] ; + covered[j] = false ; + } + memset( usedConstraints, false, sizeof( bool ) * constraints.constraints.size() ) ; + int compatibleCnt = 0 ; + for ( j = 0 ; j < tcCnt ; ++j ) + { + alltranscripts[i].constraintsSupport[j] = 0 ; + if ( btable[i].Test(j) && tc[j].abundance > 0 ) + { + ++compatibleCnt ; + double adjustAbundance = tc[j].abundance ; + if ( seIdxCnt > 1 ) + { + if ( tc[j].i == tc[j].j + && ( constraints.constraints[ tc[j].i ].first + + constraints.constraints[ tc[j].i ].last == 2 * alltranscripts[i].first + || constraints.constraints[ tc[j].i ].first + + constraints.constraints[ tc[j].i ].last == 2 * alltranscripts[i].last ) ) + { + adjustAbundance = inf ; + } + else if ( tc[j].i != tc[j].j + && ( constraints.constraints[ tc[j].i ].first + + constraints.constraints[ tc[j].i ].last == 2 * alltranscripts[i].first + || constraints.constraints[ tc[j].i ].first + + constraints.constraints[ tc[j].i ].last == 2 * alltranscripts[i].last ) ) + { + adjustAbundance = constraints.constraints[ tc[j].j ].normAbund ; + } + else if ( tc[j].i != tc[j].j + && ( constraints.constraints[ tc[j].j ].first + + constraints.constraints[ tc[j].j ].last == 2 * alltranscripts[i].first + || constraints.constraints[ 
tc[j].j ].first + + constraints.constraints[ tc[j].j ].last == 2 * alltranscripts[i].last ) ) + { + adjustAbundance = constraints.constraints[ tc[j].i ].normAbund ; + } + } + if ( adjustAbundance < value ) + /*!( seIdxCnt > 1 + && ( ( ( constraints.constraints[ tc[j].i ].first >= firstRegion.a && constraints.constraints[ tc[j].i ].last <= firstRegion.b ) + && ( constraints.constraints[ tc[j].j ].first >= firstRegion.a && constraints.constraints[ tc[j].j ].last <= firstRegion.b ) ) + || ( ( constraints.constraints[ tc[j].i ].first >= lastRegion.a && constraints.constraints[ tc[j].i ].last <= lastRegion.b ) + && ( constraints.constraints[ tc[j].j ].first >= lastRegion.a && constraints.constraints[ tc[j].j ].last <= lastRegion.b ) ) ) ) + )*/ + { + // Not use the constraints totally within the 3'/5'-end in the transcript + value = adjustAbundance ; + tag = j ; + } + avgTranscriptAbundance[i] += tc[j].abundance ; + + if ( !usedConstraints[ tc[j].i ] ) + { + struct _constraint &c = constraints.constraints[ tc[j].i ] ; + for ( k = 0 ; k < seIdxCnt - 1 ; ++k ) + { + // Note that since the constraint is already compatible with the txpt, + // chain[k].a/b must be also adjacent in this constraint. 
+ if ( c.vector.Test( chain[k].a ) && c.vector.Test( chain[k].b ) ) + covered[k] = true ; + } + usedConstraints[ tc[j].i ] = true ; + } + + if ( !usedConstraints[ tc[j].j ] ) + { + struct _constraint &c = constraints.constraints[ tc[j].j ] ; + for ( k = 0 ; k < seIdxCnt - 1 ; ++k ) + { + if ( c.vector.Test( chain[k].a ) && c.vector.Test( chain[k].b ) ) + covered[k] = true ; + } + usedConstraints[ tc[j].j ] = true ; + } + } + } + + // Get some penalty if something should together did not show up together + int size = togetherChain.size() ; + if ( size > 0 ) + { + BitTable bufferTable( seCnt ) ; + for ( j = 0 ; j < size ; ++j ) + { + bufferTable.Assign( togetherChain[j] ) ; + bufferTable.And( alltranscripts[i].seVector ) ; + //if ( !bufferTable.IsAllZero() && !bufferTable.IsEqual( togetherChain[j] ) ) + // value /= 2 ; + + if ( !bufferTable.IsAllZero() ) + { + if ( bufferTable.IsEqual( togetherChain[j] ) ) + //printf( "nice together!\n" ) ; + ; + else + value /= 2 ; + //printf( "bad together!\n" ) ; + } + } + bufferTable.Release() ; + } + + + // Every two-subexon chain should be covered by some reads if a transcript is expressed highly enough + int cnt = 0 ; + for ( j = 0 ; j < seIdxCnt - 1 ; ++j ) + if ( covered[j] == false ) // && j >= firstRegion.b && j <= lastRegion.a - 1 ) + { + value = 0 ; + } + else + ++cnt ; + if ( seIdxCnt > 1 ) + coveredPortion[i] = (double)cnt / (double)( seIdxCnt - 1 ) ; + else + coveredPortion[i] = 1 ; + if ( coveredPortion[i] == 0 ) + coveredPortion[i] = (double)0.5 / ( seIdxCnt ) ; + + // For short subexon (readLength-subexon_length-1>30), we further require a constraint cover three conseuctive subexon + /*memset( usedConstraints, false, sizeof( bool ) * constraints.constraints.size() ) ; + for ( j = 1 ; j < seIdxCnt - 1 ; ++j ) + { + int k = subexonIdx[j] ; + if ( alignments.readLen - ( subexons[k].end - subexons[k].start + 1 ) - 1 <= 30 ) + continue ; + // We need at least one of the side subexons are adjacent to the center one. 
+ if ( subexons[ subexonIdx[j - 1] ].end + 1 < subexons[k].start && subexons[k].end + 1 < subexons[ subexonIdx[j + 1] ].start ) + continue ; + + int l = 0 ; + for ( l = 0 ; l < tcCnt ; ++l ) + { + if ( btable[i].Test(l) && tc[l].abundance > 0 ) + { + if ( !usedConstraints[ tc[l].i ] ) + { + struct _constraint &c = constraints.constraints[ tc[l].i ] ; + if ( c.vector.Test( subexonIdx[j - 1] ) && c.vector.Test( subexonIdx[j] ) && + c.vector.Test( subexonIdx[j + 1] ) ) + break ; + usedConstraints[ tc[l].i ] = true ; + } + + if ( !usedConstraints[ tc[l].j ] ) + { + struct _constraint &c = constraints.constraints[ tc[l].j ] ; + if ( c.vector.Test( subexonIdx[j - 1] ) && c.vector.Test( subexonIdx[j] ) && + c.vector.Test( subexonIdx[j + 1] ) ) + break ; + usedConstraints[ tc[l].j ] = true ; + } + } + } + // It is not covered + if ( l >= tcCnt ) + { + int residual = alignments.readLen - ( subexons[k].end - subexons[k].start + 1 ) - 1 ; + //printf( "residual: %d %d %lf\n", k, residual, value ) ; + if ( value * residual > 2 ) + { + value = 1 / (double)residual ; + } + } + }*/ + + if ( tag == -1 ) + value = 0 ; + if ( value > maxAbundance ) + maxAbundance = value ; + transcriptAbundance[i] = value ; + if ( tag != -1 ) + avgTranscriptAbundance[i] /= compatibleCnt ; + + //printf( "abundance %d: %lf %lf ", i, value, avgTranscriptAbundance[i] ) ; + //alltranscripts[i].seVector.Print() ; + } + if ( maxAbundance == 0 ) + { + for ( i = 0 ; i < atcnt ; ++i ) + { + transcriptAbundance[i] = coveredPortion[i] ; + } + maxAbundance = 1 ; + } + //printf( "%s: %lf\n", __func__, maxAbundance ) ; + int size = togetherChain.size() ; + for ( j = 0 ; j < size ; ++j ) + togetherChain[j].Release() ; + delete[] usedConstraints ; + delete[] covered ; + delete[] chain ; + } + else + { + for ( i = 0 ; i < atcnt ; ++i ) + { + transcriptAbundance[i] = alltranscripts[i].abundance ; + if ( transcriptAbundance[i] > maxAbundance ) + maxAbundance = transcriptAbundance[i] ; + coveredPortion[i] = 1 ; + } + if 
( maxAbundance == 0 ) + maxAbundance = 1 ; + } + + // Obtain the prefix, suffix information of the transcripts. + int *nextSuffix, *nextPrefix ; + struct _pair32 *txptRank ; + nextSuffix = new int[atcnt] ; + nextPrefix = new int[atcnt] ; + txptRank = new struct _pair32[atcnt] ; + memset( nextSuffix, -1, sizeof( int ) * atcnt ) ; + memset( nextPrefix, -1, sizeof( int ) * atcnt ) ; + /*for ( i = 0 ; i < atcnt ; ++i ) + { + std::vector subexonIdx ; + txptRank[i].a = i ; + alltranscripts[i].seVector.GetOnesIndices( subexonIdx ) ; + txptRank[i].b = subexonIdx.size() ; + } + qsort( txptRank, atcnt, sizeof( struct _pair32 ), CompPairsByB) ; + BitTable bufferTable( seCnt ) ; + for ( i = atcnt - 1 ; i >= 0 ; --i ) + { + int a = txptRank[i].a ; + for ( j = i - 1 ; j >= 0 ; --j ) + { + if ( txptRank[i].b == txptRank[j].b ) + continue ; + + int b = txptRank[j].a ; + + if ( alltranscripts[b].last != alltranscripts[a].last ) + continue ; + + bufferTable.Assign( alltranscripts[a].seVector ) ; + bufferTable.MaskRegionOutside( alltranscripts[b].first, alltranscripts[b].last ) ; + if ( bufferTable.IsEqual( alltranscripts[b].seVector ) ) + { + nextSuffix[a] = b ; + break ; + } + } + } + for ( i = atcnt - 1 ; i >= 0 ; --i ) + { + int a = txptRank[i].a ; + for ( j = i - 1 ; j >= 0 ; --j ) + { + if ( txptRank[i].b == txptRank[j].b ) + continue ; + + int b = txptRank[j].a ; + + if ( alltranscripts[b].first != alltranscripts[a].first ) + continue ; + + bufferTable.Assign( alltranscripts[a].seVector ) ; + bufferTable.MaskRegionOutside( alltranscripts[b].first, alltranscripts[b].last ) ; + if ( bufferTable.IsEqual( alltranscripts[b].seVector ) ) + { + nextPrefix[a] = b ; + break ; + } + } + } + + bufferTable.Release() ;*/ + delete[] txptRank ; + + // Quantative Set-Cover + int iterCnt = -1 ; + double *coverCnt = new double[atcnt] ; + for ( i = 0 ; i < atcnt ; ++i ) + coverCnt[i] = -1 ; + int *list = new int[atcnt] ; + int listCnt ; + + while ( 1 ) + { + double max = -1 ; + int maxtag = -1 ; 
+ double maxcnt = -1 ; + ++iterCnt ; + + // Find the optimal candidate. + for ( i = 0 ; i < atcnt ; ++i ) + { + double value = inf ; + double cnt = 0 ; + + if ( coverCnt[i] == -1 ) + { + for ( j = 0 ; j < tcCnt ; ++j ) + { + if ( tc[j].abundance > 0 && btable[i].Test( j ) ) + { + cnt += tc[j].effectiveCount ; + } + } + /*else + { + std::vector tcIdx ; + btable[i].GetOnesIndices( tcIdx ) ; + int size = tcIdx.size() ; + for ( j = 0 ; j < size ; ++j ) + { + if ( tc[ tcIdx[j] ].abundance > 0 ) + { + cnt += tc[ tcIdx[j] ].effectiveCount ; + } + } + }*/ + coverCnt[i] = cnt ; + } + else + { + cnt = coverCnt[i] ; + } + + value = transcriptAbundance[i] ; + if ( cnt < 1 ) // This transcript does not satisfy any undepleted constraints. + continue ; + cnt *= coveredPortion[i] ; + + double weight = 1 ; //* seCnt / transcriptSeCnt[i] ; + //if ( maxAbundance >= 1 && value / maxAbundance >= 0.2 ) + // seCntAdjust = sqrt( (double)( transcriptSeCnt[i] ) / seCnt ) ;//< 0.5 ? 0.5 : (double)( transcriptSeCnt[i] ) / seCnt ; + + if ( alltranscripts[i].FPKM > 0 && sampleCnt > 1 ) + weight = ( 1 + alltranscripts[i].FPKM / sampleCnt ) ; + + double score = ComputeScore( cnt, weight, value, maxAbundance, alltranscripts[i].correlationScore ) ; + if ( cnt > maxcnt ) + maxcnt = cnt ; + score += adjustScore[i] ; + if ( score > max ) + { + max = score ; + maxtag = i ; + } + else if ( score == max ) + { + if ( avgTranscriptAbundance[maxtag] < avgTranscriptAbundance[i] ) + { + max = score ; + maxtag = i ; + } + } + //printf( "score: %d %lf -> %lf\n", i, cnt, score ) ; + } + + if ( maxcnt == 0 || maxtag == -1 ) + break ; + + // Find the constraint that should be depleted. + double update = inf ; + int updateTag = 0 ; + for ( j = 0 ; j < tcCnt ; ++j ) + { + if ( btable[ maxtag ].Test( j ) && tc[j].abundance > 0 && + tc[j].abundance <= update ) + { + update = tc[j].abundance ; + updateTag = j ; + } + } + + // Search suffix and prefix to see whether these fit better. 
+ int p = nextSuffix[ maxtag] ; + while ( p != -1 ) + { + if ( transcriptAbundance[p] >= 10.0 * transcriptAbundance[maxtag] + && btable[p].Test( updateTag ) ) + { + //printf( "%d\n", p ) ; + maxtag = p ; + break ; + } + p = nextSuffix[p] ; + } + p = nextPrefix[maxtag] ; + while ( p != -1 ) + { + if ( transcriptAbundance[p] >= 10.0 * transcriptAbundance[maxtag] + && btable[p].Test( updateTag ) ) + { + maxtag = p ; + break ; + } + p = nextPrefix[p] ; + } + + + // Update the abundance. + int supportCnt = 0 ; + for ( j = 0 ; j < tcCnt ; ++j ) + { + if ( btable[maxtag].Test( j ) ) + { + if ( tc[j].abundance > 0 ) + { + tc[j].abundance -= 1 * update ; + double factor = tc[j].effectiveCount ; + double tmp = ( tc[j].support * update / tc[j].normAbund * factor ) ; + alltranscripts[maxtag].constraintsSupport[j] += tmp ; + alltranscripts[maxtag].abundance += tmp ; + + if ( tc[j].abundance <= 0 ) + { + int l ; + for ( l = 0 ; l < atcnt ; ++l ) + { + if ( btable[l].Test(j) ) + coverCnt[l] -= tc[j].effectiveCount ; + } + } + ++supportCnt ; + } + else if ( alltranscripts[maxtag].constraintsSupport[j] == 0 ) + { + double sum = 0 ; + double takeOut = 0 ; + double factor = tc[j].effectiveCount ; + listCnt = 0 ; + for ( i = 0 ; i < atcnt ; ++i ) + { + if ( i == maxtag ) + continue ; + + if ( alltranscripts[i].abundance > 0 && btable[i].Test(j) ) + { + sum += alltranscripts[i].constraintsSupport[j] ; + + double tmp = ( alltranscripts[i].constraintsSupport[j] + alltranscripts[maxtag].constraintsSupport[j] ) * + transcriptAbundance[maxtag] / ( transcriptAbundance[maxtag] + transcriptAbundance[i] ) + - alltranscripts[maxtag].constraintsSupport[j] ; + if ( tmp > 0 ) + { + list[ listCnt ] = i ; + ++listCnt ; + takeOut += tmp ; //alltranscripts[i].constraintsSupport[j] * transcriptAbundance[maxtag] / ( transcriptAbundance[maxtag] + transcriptAbundance[i] ) ; + } + } + } + + double ratio = 1 ; + double takeOutFactor = 0.5 ; + if ( update < tc[j].normAbund ) + { + if ( takeOut > ( 
tc[j].support * update / tc[j].normAbund * factor ) * takeOutFactor ) + ratio = ( tc[j].support * update / tc[j].normAbund * factor ) * takeOutFactor / takeOut ; + } + else + { + if ( takeOut > ( tc[j].support * factor ) * takeOutFactor ) + ratio = ( tc[j].support * factor ) * takeOutFactor / takeOut ; + } + + if ( 1 ) //update < tc[j].normAbund ) + { + for ( i = 0 ; i < listCnt ; ++i ) + { + //double tmp = ( tc[j].support * update / tc[j].normAbund * factor ) * + // ( alltranscripts[ list[i] ].constraintsSupport[j] / sum ) ; + //if ( alltranscripts[ list[i] ].constraintsSupport[j] < tmp ) + // printf( "WARNING! %lf %lf, %lf\n", alltranscripts[ list[i] ].constraintsSupport[j], sum, tmp ) ; + + //double tmp = alltranscripts[ list[i] ].constraintsSupport[j] * transcriptAbundance[maxtag] / ( transcriptAbundance[maxtag] + transcriptAbundance[ list[i] ] ) * ratio ; + double tmp = ( ( alltranscripts[ list[i] ].constraintsSupport[j] + alltranscripts[maxtag].constraintsSupport[j] ) * + transcriptAbundance[maxtag] / ( transcriptAbundance[maxtag] + transcriptAbundance[ list[i] ] ) + - alltranscripts[maxtag].constraintsSupport[j] ) * ratio ; + + + alltranscripts[ list[i] ].constraintsSupport[j] -= tmp ; + alltranscripts[ list[i] ].abundance -= tmp ; + } + //double tmp = ( tc[j].support * update / tc[j].normAbund * factor ) ; + //printf( "%lf %lf. 
%lf %lf\n", takeOut, ratio, update, tc[j].normAbund ) ; + double tmp = takeOut * ratio ; + alltranscripts[maxtag].constraintsSupport[j] += tmp ; + alltranscripts[maxtag].abundance += tmp ; + } + /*else + { + double tmp = ( tc[j].support / (double)( listCnt + 1 ) ) * factor ; + for ( i = 0 ; i < listCnt ; ++i ) + { + alltranscripts[ list[i] ].abundance -= alltranscripts[ list[i] ].constraintsSupport[j] ; + + alltranscripts[ list[i] ].constraintsSupport[j] = tmp ; + alltranscripts[ list[i] ].abundance += tmp ; + } + alltranscripts[maxtag].constraintsSupport[j] += tmp ; + alltranscripts[maxtag].abundance += tmp ; + }*/ + + } + } + + if ( tc[j].abundance < 0 ) + { + tc[j].abundance = 0 ; + + } + } + tc[ updateTag ].abundance = 0 ; + if ( supportCnt == 0 ) + break ; + //adjustScore[maxtag] += 1 / (double)tcCnt ; + //printf( "maxtag=%d %lf %d\n", maxtag, update, updateTag ) ; + } + + for ( i = 0 ; i < atcnt ; ++i ) + { + if ( alltranscripts[i].abundance > 0 ) + { + struct _transcript nt = alltranscripts[i] ; + nt.seVector.Nullify() ; + nt.seVector.Duplicate( alltranscripts[i].seVector ) ; + nt.constraintsSupport = NULL ; + if ( transcriptAbundance[i] == 0 ) + nt.correlationScore = -1 ; + else + nt.correlationScore = 0 ; + nt.id = i ; + transcripts.push_back( nt ) ; + } + } + + // Release the memory of btable. + for ( i = 0 ; i < atcnt ; ++i ) + { + delete[] alltranscripts[i].constraintsSupport ; + btable[i].Release() ; + } + delete[] btable ; + + delete[] list ; + delete[] transcriptSeCnt ; + delete[] transcriptLength ; + delete[] transcriptAbundance ; + delete[] avgTranscriptAbundance ; + delete[] coveredPortion ; + delete[] adjustScore ; + delete[] coverCnt ; + + delete[] nextPrefix ; + delete[] nextSuffix ; + + // Redistribute weight if there is some constraints that are unbalanced. 
+ /*tcnt = transcripts.size() ; + for ( i = 0 ; i < tcnt ; ++i ) + { + int maxRatio = -1 ; + for ( j = 0 ; j < tcCnt ; ++j ) + if ( transcripts[i].constraintsSupport[j] > 0 ) + { + double factor = tc[j].effectiveCount ; + if ( transcripts[]) + } + }*/ +} + +void TranscriptDecider::AbundanceEstimation( struct _subexon *subexons, int seCnt, Constraints &constraints, std::vector &transcripts ) +{ + int tcnt = transcripts.size() ; + int size ; + int i, j ; + if ( tcnt <= 0 ) + return ; + + std::vector &tc = constraints.matePairs ; + int tcCnt = tc.size() ; // transcript constraints + + BitTable *btable = new BitTable[ tcnt ] ; + int *transcriptLength = new int[tcnt] ; + int *compatibleList = new int[tcnt] ; + double *rho = new double[tcnt] ; // the abundance. + int iterCnt = 0 ; + + for ( i = 0 ; i < tcnt ; ++i ) + transcripts[i].constraintsSupport = new double[ tcCnt ] ; + + for ( i = 0 ; i < tcnt ; ++i ) + { + btable[i].Init( tcCnt ) ; + double min = -1 ; + for ( j = 0 ; j < tcCnt ; ++j ) + { + int a = tc[j].i ; + int b = tc[j].j ; + + if ( IsConstraintInTranscript( transcripts[i], constraints.constraints[a] ) == 1 + && IsConstraintInTranscript( transcripts[i], constraints.constraints[b] ) == 1 ) + { + //printf( "set btble[ %d ].Set( %d ): %d %d\n", i, j, a, b ) ; + btable[i].Set( j ) ; + + if ( min == -1 || tc[j].normAbund < min ) + min = tc[j].normAbund ; + } + } + + std::vector subexonIdx ; + transcripts[i].seVector.GetOnesIndices( subexonIdx ) ; + int subexonIdxCnt = subexonIdx.size() ; + int len = 0 ; + for ( j = 0 ; j < subexonIdxCnt ; ++j ) + len += subexons[ subexonIdx[j] ].end - subexons[ subexonIdx[j] ].start + 1 ; + transcriptLength[i] = len - alignments.fragLen + 2 * alignments.fragStdev ; + if ( transcriptLength[i] < 1 ) + transcriptLength[i] = 1 ; + rho[i] = transcripts[i].abundance / transcriptLength[i] ; // use the rough estimation generated before. 
+ if ( transcripts[i].correlationScore == -1 && rho[i] > 0.1 / (double)alignments.readLen ) + rho[i] = 0.1 / (double)alignments.readLen ; + } + + while ( 1 ) + { + for ( i = 0 ; i < tcnt ; ++i ) + for ( j = 0 ; j < tcCnt ; ++j ) + { + transcripts[i].constraintsSupport[j] = 0 ; + } + for ( j = 0 ; j < tcCnt ; ++j ) + { + int clCnt = 0 ; + double sum = 0 ; + for ( i = 0 ; i < tcnt ; ++i ) + { + if ( btable[i].Test(j) ) + { + compatibleList[ clCnt ] = i ; + ++clCnt ; + sum += rho[i] ; + } + } + + for ( i = 0 ; i < clCnt ; ++i ) + { + double factor = tc[j].effectiveCount ; + transcripts[ compatibleList[i] ].constraintsSupport[j] = ( rho[ compatibleList[i] ] / sum ) * tc[j].support * factor ; + } + } + + double diff = 0 ; + for ( i = 0 ; i < tcnt ; ++i ) + { + double newAbund = 0 ; + for ( j = 0 ; j < tcCnt ; ++j ) + newAbund += transcripts[i].constraintsSupport[j] ; + double old = rho[i] ; + rho[i] = newAbund / transcriptLength[i] ; + //printf( "rho[%d]=%lf\n", i, rho[i] ) ; + if ( transcripts[i].correlationScore == -1 && rho[i] > 0.1 / (double)alignments.readLen ) + rho[i] = 0.1 / (double)alignments.readLen ; + + double tmp = ( old - rho[i] ) ; + diff += tmp < 0 ? -tmp : tmp ; + } + //printf( "%lf\n", diff ) ; + if ( diff < 1e-3) + break ; + + ++iterCnt ; + if ( iterCnt >= 1000 ) + break ; + } + + for ( i = 0 ; i < tcnt ; ++i ) + { + //printf( "%lf=>", transcripts[i].abundance ) ; + transcripts[i].abundance = 0 ; + for ( j = 0 ; j < tcCnt ; ++j ) + { + transcripts[i].abundance += transcripts[i].constraintsSupport[j] ; + } + //printf( "%lf. (%lf)\n", transcripts[i].abundance, transcripts[i].correlationScore ) ; + //transcripts[i].seVector.Print() ; + } + + for ( i = 0 ; i < tcnt ; ++i ) + delete[] transcripts[i].constraintsSupport ; + + // Release the memory of btable. 
+ for ( i = 0 ; i < tcnt ; ++i ) + { + btable[i].Release() ; + } + delete[] compatibleList ; + delete[] btable ; + delete[] transcriptLength ; + delete[] rho ; +} + +int TranscriptDecider::RefineTranscripts( struct _subexon *subexons, int seCnt, bool aggressive, + std::map *subexonChainSupport, int *txptSampleSupport, std::vector &transcripts, Constraints &constraints ) +{ + int i, j, k ; + int tcnt = transcripts.size() ; + if ( tcnt == 0 ) + return 0 ; + int tcCnt = constraints.matePairs.size() ; + + std::vector &tc = constraints.matePairs ; + std::vector &scc = constraints.constraints ; //single-end constraints.constraints + + // Remove transcripts whose FPKM are too small. + //printf( "%d %d\n", usedGeneId, baseGeneId ) ; + double *geneMaxFPKM = new double[usedGeneId - baseGeneId ] ; + int *geneMaxFPKMTag = new int[usedGeneId - baseGeneId ] ; + double *nonOverlapMaxFPKM = new double[ usedGeneId - baseGeneId ] ; // the max FPKM among all the transcripts not overlapping with maxFPKMTag transcripts. + memset( geneMaxFPKM, 0, sizeof( double ) * ( usedGeneId - baseGeneId ) ) ; + memset( geneMaxFPKMTag, 0, sizeof( int ) * ( usedGeneId - baseGeneId ) ) ; + memset( nonOverlapMaxFPKM, 0, sizeof( double ) * ( usedGeneId - baseGeneId ) ) ; + + double *geneMaxCov = new double[ usedGeneId - baseGeneId ] ; + memset( geneMaxCov, 0, sizeof( double ) * ( usedGeneId - baseGeneId ) ) ; + int *txptGid = new int[tcnt] ; + + /*for ( i = 0 ; i < tcnt ; ++i ) + { + printf( "%d: %lf ", i, transcripts[i].FPKM ) ; + transcripts[i].seVector.Print() ; + }*/ + + /*================================================================== + Remove transcripts that has too few relative FPKM. 
(-f) + ====================================================================*/ + for ( i = 0 ; i < tcnt ; ++i ) + { + int gid = GetTranscriptGeneId( transcripts[i], subexons ) ; + int len = GetTranscriptLengthFromAbundanceAndFPKM( transcripts[i].abundance, transcripts[i].FPKM ) ; + //printf( "gid=%d\n", gid ) ; + //printf( "%lf %lf %d\n", transcripts[i].abundance, transcripts[i].FPKM, len ) ; + if ( transcripts[i].FPKM > geneMaxFPKM[gid - baseGeneId ] ) + { + geneMaxFPKM[ gid - baseGeneId ] = transcripts[i].FPKM ; + geneMaxFPKMTag[ gid - baseGeneId ] = i ; + } + if ( transcripts[i].abundance * alignments.readLen / len > geneMaxCov[gid - baseGeneId ] ) + geneMaxCov[gid - baseGeneId] = ( transcripts[i].abundance * alignments.readLen ) / len ; + txptGid[i] = gid ; + } + + for ( i = 0 ; i < tcnt ; ++i ) + { + int tag = txptGid[i] - baseGeneId ; + if ( ( transcripts[i].last < transcripts[ geneMaxFPKMTag[ tag ] ].first + || transcripts[i].first > transcripts[ geneMaxFPKMTag[tag] ].last ) && transcripts[i].FPKM > nonOverlapMaxFPKM[tag] ) + nonOverlapMaxFPKM[tag] = transcripts[i].FPKM ; + } + BitTable bufferTable ; + bufferTable.Duplicate( transcripts[0].seVector ) ; + + if ( !aggressive ) + { + // Rescue the transcripts covering unique constraints. 
+ int cnt = 0 ; + int tag = 0 ; + int *uniqCount = new int[tcnt] ; + memset( uniqCount, 0, sizeof( int ) * tcnt ) ; + for ( j = 0 ; j < tcCnt ; ++j ) + { + cnt = 0 ; + if ( tc[j].uniqSupport <= 5 ) + continue ; + for ( i = 0 ; i < tcnt ; ++i ) + { + if ( IsConstraintInTranscript( transcripts[i], scc[ tc[j].i ] ) && + IsConstraintInTranscript( transcripts[i], scc[ tc[j].j] ) ) + { + tag = i ; + ++cnt ; + } + if ( cnt >= 2 ) + break ; + } + if ( cnt == 1 ) + { + ++uniqCount[tag] ; + } + } + for ( i = 0 ; i < tcnt ; ++i ) + { + if ( uniqCount[i] >= 2 ) + { + transcripts[i].abundance *= 4 ; + transcripts[i].FPKM *= 4 ; + } + } + + delete[] uniqCount ; + } + + int sccCnt = scc.size() ; + double filterFactor = 1.0 ; + + for ( i = 0 ; i < tcnt ; ++i ) + { + //printf( "%d: %lf %lf\n", txptGid[i], transcripts[i].abundance, geneMaxFPKM[ txptGid[i] - baseGeneId ] ) ; + + if ( transcripts[i].FPKM < filterFactor * FPKMFraction * geneMaxFPKM[ txptGid[i] - baseGeneId ] ) + { + /*int cnt = 0 ; + int coverCnt = 0 ; + for ( j = 0 ; j < tcCnt ; ++j ) + { + if ( transcripts[i].constraintsSupport[j] > 0 ) + ++coverCnt ; + double factor = tc[j].effectiveCount ; + if ( transcripts[i].constraintsSupport[j] >= factor * tc[j].support - 1e-3 + && tc[j].support >= 10 + && tc[j].uniqSupport >= 0.95 * tc[j].support ) + { + ++cnt ; + } + } + //cnt = 0 ; + if ( cnt >= 2 ) + { + ; + } + else*/ + transcripts[i].abundance = -transcripts[i].abundance ; + } + //if ( transcripts[i].FPKM >= 0.8 * geneMaxFPKM[ txptGid[i] - baseGeneId ] && geneMaxCov[ txptGid[i] - baseGeneId ] >= txptMinReadDepth ) + // continue ; + } + + if ( nonOverlapMaxFPKM != 0 ) + { + // Go two iterations to rescue, the first iteration should be just for marking. 
+ std::vector rescueList ; + for ( i = 0 ; i < tcnt ; ++i ) + { + if ( transcripts[i].abundance >= 0 ) + continue ; + + for ( j = 0 ; j < tcnt ; ++j ) + { + if ( transcripts[j].abundance < 0 || txptGid[i] != txptGid[j] ) + continue ; + if ( transcripts[i].first <= transcripts[j].last && transcripts[i].last >= transcripts[j].first ) + /*bufferTable.Assign( transcripts[i].seVector ) ; + bufferTable.And( transcripts[j].seVector ) ; + + if ( !bufferTable.IsAllZero() )*/ + break ; + } + if ( j >= tcnt && transcripts[i].FPKM >= FPKMFraction * nonOverlapMaxFPKM[ txptGid[i] - baseGeneId ] ) + { + //transcripts[i].abundance = -transcripts[i].abundance ; + rescueList.push_back( i ) ; + } + } + + int size = rescueList.size() ; + for ( i = 0 ; i < size ; ++i ) + transcripts[ rescueList[i] ].abundance *= -1 ; + } + + /*================================================================== + Remove transcripts that has too few read coverage (-d) + ====================================================================*/ + for ( i = 0 ; i < tcnt ; ++i ) + { + if ( transcripts[i].abundance >= 0 ) + { + int len = GetTranscriptLengthFromAbundanceAndFPKM( transcripts[i].abundance, transcripts[i].FPKM ) ; + double cov = ( transcripts[i].abundance * alignments.readLen ) / len ; + //printf( "%d: %d %d %lf %lf\n", i, len, transcripts[i].seVector.Count(), cov, geneMaxCov[ txptGid[i] - baseGeneId ] ) ; + + if ( ( tcnt > 1 || len <= 1000 || transcripts[i].seVector.Count() <= 3 ) && cov < txptMinReadDepth ) + { + //if ( usedGeneId == baseGeneId + 1 && /*transcripts[i].seVector.Count() > 3 + // && len > 1000 &&*/ geneMaxCov[ txptGid[i] - baseGeneId ] == cov ) + if ( geneMaxCov[ txptGid[i] - baseGeneId ] == cov ) + continue ; + + // Test whether it has some very abundant constraints. 
+ /*int cnt = 0 ; + for ( j = 0 ; j < tcCnt ; ++j ) + { + if ( transcripts[i].constraintsSupport[j] >= tc[j].support / 2.0 + && tc[j].support >= 10 + && tc[j].uniqSupport >= 0.95 * tc[j].support + && tc[j].normAbund >= 1 ) + { + ++cnt ; + } + } + + if ( cnt >= 1 ) + { + continue ; + }*/ + + // Test whether this transcript is fully covered. If so ,we can filter it. + + if ( geneMaxCov[ txptGid[i] - baseGeneId ] <= 5 ) + { + bufferTable.Reset() ; + for ( j = 0 ; j < sccCnt ; ++j ) + { + if ( !IsConstraintInTranscript( transcripts[i], scc[j] ) ) + continue ; + bufferTable.Or( scc[j].vector ) ; + } + if ( bufferTable.IsEqual( transcripts[i].seVector ) ) + transcripts[i].abundance = -transcripts[i].abundance ; + } + else + transcripts[i].abundance = -transcripts[i].abundance ; + + /*else + { + transcripts[i].seVector.Print() ; + bufferTable.Print() ; + OutputTranscript( stderr, subexons, transcripts[i] ) ; + }*/ + } + } + } + + /*================================================================== + Remove transcripts that is too short + ====================================================================*/ + for ( i = 0 ; i < tcnt ; ++i ) + { + if ( transcripts[i].abundance <= 0 ) + continue ; + + int len = GetTranscriptLengthFromAbundanceAndFPKM( transcripts[i].abundance, transcripts[i].FPKM ) ; + if ( len < 200 ) + { + transcripts[i].abundance = -transcripts[i].abundance ; + } + } + + // Rescue transcripts that showed up in many samples. + /*for ( i = 0 ; i < tcnt ; ++i ) + { + if ( transcripts[i].abundance > 0 ) + continue ; + if ( txptSampleSupport[ transcripts[i].id ] >= 3 && + txptSampleSupport[transcripts[i].id ] >= (int)( sampleCnt / 2 ) ) + transcripts[i].abundance = -transcripts[i].abundance ; + }*/ + + // Rescue some transcripts covering subexon chains showed up in many samples, but missing after filtration. 
+ struct _constraint tmpC ; + tmpC.vector.Init( seCnt ) ; + + std::vector< struct _pair32 > missingChain ; + std::vector recoverCandidate ; + bool *used = new bool[tcnt] ; + memset( used, false, sizeof( bool ) * tcnt ) ; + + // Obtain the list of transcripts that should be recovered. + for ( i = 0 ; i < seCnt && sampleCnt > 1 ; ++i ) + { + + double maxFPKM = -1 ; + for ( std::map::iterator it = subexonChainSupport[i].begin() ; + it != subexonChainSupport[i].end() ; ++it ) + { + if ( sampleCnt >= 0 && ( it->second < 3 || it->second < (int)( 0.5 * sampleCnt ) ) && it->second <= sampleCnt / 2 ) + continue ; + + bool recover = true ; + tmpC.vector.Reset() ; + tmpC.vector.Set( i ) ; + tmpC.vector.Set( it->first ) ; + tmpC.first = i ; + tmpC.last = it->first ; + + + for ( j = 0 ; j < tcnt ; ++j ) + { + if ( transcripts[j].abundance < 0 ) + continue ; + + if ( IsConstraintInTranscript( transcripts[j], tmpC ) ) + { + recover = false ; + break ; + } + + if ( recover ) + { + for ( j = 0 ; j < tcnt ; ++j ) + { + if ( transcripts[j].abundance > 0 ) + continue ; + //printf( "%d %lf\n", IsConstraintInTranscript( transcripts[j], tmpC ), transcripts[j].FPKM ) ; + if ( IsConstraintInTranscript( transcripts[j], tmpC ) ) + { + /*if ( maxTag == -1 ) + maxTag = j ; + else + { + if ( txptSampleSupport[ transcripts[j].id ] > txptSampleSupport[ transcripts[maxTag ].id ] ) + maxTag = j ; + else if ( txptSampleSupport[ transcripts[j].id ] == txptSampleSupport[ transcripts[maxTag ].id ]) + { + if ( transcripts[j].FPKM > transcripts[maxTag].FPKM ) + maxTag = j ; + } + }*/ + + struct _pair32 np ; + np.a = i ; np.b = it->first ; + missingChain.push_back( np ) ; + + if ( !used[j] ) + { + recoverCandidate.push_back( j ) ; + used[j] = true ; + } + } + } + + /*if ( maxTag != -1 && txptSampleSupport[ transcripts[maxTag].id ] > 1 ) + { + //printf( "recover %d %d\n", maxTag, txptSampleSupport[ transcripts[maxTag].id ] ) ; + transcripts[maxTag].abundance *= -1 ; + }*/ + } + } + + } + } + + int size = 
recoverCandidate.size() ; + memset( used, false, sizeof( bool ) * tcnt ) ; + // Recover the candidates in the order of reliability + int *geneRecoverCnt = new int[ usedGeneId - baseGeneId ] ; + memset( geneRecoverCnt, 0, sizeof( int ) * ( usedGeneId - baseGeneId ) ) ; + int round = 1 ; + if ( aggressive && size > 1 ) + round = 1 ; + + for ( i = 0 ; i < size ; ++i ) + { + int maxTag = -1 ; + int maxCover = -1 ; + for ( j = 0 ; j < size ; ++j ) + { + if ( !used[ recoverCandidate[j] ] ) + { + /*int cover = 0 ; + + k = missingChain.size() ; + int l ; + for ( l = 0 ; l < k ; ++l ) + { + if ( missingChain[l].a == -1 ) + continue ; + + tmpC.vector.Reset() ; + tmpC.vector.Set( missingChain[l].a ) ; + tmpC.vector.Set( missingChain[l].b ) ; + tmpC.first = missingChain[l].a ; + tmpC.last = missingChain[l].b ; + + if ( IsConstraintInTranscript( transcripts[ recoverCandidate[j] ], tmpC ) ) + { + ++cover ; + } + }*/ + + if ( maxTag == -1 ) + { + maxTag = recoverCandidate[j] ; + //maxCover = cover ; + continue ; + } + + /*if ( cover > maxCover ) + { + maxTag = recoverCandidate[j] ; + maxCover = cover ; + } + else if ( cover == maxCover ) + {*/ + if ( txptSampleSupport[ transcripts[ recoverCandidate[j] ].id ] > + txptSampleSupport[ + transcripts[ maxTag ].id + ] ) + maxTag = recoverCandidate[j] ; + else if ( txptSampleSupport[ transcripts[ recoverCandidate[j] ].id ] == + txptSampleSupport[ transcripts[ maxTag ].id ] ) + { + if ( transcripts[ recoverCandidate[j] ].FPKM > transcripts[ maxTag ].FPKM ) + maxTag = recoverCandidate[j] ; + } + + /*else if ( transcripts[ recoverCandidate[j] ].FPKM > transcripts[ maxTag ].FPKM ) + maxTag = recoverCandidate[j] ; + else if ( transcripts[ recoverCandidate[j] ].FPKM == transcripts[ maxTag ].FPKM ) + { + if ( txptSampleSupport[ transcripts[ recoverCandidate[j] ].id ] > + txptSampleSupport[ transcripts[ maxTag ].id ] ) + maxTag = recoverCandidate[j] ; + }*/ + //} + } + } + + if ( maxTag == -1 || txptSampleSupport[ transcripts[ maxTag ].id ] <= 2 
+ || txptSampleSupport[ transcripts[maxTag].id ] < 0.5 * sampleCnt ) + break ; + + used[maxTag] = true ; + if ( geneRecoverCnt[ txptGid[maxTag] - baseGeneId ] >= round ) + continue ; + ++geneRecoverCnt[ txptGid[maxTag] - baseGeneId ] ; + + k = missingChain.size() ; + int cnt = 0 ; + for ( j = 0 ; j < k ; ++j ) + { + if ( missingChain[j].a == -1 ) + continue ; + + tmpC.vector.Reset() ; + tmpC.vector.Set( missingChain[j].a ) ; + tmpC.vector.Set( missingChain[j].b ) ; + tmpC.first = missingChain[j].a ; + tmpC.last = missingChain[j].b ; + + if ( IsConstraintInTranscript( transcripts[maxTag], tmpC ) ) + { + missingChain[j].a = -1 ; + ++cnt ; + } + } + + int len = GetTranscriptLengthFromAbundanceAndFPKM( transcripts[maxTag].abundance, transcripts[maxTag].FPKM ) ; + double cov = ( transcripts[maxTag].abundance * alignments.readLen ) / len ; + if ( cnt >= 1 && cov > 1.0 ) + { + transcripts[maxTag].abundance *= -1 ; + } + } + delete[] used ; + delete[] geneRecoverCnt ; + tmpC.vector.Release() ; + + + tcnt = RemoveNegativeAbundTranscripts( transcripts ) ; + + + + delete []geneMaxCov ; + bufferTable.Release() ; + delete []geneMaxFPKM ; + delete []geneMaxFPKMTag ; + delete []nonOverlapMaxFPKM ; + delete []txptGid ; + + /*================================================================== + Remove transcripts that seems duplicated + ====================================================================*/ + for ( i = 0 ; i < tcnt ; ++i ) + { + int support = 0 ; + int uniqSupport = 0 ; + + for ( j = 0 ; j < tcCnt ; ++j ) + { + if ( !IsConstraintInTranscript( transcripts[i], scc[ tc[j].i ] ) || !IsConstraintInTranscript( transcripts[i], scc[ tc[j].j ] ) ) + continue ; + //support += scc[ tc[j].i ].support + scc[ tc[j].j ].support ; + //uniqSupport += scc[ tc[j].i ].uniqSupport + scc[ tc[j].j ].uniqSupport ; + support += tc[j].support ; + uniqSupport += tc[j].uniqSupport ; + + //printf( "constraint uniqness: %d: %d %d\n", i, tc[j].uniqSupport, tc[j].support ) ; + } + //printf( "%d: %d 
%d\n", i, uniqSupport, support ) ; + if ( (double)uniqSupport < 0.03 * support ) + transcripts[i].abundance = -1 ; + } + tcnt = RemoveNegativeAbundTranscripts( transcripts ) ; + + + /*================================================================== + Remove shadow transcripts, the abnormal 2-exon txpt whose intron is very close to the true one or one of the anchor exon is shorter than 25bp.... + ====================================================================*/ + int minusCnt = 0, plusCnt = 0 ; + int mainStrand ; + for ( i = 0 ; i < seCnt ; ++i ) + { + if ( subexons[i].rightStrand == 1 ) + ++plusCnt ; + else if ( subexons[i].rightStrand == -1 ) + ++minusCnt ; + } + if ( plusCnt > minusCnt ) + mainStrand = 1 ; + else + mainStrand = -1 ; + + for ( i = 0 ; i < tcnt ; ++i ) + { + std::vector subexonIdx ; + transcripts[i].seVector.GetOnesIndices( subexonIdx ) ; + int size = subexonIdx.size() ; + int intronCnt = 0 ; + int anchorIdx = 0 ; // the subexon adjacent to the only intron. + + for ( j = 0 ; j < size - 1 ; ++j ) + { + if ( subexons[ subexonIdx[j] ].end + 1 < subexons[ subexonIdx[j + 1] ].start ) + { + ++intronCnt ; + anchorIdx = j ; + } + } + if ( intronCnt != 1 ) + continue ; + + int anchorExonLength[2] = {0, 0}; + int tag = 0 ; + for ( j = 0 ; j < size ; ++j ) + { + anchorExonLength[tag] += subexons[ subexonIdx[j] ].end - subexons[ subexonIdx[j] ].start + 1 ; + if ( tag == 0 && subexons[ subexonIdx[j] ].end + 1 < subexons[ subexonIdx[j + 1] ].start ) + ++tag ; + } + + int flag = 0 ; + if ( subexons[ subexonIdx[anchorIdx] ].rightStrand == mainStrand ) + { + j = subexonIdx[ anchorIdx ] ; + if ( subexons[j].end - subexons[j].start + 1 <= 20 || + ( subexons[j+ 1].start == subexons[j].end + 1 && subexons[j + 1].end - subexons[j + 1].start + 1 <= 20 + && subexons[j + 1].rightStrand == mainStrand ) ) + ++flag ; + j = subexonIdx[ anchorIdx + 1 ] ; + if ( subexons[j].end - subexons[j].start + 1 <= 20 || + ( subexons[j].start == subexons[j - 1].end + 1 && subexons[j 
- 1].end - subexons[j - 1].start + 1 <= 20 + && subexons[j - 1].leftStrand == mainStrand ) ) + ++flag ; + } + + if ( anchorExonLength[0] <= 25 || anchorExonLength[1] <= 25 ) + flag = 2 ; + + // the alignment support the intron must be unique and has enough support. + int support = 0 ; + int uniqSupport = 0 ; + for ( j = 0 ; j < tcCnt ; ++j ) + { + if ( !IsConstraintInTranscript( transcripts[i], scc[ tc[j].i ] ) || !IsConstraintInTranscript( transcripts[i], scc[ tc[j].j ] ) ) + continue ; + if ( ( scc[ tc[j].i ].vector.Test( subexonIdx[ anchorIdx ] ) && scc[ tc[j].i ].vector.Test( subexonIdx[ anchorIdx + 1 ] ) ) + || ( scc[ tc[j].j ].vector.Test( subexonIdx[ anchorIdx ] ) && scc[ tc[j].j ].vector.Test( subexonIdx[ anchorIdx + 1 ] ) ) ) + { + support += tc[j].support ; + uniqSupport += tc[j].uniqSupport ; + } + + } + + if ( (double)uniqSupport < 0.3 * support || support < txptMinReadDepth ) + { + flag = 2 ; + } + + if ( flag == 2 ) + transcripts[i].abundance = -1 ; + + } + tcnt = RemoveNegativeAbundTranscripts( transcripts ) ; + + return transcripts.size() ; +} + +void TranscriptDecider::ComputeTranscriptsScore( struct _subexon *subexons, int seCnt, std::map *subexonChainSupport, std::vector &transcripts ) +{ + int i, j ; + int tcnt = transcripts.size() ; + struct _constraint tmpC ; + tmpC.vector.Init( seCnt ) ; + + for ( i = 0 ; i < tcnt ; ++i ) + transcripts[i].correlationScore = 0 ; + + for ( i = 0 ; i < seCnt ; ++i ) + { + for ( std::map::iterator it = subexonChainSupport[i].begin() ; + it != subexonChainSupport[i].end() ; ++it ) + { + if ( sampleCnt >= 0 && ( it->second < 3 || it->second < (int)( 0.1 * sampleCnt ) ) && it->second <= sampleCnt / 2 ) + continue ; + + tmpC.vector.Reset() ; + tmpC.vector.Set( i ) ; + tmpC.vector.Set( it->first ) ; + tmpC.first = i ; + tmpC.last = it->first ; + + for ( j = 0 ; j < tcnt ; ++j ) + { + if ( IsConstraintInTranscript( transcripts[j], tmpC ) ) + ++transcripts[j].correlationScore ; + } + } + } + + tmpC.vector.Release() ; +} 
+ +int TranscriptDecider::Solve( struct _subexon *subexons, int seCnt, std::vector &constraints, SubexonCorrelation &subexonCorrelation ) +{ + int i, j, k ; + int cnt = 0 ; + int *f = new int[seCnt] ; // this is a general buffer for a type of usage. + bool useDP = false ; + + compatibleTestVectorT.Init( seCnt ) ; // this is the bittable used in compatible test function. + compatibleTestVectorC.Init( seCnt ) ; + + for ( i = 0 ; i < seCnt ; ++i ) + { + subexons[i].canBeStart = subexons[i].canBeEnd = false ; + + if ( subexons[i].prevCnt == 0 ) + subexons[i].canBeStart = true ; + else if ( subexons[i].leftClassifier < canBeSoftBoundaryThreshold && subexons[i].leftClassifier != -1 + && subexons[i].leftStrand != 0 ) // The case of overhang. + { + // We then look into whether there is a left-side end already showed up before this subexon in this region of subexons. + bool flag = true ; + for ( j = i - 1 ; j >= 0 ; --j ) + { + if ( subexons[j].end + 1 != subexons[j + 1].start ) + break ; + if ( subexons[i].canBeStart == true ) + { + flag = false ; + break ; + } + } + subexons[i].canBeStart = flag ; + } + + if ( subexons[i].nextCnt == 0 ) + subexons[i].canBeEnd = true ; + else if ( subexons[i].rightClassifier < canBeSoftBoundaryThreshold && subexons[i].rightClassifier != -1 + && subexons[i].rightStrand != 0 ) + { + subexons[i].canBeEnd = true ; + } + // Remove other soft end already showed up in this region of subexons. + if ( subexons[i].canBeEnd == true ) + { + for ( j = i - 1 ; j >= 0 ; --j ) + { + if ( subexons[j].end + 1 != subexons[j + 1].start ) + break ; + if ( subexons[j].canBeEnd == true ) + { + subexons[j].canBeEnd = false ; + break ; + } + } + } + //printf( "%d: %d %lf\n", subexons[i].canBeStart, subexons[i].prevCnt, subexons[i].leftClassifier ) ; + } + + // Go through the cases of mixture region to set canBeStart/End. + // e.g: +[...]+_____+[....]-...]+____+[..)_____-[...]- + // ^ then we need to force a start point here. 
+ // Do we need to associate a strand information with canBeStart, canBeEnd? + for ( i = 0 ; i < seCnt ; ) + { + // [i, j) is a region. + for ( j = i + 1 ; j < seCnt ; ++j ) + if ( subexons[j].start > subexons[j - 1].end + 1 ) + break ; + if ( subexons[i].canBeStart == false ) // then subexons[i] must has a hard left boundary. + { + int leftStrandCnt[2] = {0, 0} ; + for ( k = i ; k < j ; ++k ) + { + if ( !SubexonGraph::IsSameStrand( subexons[k].rightStrand, subexons[i].leftStrand ) ) + break ; + if ( subexons[k].leftStrand != 0 ) + ++leftStrandCnt[ ( subexons[k].leftStrand + 1 ) / 2 ] ; + } + if ( k < j && leftStrandCnt[ ( subexons[k].rightStrand + 1 ) / 2 ] == 0 ) + subexons[i].canBeStart = true ; + } + + if ( subexons[j - 1].canBeEnd == false ) + { + int rightStrandCnt[2] = {0, 0} ; + for ( k = j - 1 ; k >= i ; --k ) + { + if ( !SubexonGraph::IsSameStrand( subexons[k].leftStrand, subexons[j - 1].rightStrand ) ) + break ; + if ( subexons[k].rightStrand != 0 ) + ++rightStrandCnt[ ( subexons[k].rightStrand + 1 ) / 2 ] ; + } + if ( k >= i && rightStrandCnt[ ( subexons[k].leftStrand + 1 ) / 2 ] == 0 ) + subexons[j - 1].canBeEnd = true ; + } + + //if ( subexons[i].start == 6870264) + // printf( "hi %d %d\n",i , subexons[i].canBeStart ) ; + i = j ; + } + /*for ( i = 0 ; i < seCnt ; ++i ) + { + printf( "%d %d: %d %d\n", subexons[i].start, subexons[i].end, subexons[i].canBeStart, subexons[i].canBeEnd ) ; + }*/ + + // Find the gene ids. 
+ baseGeneId = subexons[0].lcCnt ; + usedGeneId = subexons[0].rcCnt ; + defaultGeneId[0] = defaultGeneId[1] = -1 ; + for ( i = 0 ; i < seCnt ; ++i ) + { + if ( subexons[i].geneId < 0 ) + continue ; + + //if ( baseGeneId == -1 || subexons[i].geneId < baseGeneId ) + // baseGeneId = subexons[i].geneId ; + //if ( subexons[i].geneId > usedGeneId ) + // usedGeneId = subexons[i].geneId ; + + if ( ( subexons[i].rightStrand == -1 || subexons[i].leftStrand == -1 ) && + ( defaultGeneId[0] == -1 || subexons[i].geneId < defaultGeneId[0] ) ) + defaultGeneId[0] = subexons[i].geneId ; + if ( ( subexons[i].rightStrand == 1 || subexons[i].leftStrand == 1 ) && + ( defaultGeneId[1] == -1 || subexons[i].geneId < defaultGeneId[1] ) ) + defaultGeneId[1] = subexons[i].geneId ; + } + if ( defaultGeneId[0] == -1 ) + defaultGeneId[0] = baseGeneId ; + if ( defaultGeneId[1] == -1 ) + defaultGeneId[1] = usedGeneId - 1 ; + + // Go through the constraints to find the chain of subexons that should be kept. + std::map *subexonChainSupport = new std::map[ seCnt ] ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + std::vector subexonIdx ; + std::vector chain ; + + int tcCnt = constraints[i].constraints.size() ; + int size ; + for ( j = 0 ; j < tcCnt ; ++j ) + { + struct _constraint c = constraints[i].constraints[j] ; + if ( c.uniqSupport < 0.95 * c.support || c.support < 3 ) + continue ; + + subexonIdx.clear() ; + c.vector.GetOnesIndices( subexonIdx ) ; + size = subexonIdx.size() ; + + for ( k = 0 ; k < size - 1 ; ++k ) + { + struct _pair32 p ; + + p.a = subexonIdx[k] ; + p.b = subexonIdx[k + 1] ; + //if ( subexons[p.a].end + 1 == 113235898 && subexons[ p.b ].start + 1 == 113236121 ) + // printf( "bad bad %d %d %d\n", i, c.uniqSupport, c.support ) ; + + if ( subexons[ p.a ].end + 1 < subexons[ p.b ].start ) + chain.push_back( p ) ; + } + } + // Remove redundancy. 
+ sort( chain.begin(), chain.end(), CompSortPairs ) ; + size = chain.size() ; + k = 0 ; + for ( j = 1 ; j < size ; ++j ) + { + if ( chain[j].a == chain[k].a && chain[j].b == chain[k].b ) + continue ; + else + { + ++k ; + chain[k] = chain[j] ; + } + } + chain.resize( k + 1 ) ; + + // Add those to sample count + size = k + 1 ; + for ( j = 0 ; j < size ; ++j ) + { + if ( subexonChainSupport[ chain[j].a ].count( chain[j].b ) ) + { + ++subexonChainSupport[ chain[j].a ][ chain[j].b ] ; + } + else + subexonChainSupport[ chain[j].a ][ chain[j].b ] = 1 ; + } + } + + /*for ( i = 0 ; i < seCnt ; ++i ) + { + printf( "%d:", i ) ; + for ( std::map::iterator it = subexonChainSupport[i].begin() ; it != subexonChainSupport[i].end() ; ++it ) + printf( " (%d %d) ", it->first, it->second ) ; + printf( "\n" ) ; + }*/ + + //printf( "%d %d %d\n", defaultGeneId[0], baseGeneId, usedGeneId ) ; + cnt = 0 ; + memset( f, -1, sizeof( int ) * seCnt ) ; + for ( i = 0 ; i < seCnt ; ++i ) + { + if ( subexons[i].canBeStart ) + { + cnt += SubTranscriptCount( i, subexons, f ) ; + } + } + if ( cnt <= USE_DP ) + { + for ( i = 0 ; i < seCnt ; ++i ) + if ( f[i] > USE_DP ) + { + useDP = true ; + break ; + } + } + else + useDP = true ; + if ( !useDP ) + { + for ( i = 0 ; i < sampleCnt ; ++i ) + { + double msize = constraints[i].matePairs.size() ; + double csize = constraints[i].constraints.size() ; + if ( cnt > ( csize / msize ) * ( csize / msize ) * seCnt + && cnt > USE_DP / ( msize * msize ) && cnt > 50 ) + { + useDP = true ; + break ; + } + } + } + + int atCnt = cnt ; + printf( "%d: atCnt=%d seCnt=%d %d %d %d\n", subexons[0].start + 1, atCnt, seCnt, useDP, (int)constraints[0].constraints.size(), (int)constraints[0].matePairs.size() ) ; + fflush( stdout ) ; + std::vector alltranscripts ; + + if ( !useDP ) + { + int origSize = atCnt ; + alltranscripts.resize( atCnt ) ; + for ( i = 0 ; i < atCnt ; ++i ) + { + alltranscripts[i].seVector.Init( seCnt ) ; + alltranscripts[i].correlationScore = 1 ; + } + + atCnt 
= 0 ; + for ( i = 0 ; i < seCnt ; ++i ) + { + if ( subexons[i].canBeStart ) + EnumerateTranscript( i, 0, f, 0, subexons, subexonCorrelation, 1, alltranscripts, atCnt ) ; + } + + for ( i = atCnt ; i < origSize ; ++i ) + alltranscripts[i].seVector.Release() ; + + alltranscripts.resize( atCnt ) ; + //printf( "transcript cnt: %d\n", atCnt ) ; + //printf( "%d %d\n", alltranscripts[0].seVector.Test( 1 ), constraints[0].matePairs.size() ) ; + } + else // Use dynamic programming to pick a set of candidate transcript. + { + std::vector sampleTranscripts ; + + // pre allocate the memory. + struct _dpAttribute attr ; + attr.f1 = new struct _dp[seCnt] ; + if ( seCnt <= 10000 ) + { + attr.f2 = new struct _dp*[seCnt] ; + for ( i = 0 ; i < seCnt ; ++i ) + attr.f2[i] = new struct _dp[seCnt] ; + } + else + attr.f2 = NULL ; + + hashMax = HASH_MAX ; + if (seCnt > 500) + hashMax = 1000003 ; + else if (seCnt > 1000) + hashMax = 10000019 ; + else if (seCnt > 1500) + hashMax = 20000003 ; + + attr.hash = dpHash ; + if ( hashMax != HASH_MAX ) + attr.hash = new struct _dp[hashMax] ; + + for ( i = 0 ; i < seCnt ; ++i ) + { + attr.f1[i].seVector.Nullify() ; + attr.f1[i].seVector.Init( seCnt ) ; + for ( j = i ; j < seCnt ; ++j ) + { + attr.f2[i][j].seVector.Nullify() ; + attr.f2[i][j].seVector.Init( seCnt ) ; + } + } + for ( i = 0 ; i < hashMax ; ++i ) + { + attr.hash[i].seVector.Nullify() ; + attr.hash[i].seVector.Init( seCnt ) ; + } + + // select candidate transcripts from each sample. 
+ struct _pair32 *sampleComplexity = new struct _pair32[ sampleCnt ] ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + sampleComplexity[i].a = i ; + sampleComplexity[i].b = constraints[i].constraints.size() ; + } + qsort( sampleComplexity, sampleCnt, sizeof( sampleComplexity[0] ), CompPairsByB ) ; + int downsampleCnt = -1 ; + + for ( i = sampleCnt - 1 ; i >= 0 ; --i ) + { + sampleTranscripts.clear() ; + int iterBound = constraints[ sampleComplexity[i].a ].constraints.size() ; + if ( i < sampleCnt - 1 ) + iterBound = 100 ; + + if ( i < sampleCnt - 10 && alltranscripts.size() > 1000 ) + iterBound = 10 ; + //printf( "%d %d: %d %d %d %d\n", subexons[0].start + 1, sampleComplexity[i].a, constraints[ sampleComplexity[i].a ].constraints.size(), constraints[ sampleComplexity[i].a ].matePairs.size(), + // alltranscripts.size(), iterBound ) ; fflush( stdout ) ; + if ( maxDpConstraintSize > 0 ) + { + Constraints truncatedConstraints ; + truncatedConstraints.TruncateConstraintsCoverFrom( constraints[ sampleComplexity[i].a ], seCnt, maxDpConstraintSize ) ; + PickTranscriptsByDP( subexons, seCnt, iterBound, truncatedConstraints, + subexonCorrelation, attr, sampleTranscripts ) ; + } + else if ( ( constraints[ sampleComplexity[i].a ].constraints.size() > 1000 + && constraints[ sampleComplexity[i].a ].constraints.size() * 10 < constraints[ sampleComplexity[i].a ].matePairs.size() ) + || ( downsampleCnt > 0 && (int)constraints[ sampleComplexity[i].a ].constraints.size() >= downsampleCnt ) + || seCnt >= 1500 ) + { + Constraints downsampledConstraints ; + int stride = (int)constraints[ sampleComplexity[i].a ].matePairs.size() / (int)constraints[ sampleComplexity[i].a ].constraints.size() ; + if ( downsampleCnt > 0 ) + stride = (int)constraints[ sampleComplexity[i].a ].constraints.size() / downsampleCnt ; + if ( stride < 1 ) + stride = 1 ; + downsampledConstraints.DownsampleConstraintsFrom( constraints[ sampleComplexity[i].a ], stride ) ; + if ( downsampleCnt <= 0 ) + downsampleCnt = 
downsampledConstraints.constraints.size() ; + if ( iterBound <= 10 ) + continue ; + PickTranscriptsByDP( subexons, seCnt, iterBound, downsampledConstraints, subexonCorrelation, attr, sampleTranscripts ) ; + } + else + { + PickTranscriptsByDP( subexons, seCnt, iterBound, constraints[ sampleComplexity[i].a ], subexonCorrelation, attr, sampleTranscripts ) ; + } + int size = sampleTranscripts.size() ; + for ( j = 0 ; j < size ; ++j ) + alltranscripts.push_back( sampleTranscripts[j] ) ; + + // we can further pick a smaller subsets of transcripts here if the number is still to big. + CoalesceSameTranscripts( alltranscripts ) ; + + AugmentTranscripts( subexons, alltranscripts, 1000, false ) ; + } + + // release the memory. + delete[] sampleComplexity ; + for ( i = 0 ; i < seCnt ; ++i ) + { + attr.f1[i].seVector.Release() ; + for ( j = i ; j < seCnt && attr.f2 ; ++j ) + attr.f2[i][j].seVector.Release() ; + } + for ( i = 0 ; i < hashMax ; ++i ) + attr.hash[i].seVector.Release() ; + + delete[] attr.f1 ; + for ( i = 0 ; i < seCnt && attr.f2 ; ++i ) + delete[] attr.f2[i] ; + delete[] attr.f2 ; + if (hashMax != HASH_MAX) + delete[] attr.hash ; + + } + + transcriptId = new int[usedGeneId - baseGeneId] ; + std::vector *predTranscripts = new std::vector[sampleCnt] ; + + atCnt = alltranscripts.size() ; + for ( i = 0 ; i < atCnt ; ++i ) + alltranscripts[i].FPKM = 0 ; + + for ( i = 0 ; i < sampleCnt ; ++i ) + { + int size = alltranscripts.size() ; + for ( j = 0 ; j < size ; ++j ) + alltranscripts[j].abundance = -1 ; + //printf( "pick: %d: %d %d\n", i, constraints[i].matePairs.size(), alltranscripts.size() ) ; + PickTranscripts( subexons, alltranscripts, constraints[i], subexonCorrelation, predTranscripts[i] ) ; + + /*double tmp = FPKMFraction ; + FPKMFraction = 0 ; + size = predTranscripts.size() ; + for ( j = 0 ; j < size ; ++j ) + { + ConvertTranscriptAbundanceToFPKM( subexons, predTranscripts[j] ) ; + } + RefineTranscripts( subexons, seCnt, predTranscripts, constraints[i] ) ; + 
FPKMFraction = tmp ;*/ + + } + + atCnt = alltranscripts.size() ; + int *txptSampleSupport = new int[atCnt] ; + memset( txptSampleSupport, 0, sizeof( int ) * atCnt ) ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + int size = predTranscripts[i].size() ; + for ( j = 0 ; j < size ; ++j ) + { + ++txptSampleSupport[ predTranscripts[i][j].id ] ; + ++alltranscripts[ predTranscripts[i][j].id ].FPKM ; + } + } + + for ( i = 0 ; i < sampleCnt ; ++i ) + { + int size = alltranscripts.size() ; + for ( j = 0 ; j < size ; ++j ) + alltranscripts[j].abundance = -1 ; + //printf( "pick: %d: %d %d\n", i, constraints[i].matePairs.size(), alltranscripts.size() ) ; + + size = predTranscripts[i].size() ; + for ( j = 0 ; j < size ; ++j ) + { + predTranscripts[i][j].seVector.Release() ; + } + predTranscripts[i].clear() ; + PickTranscripts( subexons, alltranscripts, constraints[i], subexonCorrelation, predTranscripts[i] ) ; + } + + std::vector *rawPredTranscriptIds = new std::vector[sampleCnt] ; + std::vector *rawPredTranscriptAbundance = new std::vector[sampleCnt] ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + int size = predTranscripts[i].size() ; + + for ( j = 0 ; j < size ; ++j ) + { + rawPredTranscriptIds[i].push_back( predTranscripts[i][j].id ) ; + rawPredTranscriptAbundance[i].push_back( predTranscripts[i][j].abundance ) ; + } + } + + // Do the filtration. + for ( i = 0 ; i < sampleCnt ; ++i ) + { + int size = predTranscripts[i].size() ; + for ( j = 0 ; j < size ; ++j ) + { + ConvertTranscriptAbundanceToFPKM( subexons, predTranscripts[i][j] ) ; + } + size = RefineTranscripts( subexons, seCnt, false, subexonChainSupport, txptSampleSupport, predTranscripts[i], constraints[i] ) ; + + // Recompute the abundance. 
+ AbundanceEstimation( subexons, seCnt, constraints[i], predTranscripts[i] ) ; + for ( j = 0 ; j < size ; ++j ) + ConvertTranscriptAbundanceToFPKM( subexons, predTranscripts[i][j] ) ; + size = RefineTranscripts( subexons, seCnt, true, subexonChainSupport, txptSampleSupport, predTranscripts[i], constraints[i] ) ; + + //ComputeTranscriptsScore( subexons, seCnt, subexonChainSupport, predTranscripts[i] ) ; + } + + // Rescue some filtered transcripts + memset( txptSampleSupport, 0, sizeof( int ) * atCnt ) ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + int size = predTranscripts[i].size() ; + for ( j = 0 ; j < size ; ++j ) + { + ++txptSampleSupport[ predTranscripts[i][j].id ] ; + } + } + + bool *predicted = new bool[atCnt] ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + memset( predicted, false, sizeof( bool ) * atCnt ) ; + if ( predTranscripts[i].size() != rawPredTranscriptIds[i].size() ) + { + int psize = predTranscripts[i].size() ; + int rsize = rawPredTranscriptIds[i].size() ; + int tcCnt = constraints[i].matePairs.size() ; + + for ( j = 0 ; j < psize ; ++j ) + predicted[ predTranscripts[i][j].id ] = true ; + + for ( j = 0 ; j < rsize ; ++j ) + { + int id = rawPredTranscriptIds[i][j] ; + if ( predicted[ id ] == false && + ( txptSampleSupport[ id ] >= 3 && txptSampleSupport[id] >= 0.25 * sampleCnt ) ) + { + struct _transcript nt = alltranscripts[id] ; + nt.seVector.Nullify() ; + nt.seVector.Duplicate( alltranscripts[id].seVector ) ; + nt.constraintsSupport = NULL ; + nt.correlationScore = -1 ; + nt.abundance = rawPredTranscriptAbundance[i][j] ; + nt.id = id ; + predTranscripts[i].push_back( nt ) ; + } + } + if ( psize != predTranscripts[i].size() ) + AbundanceEstimation( subexons, seCnt, constraints[i], predTranscripts[i] ) ; + } + + int size = predTranscripts[i].size() ; + + if ( 0 ) //size == 1 ) + { + //AugmentTranscripts( subexons, predTranscripts[i], false ) ; + + int l = predTranscripts[i].size() ; + int tcCnt = constraints[i].matePairs.size() ; + for ( j = 0 ; j < 
l ; ++j ) + { + predTranscripts[i][j].abundance = 1.0 / alignments.readLen ; + } + AbundanceEstimation( subexons, seCnt, constraints[i], predTranscripts[i] ) ; + + std::vector subexonIdx ; + for ( j = 0 ; j < l ; ++j ) + { + subexonIdx.clear() ; + predTranscripts[i][j].seVector.GetOnesIndices( subexonIdx ) ; + int subexonIdxCnt = subexonIdx.size() ; + int len = 0 ; + for ( k = 0 ; k < subexonIdxCnt ; ++k ) + len += subexons[ subexonIdx[k] ].end - subexons[ subexonIdx[k] ].start + 1 ; + + if ( predTranscripts[i][j].abundance * alignments.readLen / len < 2.0 ) + predTranscripts[i][j].abundance = -1 ; + else + ConvertTranscriptAbundanceToFPKM( subexons, predTranscripts[i][j] ) ; + + } + RemoveNegativeAbundTranscripts( predTranscripts[i] ) ; + } + + // Output + size = predTranscripts[i].size() ; + InitTranscriptId() ; + for ( j = 0 ; j < size ; ++j ) + { + OutputTranscript( i, subexons, predTranscripts[i][j] ) ; + } + for ( j = 0 ; j < size ; ++j ) + { + predTranscripts[i][j].seVector.Release() ; + } + } + + delete []predicted ; + delete []transcriptId ; + delete []predTranscripts ; + delete []rawPredTranscriptIds ; + delete []rawPredTranscriptAbundance ; + delete []txptSampleSupport ; + + atCnt = alltranscripts.size() ; + for ( i = 0 ; i < atCnt ; ++i ) + alltranscripts[i].seVector.Release() ; + compatibleTestVectorT.Release() ; + compatibleTestVectorC.Release() ; + delete[] f ; + delete[] subexonChainSupport ; + return 0 ; +} + +void *TranscriptDeciderSolve_Wrapper( void *a ) +{ + int i ; + + struct _transcriptDeciderThreadArg &arg = *( (struct _transcriptDeciderThreadArg *)a ) ; + TranscriptDecider transcriptDecider( arg.FPKMFraction, arg.classifierThreshold, arg.txptMinReadDepth, arg.sampleCnt, *( arg.alignments ) ) ; + transcriptDecider.SetNumThreads( arg.numThreads + 1 ) ; + transcriptDecider.SetMultiThreadOutputHandler( arg.outputHandler ) ; + transcriptDecider.SetMaxDpConstraintSize( arg.maxDpConstraintSize ) ; + transcriptDecider.Solve( arg.subexons, 
arg.seCnt, arg.constraints, arg.subexonCorrelation ) ; + + int start = arg.subexons[0].start ; + int end = arg.subexons[ arg.seCnt - 1 ].end ; + int chrId = arg.subexons[0].chrId ; + // Release memory + for ( i = 0 ; i < arg.seCnt ; ++i ) + { + delete[] arg.subexons[i].prev ; + delete[] arg.subexons[i].next ; + } + delete[] arg.subexons ; + + // Put the work id back to the free threads queue. + pthread_mutex_lock( arg.ftLock ) ; + arg.freeThreads[ *( arg.ftCnt ) ] = arg.tid ; + ++*( arg.ftCnt ) ; + if ( *( arg.ftCnt ) == 1 ) + pthread_cond_signal( arg.fullWorkCond ) ; + pthread_mutex_unlock( arg.ftLock) ; + printf( "Thread %d: %s %d %d finished.\n", arg.tid, arg.alignments->GetChromName(chrId), start + 1, end + 1 ) ; + fflush( stdout ) ; + + + pthread_exit( NULL ) ; +} diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/TranscriptDecider.hpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/TranscriptDecider.hpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,544 @@ +#ifndef _MOURISL_CLASSES_TRANSCRIPTDECIDER_HEADER +#define _MOURISL_CLASSES_TRANSCRIPTDECIDER_HEADER + +#include +#include +#include +#include + +#include "alignments.hpp" +#include "SubexonGraph.hpp" +#include "SubexonCorrelation.hpp" +#include "BitTable.hpp" +#include "Constraints.hpp" + +#define HASH_MAX 100003 // default HASH_MAX +#define USE_DP 200000 + +struct _transcript +{ + BitTable seVector ; + double abundance ; + double correlationScore ; + double FPKM ; + double *constraintsSupport ; // record the assign ment of constraints. + + int first, last ; // indicate the index of the first and last subexons. + bool partial ; // wehther this is a partial transcript. + int id ; // the id for various usage: i.e transcript index in the alltranscripts. 
+} ; + +struct _outputTranscript +{ + int chrId ; + int geneId, transcriptId ; + struct _pair32 *exons ; + int ecnt ; + char strand ; + int sampleId ; + + double FPKM ; + double TPM ; + double cov ; +} ; + +struct _dp +{ + BitTable seVector ; + int first, last ; + // The "cnt" is for the hash structure. + // the first cnt set bits represent the subexons that are the key of the hash + // the remaining set bits are the optimal subtranscript follow the key. + int cnt ; + double cover ; + + double minAbundance ; + int timeStamp ; + int strand ; +} ; + + +struct _dpAttribute +{ + struct _dp *f1, **f2 ; + struct _dp *hash ; + + struct _transcript bufferTxpt ; + + bool forAbundance ; + + struct _subexon *subexons ; + int seCnt ; + + std::map uncoveredPair ; + + double minAbundance ; + int timeStamp ; +} ; + +class MultiThreadOutputTranscript ; + +struct _transcriptDeciderThreadArg +{ + int tid ; + struct _subexon *subexons ; + int seCnt ; + int sampleCnt ; + int numThreads ; + + int maxDpConstraintSize ; + double FPKMFraction, classifierThreshold, txptMinReadDepth ; + Alignments *alignments ; + std::vector constraints ; + SubexonCorrelation subexonCorrelation ; + MultiThreadOutputTranscript *outputHandler ; + + int *freeThreads ; // the stack for free threads + int *ftCnt ; + pthread_mutex_t *ftLock ; + pthread_cond_t *fullWorkCond ; +} ; + +class MultiThreadOutputTranscript +{ +private: + std::vector outputQueue ; + pthread_t *threads ; + pthread_mutex_t outputLock ; + int sampleCnt ; + int numThreads ; + std::vector outputFPs ; + Alignments &alignments ; + +public: + static int CompTranscripts( const struct _outputTranscript &a, const struct _outputTranscript &b ) + { + int i ; + if ( a.geneId != b.geneId ) + return a.geneId - b.geneId ; + if ( a.ecnt != b.ecnt ) + return a.ecnt - b.ecnt ; + + for ( i = 0 ; i < a.ecnt ; ++i ) + { + if ( a.exons[i].a != b.exons[i].a ) + return a.exons[i].a - b.exons[i].a ; + + if ( a.exons[i].b != b.exons[i].b ) + return a.exons[i].b - 
b.exons[i].b ; + } + return 0 ; + } + + static bool CompSortTranscripts( const struct _outputTranscript &a, const struct _outputTranscript &b ) + { + int tmp = CompTranscripts( a, b ) ; + if ( tmp < 0 ) + return true ; + else if ( tmp > 0 ) + return false ; + else + return a.sampleId < b.sampleId ; + } + + MultiThreadOutputTranscript( int cnt, Alignments &a ): alignments( a ) + { + sampleCnt = cnt ; + pthread_mutex_init( &outputLock, NULL ) ; + } + ~MultiThreadOutputTranscript() + { + pthread_mutex_destroy( &outputLock ) ; + int i ; + for ( i = 0 ; i < sampleCnt ; ++i ) + fclose( outputFPs[i] ) ; + } + + void SetThreadsPointer( pthread_t *t, int n ) + { + threads = t ; + numThreads = n ; + } + + void SetOutputFPs( char *outputPrefix ) + { + int i ; + char buffer[1024] ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + if ( outputPrefix[0] ) + sprintf( buffer, "%s_sample_%d.gtf", outputPrefix, i ) ; + else + sprintf( buffer, "sample_%d.gtf", i ) ; + FILE *fp = fopen( buffer, "w" ) ; + outputFPs.push_back( fp ) ; + } + } + + void Add( struct _outputTranscript &t ) + { + pthread_mutex_lock( &outputLock ) ; + outputQueue.push_back( t ) ; + pthread_mutex_unlock( &outputLock ) ; + } + + void Add_SingleThread( struct _outputTranscript &t ) + { + outputQueue.push_back( t ) ; + } + + void ComputeFPKMTPM( std::vector &alignmentFiles ) + { + int i ; + int qsize = outputQueue.size() ; + double *totalFPK = new double[ sampleCnt ] ; + memset( totalFPK, 0, sizeof( double ) * sampleCnt ) ; + for ( i = 0 ; i < qsize ; ++i ) + { + totalFPK[ outputQueue[i].sampleId ] += outputQueue[i].FPKM ; + } + + for ( i = 0 ; i < qsize ; ++i ) + { + outputQueue[i].TPM = outputQueue[i].FPKM / ( totalFPK[ outputQueue[i].sampleId ] / 1000000.0 ) ; + outputQueue[i].FPKM /= ( alignmentFiles[ outputQueue[i].sampleId ].totalReadCnt / 1000000.0 ) ; + } + + delete[] totalFPK ; + } + + void OutputCommandInfo( int argc, char *argv[] ) + { + int i ; + int j ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + fprintf( 
outputFPs[i], "#PsiCLASS_v1.0.1\n#" ) ; + for ( j = 0 ; j < argc - 1 ; ++j ) + { + fprintf( outputFPs[i], "%s ", argv[j] ) ; + } + fprintf( outputFPs[i], "%s\n", argv[j] ) ; + } + } + + void OutputCommentToSampleGTF( int sampleId, char *s ) + { + fprintf( outputFPs[ sampleId ], "#%s\n", s ) ; + } + + void Flush() + { + std::sort( outputQueue.begin(), outputQueue.end(), CompSortTranscripts ) ; + int i, j ; + int qsize = outputQueue.size() ; + char prefix[10] = "" ; + + // Recompute the transcript id + int gid = -1 ; + int tid = 0 ; + for ( i = 0 ; i < qsize ; ) + { + for ( j = i + 1 ; j < qsize ; ++j ) + { + if ( CompTranscripts( outputQueue[i], outputQueue[j] ) ) + break ; + } + int l ; + if ( outputQueue[i].geneId != gid ) + { + gid = outputQueue[i].geneId ; + tid = 0 ; + } + else + ++tid ; + + for ( l = i ; l < j ; ++l ) + outputQueue[l].transcriptId = tid ; + + i = j ; + } + + // output + for ( i = 0 ; i < qsize ; ++i ) + { + struct _outputTranscript &t = outputQueue[i] ; + char *chrom = alignments.GetChromName( t.chrId ) ; + + fprintf( outputFPs[t.sampleId], "%s\tPsiCLASS\ttranscript\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; transcript_id \"%s%s.%d.%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"%.6lf\";\n", + chrom, t.exons[0].a, t.exons[t.ecnt - 1].b, t.strand, + prefix, chrom, t.geneId, + prefix, chrom, t.geneId, t.transcriptId, t.FPKM, t.TPM, t.cov ) ; + for ( j = 0 ; j < t.ecnt ; ++j ) + { + fprintf( outputFPs[ t.sampleId ], "%s\tPsiCLASS\texon\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; " + "transcript_id \"%s%s.%d.%d\"; exon_number \"%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"\%.6lf\";\n", + chrom, t.exons[j].a, t.exons[j].b, t.strand, + prefix, chrom, t.geneId, + prefix, chrom, t.geneId, t.transcriptId, + j + 1, t.FPKM, t.TPM, t.cov ) ; + } + delete []t.exons ; + } + } +} ; + +class TranscriptDecider +{ +private: + int sampleCnt ; + int numThreads ; + double FPKMFraction ; + double txptMinReadDepth ; + int hashMax ; + int maxDpConstraintSize ; + + 
Constraints *constraints ; + //struct _subexon *subexons ; + //int seCnt ; + + int usedGeneId ; + int baseGeneId, defaultGeneId[2] ; + + int *transcriptId ; // the next transcript id for each gene id (we shift the gene id to 0 in this array.) + Alignments &alignments ; // for obtain the chromosome names. + + std::vector outputFPs ; + + BitTable compatibleTestVectorT, compatibleTestVectorC ; + double canBeSoftBoundaryThreshold ; + + MultiThreadOutputTranscript *outputHandler ; + + // Test whether subexon tag is a start subexon in a mixture region that corresponds to the start of a gene on another strand. + bool IsStartOfMixtureStrandRegion( int tag, struct _subexon *subexons, int seCnt ) ; + + // The functions to pick transcripts through dynamic programming + struct _dp *dpHash ; + void SearchSubTranscript( int tag, int strand, int parents[], int pcnt, struct _dp &pdp, int visit[], int vcnt, int extends[], int extendCnt, std::vector &tc, int tcStartInd, struct _dpAttribute &attr ) ; + struct _dp SolveSubTranscript( int visit[], int vcnt, int strand, std::vector &tc, int tcStartInd, struct _dpAttribute &attr ) ; + void PickTranscriptsByDP( struct _subexon *subexons, int seCnt, int iterBound, Constraints &constraints, SubexonCorrelation &correlation, struct _dpAttribute &attr, std::vector &allTranscripts ) ; + + void SetDpContent( struct _dp &a, struct _dp &b, const struct _dpAttribute &attr ) + { + a.seVector.Assign( b.seVector ) ; + a.first = b.first ; + a.last = b.last ; + a.cnt = b.cnt ; + a.cover = b.cover ; + + a.strand = b.strand ; + a.minAbundance = attr.minAbundance ; + a.timeStamp = attr.timeStamp ; + } + + void ResetDpContent( struct _dp &d ) + { + d.seVector.Reset() ; + d.first = -1 ; + d.last = -1 ; + d.cnt = -1 ; + d.cover = -1 ; + d.minAbundance = -1 ; + d.timeStamp = -1 ; + } + + void AugmentTranscripts( struct _subexon *subexons, std::vector &alltranscripts, int limit, bool extend ) ; + // Test whether a constraints is compatible with the transcript. 
+ // Return 0 - uncompatible or does not overlap at all. 1 - fully compatible. 2 - Head of the constraints compatible with the tail of the transcript + int IsConstraintInTranscript( struct _transcript transcript, struct _constraint &c ) ; + int IsConstraintInTranscriptDebug( struct _transcript transcript, struct _constraint &c ) ; + + // Count how many transcripts are possible starting from subexons[tag]. + int SubTranscriptCount( int tag, struct _subexon *subexons, int f[] ) ; + + // The methods when there is no need for DP + void EnumerateTranscript( int tag, int strand, int visit[], int vcnt, struct _subexon *subexons, SubexonCorrelation &correlation, double correlationScore, std::vector &alltranscripts, int &atcnt ) ; + // For the simpler case, we can pick sample by sample. + void PickTranscripts( struct _subexon *subexons, std::vector &alltranscripts, Constraints &constraints, SubexonCorrelation &seCorrelation, std::vector &transcripts ) ; + + static bool CompSortTranscripts( const struct _transcript &a, const struct _transcript &b ) + { + if ( a.first < b.first ) + return true ; + else if ( a.first > b.first ) + return false ; + + int diffPos = a.seVector.GetFirstDifference( b.seVector ) ; + if ( diffPos == -1 ) + return false ; + if ( a.seVector.Test( diffPos ) ) + return true ; + else + return false ; + } + + static bool CompSortPairs( const struct _pair32 &x, const struct _pair32 &y ) + { + if ( x.a != y.a ) + return x.a < y.a ; + else + return x.b < y.b ; + } + + static bool CompSortPairsByB( const struct _pair32 &x, const struct _pair32 &y ) + { + return x.b < y.b ; + } + + static int CompPairsByB( const void *p1, const void *p2 ) + { + return ((struct _pair32 *)p1)->b - ((struct _pair32 *)p2)->b ; + } + + double ComputeScore( double cnt, double weight, double a, double A, double correlation ) + { + if ( a > A * 0.1 ) + return ( cnt * weight ) * ( 1 + pow( a / A, 0.25 ) ) + correlation ; + else + return ( cnt * weight ) * ( 1 + a / A ) + correlation ; + 
//return ( cnt ) * ( exp( 1 + a / A ) ) + correlation ; + } + + int GetFather( int f, int *father ) ; + + void ConvertTranscriptAbundanceToFPKM( struct _subexon *subexons, struct _transcript &t, int readCnt = 1000000 ) + { + int txptLen = 0 ; + int i, size ; + + std::vector subexonInd ; + t.seVector.GetOnesIndices( subexonInd ) ; + size = subexonInd.size() ; + for ( i = 0 ; i < size ; ++i ) + txptLen += ( subexons[ subexonInd[i] ].end - subexons[ subexonInd[i] ].start + 1 ) ; + double factor = 1 ; + if ( alignments.matePaired ) + factor = 0.5 ; + t.FPKM = t.abundance * factor / ( ( readCnt / 1000000.0 ) * ( txptLen / 1000.0 ) ) ; + } + + int GetTranscriptLengthFromAbundanceAndFPKM( double abundance, double FPKM, int readCnt = 1000000 ) + { + double factor = 1 ; + if ( alignments.matePaired ) + factor = 0.5 ; + return int( abundance * factor / ( FPKM / 1000.0 ) / ( readCnt / 1000000.0 ) + 0.5 ) ; + } + + void CoalesceSameTranscripts( std::vector &t ) ; + + + // Initialize the structure to store transcript id + void InitTranscriptId() ; + + int GetTranscriptGeneId( std::vector &subexonInd, struct _subexon *subexons ) ; + int GetTranscriptGeneId( struct _transcript &t, struct _subexon *subexons ) ; + int RemoveNegativeAbundTranscripts( std::vector &transcripts ) + { + int i, j ; + int tcnt = transcripts.size() ; + j = 0 ; + for ( i = 0 ; i < tcnt ; ++i ) + { + if ( transcripts[i].abundance < 0 ) + { + transcripts[i].seVector.Release() ; // Don't forget release the memory. 
+				continue ;
+			}
+			transcripts[j] = transcripts[i] ;
+			++j ;
+		}
+		transcripts.resize( j ) ;
+		return j ;
+	}
+
+	// NOTE(review): the std::vector / std::map template arguments in the declarations below
+	// are missing from this copy of the patch; confirm them against the upstream header.
+	void AbundanceEstimation( struct _subexon *subexons, int seCnt, Constraints &constraints, std::vector &transcripts ) ;
+
+	int RefineTranscripts( struct _subexon *subexons, int seCnt, bool aggressive, std::map *subexonChainSupport, int *txptSampleSupport, std::vector &transcripts, Constraints &constraints ) ;
+
+	void ComputeTranscriptsScore( struct _subexon *subexons, int seCnt, std::map *subexonChainSupport, std::vector &transcripts ) ;
+
+	void OutputTranscript( int sampleId, struct _subexon *subexons, struct _transcript &transcript ) ;
+
+	// printf-style logger: formats the message and writes "[<local time>] <message>" to stderr.
+	void PrintLog( const char *fmt, ... )
+	{
+		char buffer[10021] ;
+		va_list args ;
+		va_start( args, fmt ) ;
+		// Bounded formatting: an over-long message is truncated instead of overrunning buffer.
+		vsnprintf( buffer, sizeof( buffer ), fmt, args ) ;
+		va_end( args ) ;
+
+		time_t mytime = time(NULL) ;
+		struct tm *localT = localtime( &mytime ) ;
+		char stime[500] ;
+		// localtime may fail and strftime leaves the buffer unspecified when it returns 0;
+		// fall back to an empty time stamp in both cases.
+		if ( localT == NULL || strftime( stime, sizeof( stime ), "%c", localT ) == 0 )
+			stime[0] = '\0' ;
+		fprintf( stderr, "[%s] %s\n", stime, buffer ) ;
+	}
+
+
+public:
+	// f, c and d are stored as FPKMFraction, canBeSoftBoundaryThreshold and txptMinReadDepth;
+	// a is kept as the alignments reference.
+	TranscriptDecider( double f, double c, double d, int sampleCnt, Alignments &a ): alignments( a )
+	{
+		FPKMFraction = f ;
+		canBeSoftBoundaryThreshold = c ;
+		txptMinReadDepth = d ;
+		usedGeneId = 0 ;
+		defaultGeneId[0] = -1 ;
+		defaultGeneId[1] = -1 ;
+		maxDpConstraintSize = -1 ;
+		numThreads = 1 ;
+		outputHandler = NULL ; // stays unset until SetMultiThreadOutputHandler is called
+		this->sampleCnt = sampleCnt ;
+		dpHash = new struct _dp[ HASH_MAX ] ; // pre-allocated buffer to hold dp information. 
+ } + ~TranscriptDecider() + { + int i ; + if ( numThreads == 1 ) + { + int size = outputFPs.size() ; + for ( i = 0 ; i < size ; ++i ) + { + fclose( outputFPs[i] ) ; + } + } + delete[] dpHash ; + } + + + // @return: the number of assembled transcript + int Solve( struct _subexon *subexons, int seCnt, std::vector &constraints, SubexonCorrelation &subexonCorrelation ) ; + + void SetOutputFPs( char *outputPrefix ) + { + int i ; + char buffer[1024] ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + if ( outputPrefix[0] ) + sprintf( buffer, "%s_sample_%d.gtf", outputPrefix, i ) ; + else + sprintf( buffer, "sample_%d.gtf", i ) ; + FILE *fp = fopen( buffer, "w" ) ; + outputFPs.push_back( fp ) ; + } + } + + void SetMultiThreadOutputHandler( MultiThreadOutputTranscript *h ) + { + outputHandler = h ; + } + + void SetNumThreads( int t ) + { + numThreads = t ; + } + + void SetMaxDpConstraintSize(int size) + { + maxDpConstraintSize = size ; + } +} ; + +void *TranscriptDeciderSolve_Wrapper( void *arg ) ; + +#endif diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/Vote.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/Vote.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,432 @@ +// The program that vote to pick the trusted transcripts. +#include +#include +#include +#include +#include +#include +#include + +#include "defs.h" +#include "TranscriptDecider.hpp" + +char usage[] = "./transcript-vote [OPTIONS] > output.gtf:\n" + "Required:\n" + "\t--lg: path to the list of GTF files.\n" + "Optional:\n" + "\t-d FLOAT: threshold of average coverage depth across all the samples. (default: 1)\n" + //"\t-n INT: the number of samples a transcript showed up. 
(default: 3)\n" + ; + + +/*struct _transcript +{ + int chrId ; + int geneId ; + char strand ; + struct _pair32 *exons ; + int ecnt ; + int sampleId ; +} ;*/ + +void GetGTFField( char *s, const char *field, char *ret ) +{ + char *p = strstr( s, field ) ; + if ( p == NULL ) + return ; + for ( ; *p != ' ' ; ++p ) + ; + p += 2 ; // add extra 1 to skip \" + sscanf( p, "%s", ret ) ; + //printf( "+%s %d\n", tid, strlen( tid ) ) ; + p = ret + strlen( ret ) ; + while ( *p != '\"' ) + --p ; + *p = '\0' ; +} + +int GetTailNumber( char *s ) +{ + int len = strlen( s ) ; + int ret = 0 ; + int i ; + int factor = 1 ; + for ( i = len - 1 ; i >= 0 && s[i] >= '0' && s[i] <= '9' ; --i, factor *= 10 ) + { + ret += factor * ( s[i] - '0' ) ; + } + return ret ; +} + +int CompDouble( const void *p1, const void *p2 ) +{ + double a = *(double *)p1 ; + double b = *(double *)p2 ; + if ( a > b ) + return 1 ; + else if ( a < b ) + return -1 ; + else + return 0 ; +} + +int main( int argc, char *argv[] ) +{ + int i, j, k ; + double minAvgDepth = 1.0 ; + double fraction = 1.0 ; + int minSampleCnt = 3 ; + std::map chrNameToId ; + std::map chrIdToName ; + + FILE *fpGTFlist = NULL ; + FILE *fp = NULL ; + for ( i = 1 ; i < argc ; ++i ) + { + if ( !strcmp( argv[i], "--lg" ) ) + { + fpGTFlist = fopen( argv[i + 1], "r" ) ; + ++i ; + } + else if ( !strcmp( argv[i], "-d" ) ) + { + minAvgDepth = atof( argv[i + 1] ) ; + ++i ; + } + /*else if ( !strcmp( argv[i], "-n" ) ) + { + minSampleCnt = atoi( argv[i + 1] ) ; + ++i ; + }*/ + else + { + printf( "%s", usage ) ; + exit( 1 ) ; + } + } + if ( fpGTFlist == NULL ) + { + printf( "Must use --lg option to speicfy the list of GTF files.\n%s", usage ) ; + exit( 1 ) ; + } + + std::vector transcripts ; + std::vector outputTranscripts ; + + char buffer[4096] ; + char line[10000] ; + char chrom[50], tool[20], type[40], strand[3] ; + //char tid[50] ; + int start, end ; + std::vector tmpExons ; + int sampleCnt = 0 ; + int gid = -1 ; + int tid = -1 ; + int chrId = -1 ; + int 
chrIdUsed = 0 ;
+	char cStrand = '.' ;
+	char prefix[50] ;
+	double FPKM = 0, TPM = 0, cov = 0 ;
+
+	// Each line of the list file names one per-sample GTF file.
+	while ( fgets( buffer, sizeof( buffer ), fpGTFlist ) != NULL )
+	{
+		int len = strlen( buffer ) ;
+		if ( len > 0 && buffer[len - 1] == '\n' )
+		{
+			buffer[len - 1] = '\0' ;
+			--len ;
+		}
+		if ( len == 0 ) // blank line in the list: nothing to open
+			continue ;
+		fp = fopen( buffer, "r" ) ;
+		if ( fp == NULL )
+		{
+			// stdout carries the GTF output, so the diagnostic goes to stderr.
+			fprintf( stderr, "Can not open %s.\n", buffer ) ;
+			exit( 1 ) ;
+		}
+
+
+		while ( fgets( line, sizeof( line ), fp ) != NULL )
+		{
+			if ( line[0] == '#' )
+				continue ;
+
+			// Field widths match the destination arrays; a line without the seven leading
+			// columns is skipped rather than parsed with stale values.
+			if ( sscanf( line, "%49s %19s %39s %d %d %4095s %2s", chrom, tool, type, &start, &end, buffer, strand ) != 7 )
+				continue ;
+
+			if ( strcmp( type, "exon" ) )
+			{
+				// A non-exon record closes the transcript whose exons were collected so far.
+				if ( tmpExons.size() > 0 )
+				{
+					struct _outputTranscript nt ;
+					nt.sampleId = sampleCnt ;
+					nt.chrId = chrId ;
+					nt.geneId = gid ;
+					nt.transcriptId = tid ;
+					nt.strand = cStrand ;
+					nt.ecnt = tmpExons.size() ;
+					nt.exons = new struct _pair32[nt.ecnt] ;
+					nt.FPKM = FPKM ;
+					nt.TPM = TPM ;
+					nt.cov = cov ;
+					for ( i = 0 ; i < nt.ecnt ; ++i )
+						nt.exons[i] = tmpExons[i] ;
+					tmpExons.clear() ;
+					transcripts.push_back( nt ) ;
+				}
+				continue ;
+			}
+
+			if ( chrNameToId.find( std::string( chrom ) ) == chrNameToId.end() )
+			{
+				chrId = chrIdUsed ;
+
+				std::string s( chrom ) ;
+				chrNameToId[s] = chrIdUsed ;
+				chrIdToName[ chrIdUsed ] = s ;
+				++chrIdUsed ;
+			}
+			else
+				chrId = chrNameToId[ std::string( chrom ) ] ;
+
+			GetGTFField( line, "gene_id", buffer ) ;
+			gid = GetTailNumber( buffer ) ;
+
+			GetGTFField( line, "transcript_id", buffer ) ;
+			tid = GetTailNumber( buffer ) ;
+
+			GetGTFField( line, "FPKM", buffer ) ;
+			FPKM = atof( buffer ) ;
+
+			GetGTFField( line, "TPM", buffer ) ;
+			TPM = atof( buffer ) ;
+
+			GetGTFField( line, "cov", buffer ) ;
+			cov = atof( buffer ) ;
+
+			cStrand = strand[0] ;
+
+			// Look for the user-defined prefix.
+			// NOTE(review): at this point buffer holds the "cov" value, not an id - confirm
+			// which field the prefix is meant to be derived from.
+			int len = strlen( buffer ) ;
+			j = 0 ;
+			for ( i = len - 1 ; i >= 0 ; --i )
+			{
+				if ( buffer[i] == '.' )
+				{
+					++j ;
+					if ( j >= 2 )
+						break ;
+				}
+			}
+
+			// Leave room in prefix[] for the '.' and the terminator appended below.
+			if ( i > (int)sizeof( prefix ) - 2 )
+				i = (int)sizeof( prefix ) - 2 ;
+			// NOTE(review): the text between "j" and "0 )" was missing from this copy of the
+			// patch; the loop and test are reconstructed from the surrounding code.
+			for ( j = 0 ; j < i ; ++j )
+				prefix[j] = buffer[j] ;
+
+			if ( i > 0 )
+			{
+				prefix[j] = '.' 
; + prefix[j + 1] = '\0' ; + } + else + prefix[0] = '\0' ; + + struct _pair32 ne ; + ne.a = start ; + ne.b = end ; + tmpExons.push_back( ne ) ; + } + if ( tmpExons.size() > 0 ) + { + struct _outputTranscript nt ; + nt.sampleId = sampleCnt ; + nt.chrId = chrId ; + nt.geneId = gid ; + nt.transcriptId = tid ; + nt.strand = cStrand ; + nt.ecnt = tmpExons.size() ; + nt.exons = new struct _pair32[nt.ecnt] ; + nt.FPKM = FPKM ; + nt.TPM = TPM ; + nt.cov = cov ; + for ( i = 0 ; i < nt.ecnt ; ++i ) + nt.exons[i] = tmpExons[i] ; + tmpExons.clear() ; + transcripts.push_back( nt ) ; + } + + ++sampleCnt ; + fclose( fp ) ; + } + fclose( fpGTFlist ) ; + + // Coalesce same transcripts + std::sort( transcripts.begin(), transcripts.end(), MultiThreadOutputTranscript::CompSortTranscripts ) ; + std::vector sampleSupport ; + int size = transcripts.size() ; + if ( minSampleCnt > sampleCnt ) + minSampleCnt = sampleCnt ; + + for ( i = 0 ; i < size ; ) + { + int l ; + for ( j = i + 1 ; j < size ; ++j ) + { + if ( MultiThreadOutputTranscript::CompTranscripts( transcripts[i], transcripts[j] ) ) + break ; + } + // [i,j) are the same transcripts. 
+ /*if ( j - i >= fraction * sampleCnt && j - i >= minSampleCnt ) + { + outputTranscripts.push_back( transcripts[i] ) ; + }*/ + + double sumFPKM = 0 ; + double sumTPM = 0 ; + double sumCov = 0 ; + for ( l = i ; l < j ; ++l ) + { + sumFPKM += transcripts[l].FPKM ; + sumTPM += transcripts[l].TPM ; + sumCov += transcripts[l].cov ; + } + + transcripts[k] = transcripts[i] ; + for ( l = i + 1 ; l < j ; ++l ) + delete[] transcripts[l].exons ; + + transcripts[k].FPKM = sumFPKM / ( j - i ) ; + transcripts[k].TPM = sumTPM / ( j - i ) ; + transcripts[k].cov = sumCov / ( j - i ) ; + + if ( ( j - i < int( fraction * sampleCnt ) || j - i < minSampleCnt ) && sumCov < minAvgDepth * sampleCnt ) + //if ( transcripts[k].score * ( j - i ) < 1 && ( j - i ) < sampleCnt * 0.5 ) + //if ( sumTPM < sampleCnt || sumFPKM < sampleCnt ) + //if ( sumCov < minAvgDepthn * sampleCnt ) + { + transcripts[k].FPKM = -1 ; + } + else + sampleSupport.push_back( j - i ) ; + ++k ; + i = j ; + } + transcripts.resize( k ) ; + + // The motivation to separate the coalscing and voting is to allow + // a gene with no txpt passing the vote to pick a representative. 
+	size = k ;
+	// Walk the coalesced transcripts one gene at a time; [i,j) shares one geneId.
+	for ( i = 0 ; i < size ; )
+	{
+		// cnt = number of transcripts of this gene that passed the vote.
+		int cnt = ( transcripts[i].FPKM != -1 ) ? 1 : 0 ;
+
+		j = i + 1 ;
+		while ( j < size && transcripts[i].geneId == transcripts[j].geneId )
+		{
+			if ( transcripts[j].FPKM != -1 )
+				++cnt ;
+			++j ;
+		}
+		int l ;
+		/*double *freq = new double[cnt] ;
+		for ( l = 0 ; l < cnt ; ++l )
+			freq[l] = transcripts[l].FPKM / sampleCnt ;
+		qsort( freq, cnt, sizeof( double ), CompDouble ) ;
+
+		for ( l = 0 ; l < cnt ; ++l )
+			printf( "%lf\n", freq[l] ) ;
+		printf( "===========\n" ) ;
+		delete[] freq ;*/
+		/*if ( cnt == 0 )
+		{
+			int maxEcnt = -1 ;
+			int maxCnt = 0 ;
+			int maxtag ;
+			for ( l = i ; l < j ; ++l )
+				if ( transcripts[l].ecnt > maxEcnt )
+					maxEcnt = transcripts[l].ecnt ;
+
+			if ( maxEcnt >= 2 )
+			{
+				int maxSampleSupport = -1 ;
+				maxtag = i ;
+				for ( l = i ; l < j ; ++l )
+				{
+					if ( transcripts[l].ecnt > 1 && transcripts[l].ecnt >= 2 )
+					{
+						if ( sampleSupport[l] > maxSampleSupport )
+						{
+							maxSampleSupport = sampleSupport[l] ;
+							maxtag = l ;
+							maxCnt = 1 ;
+						}
+						else if ( sampleSupport[l] == maxSampleSupport )
+							++maxCnt ;
+					}
+				}
+
+				if ( maxSampleSupport >= 3 && maxSampleSupport >= ( fraction / 10 ) * sampleCnt && transcripts[maxtag].TPM > 0)
+				{
+					if ( maxCnt > 1 )
+					{
+						int maxTPM = -1 ;
+						for ( l = i ; l < j ; ++l )
+						{
+							if ( sampleSupport[l] != maxSampleSupport )
+								continue ;
+							if ( transcripts[l].TPM > maxTPM )
+							{
+								maxTPM = transcripts[l].TPM ;
+								maxtag = l ;
+							}
+						}
+
+					}
+					transcripts[maxtag].FPKM = 0 ;
+					//fprintf( stderr, "recovered: %d %d\n", sampleSupport[ maxtag ], transcripts[ maxtag ].ecnt ) ;
+					outputTranscripts.push_back( transcripts[maxtag] ) ;
+				}
+			}
+		}
+		else*/
+		{
+			// Emit every transcript of the gene that was not marked as rejected.
+			for ( l = i ; l < j ; ++l )
+			{
+				//if ( transcripts[l].FPKM >= fraction * sampleCnt && transcripts[l].FPKM >= minSampleCnt )
+				if ( transcripts[l].FPKM == -1 )
+					continue ;
+				outputTranscripts.push_back( transcripts[l] ) ;
+			}
+		}
+
+		i = j ;
+	}
+
+	// Output
+	size = outputTranscripts.size() ;
+	//printf( "%d\n", size ) 
; + int transcriptId = 0 ; + int prevGid = -1 ; + for ( i = 0 ; i < size ; ++i ) + { + struct _outputTranscript &t = outputTranscripts[i] ; + const char *chrom = chrIdToName[t.chrId].c_str() ; + /*if ( t.geneId != prevGid ) + transcriptId = 0 ; + else + ++transcriptId ;*/ + transcriptId = outputTranscripts[i].transcriptId ; + fprintf( stdout, "%s\tPsiCLASS\ttranscript\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; transcript_id \"%s%s.%d.%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"%.6lf\"; sample_cnt \"%d\";\n", + chrom, t.exons[0].a, t.exons[t.ecnt - 1].b, t.strand, + prefix, chrom, t.geneId, + prefix, chrom, t.geneId, transcriptId, t.FPKM, t.TPM, t.cov, sampleSupport[i] ) ; + for ( j = 0 ; j < t.ecnt ; ++j ) + { + fprintf( stdout, "%s\tPsiCLASS\texon\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; " + "transcript_id \"%s%s.%d.%d\"; exon_number \"%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"%.6lf\"; sample_cnt \"%d\";\n", + chrom, t.exons[j].a, t.exons[j].b, t.strand, + prefix, chrom, t.geneId, + prefix, chrom, t.geneId, transcriptId, + j + 1, t.FPKM, t.TPM, t.cov, + sampleSupport[i] ) ; + } + + prevGid = t.geneId ; + } + return 0 ; +} diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/alignments.hpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/alignments.hpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,549 @@ +// The class handles reading the bam file + +#ifndef _LSONG_RSCAF_ALIGNMENT_HEADER +#define _LSONG_RSCAF_ALIGNMENT_HEADER + +#include "samtools-0.1.19/sam.h" +#include +#include +#include +#include +#include +#include + +#include "defs.h" + +class Alignments +{ +private: + samfile_t *fpSam ; + bam1_t *b ; + + char fileName[1024] ; + bool opened ; + std::map chrNameToId ; + bool allowSupplementary ; + bool allowClip ; + bool hasClipHead, hasClipTail ; + int segmentsSum ; // the sum of segments. 
+
+	bool atBegin ;
+	bool atEnd ;
+
+	// qsort comparator for ints, ascending. Uses comparisons rather than subtraction so that
+	// widely separated values cannot overflow.
+	static int CompInt( const void *p1, const void *p2 )
+	{
+		const int x = *(const int *)p1 ;
+		const int y = *(const int *)p2 ;
+		return ( x > y ) - ( x < y ) ;
+	}
+
+	// Open fileName and index the reference names of its header. Exits when the file or its
+	// header cannot be read.
+	void Open()
+	{
+		fpSam = samopen( fileName, "rb", 0 ) ;
+		// samopen's result is checked before its header is dereferenced.
+		if ( fpSam == NULL || !fpSam->header )
+		{
+			fprintf( stderr, "Can not open %s.\n", fileName ) ;
+			exit( 1 ) ;
+		}
+
+		// Collect the chromosome information
+		for ( int i = 0 ; i < fpSam->header->n_targets ; ++i )
+		{
+			std::string s( fpSam->header->target_name[i] ) ;
+			chrNameToId[s] = i ;
+		}
+		opened = true ;
+		atBegin = true ;
+		atEnd = false ;
+	}
+public:
+	struct _pair segments[MAX_SEG_COUNT] ;
+	int segCnt ;
+
+	int totalReadCnt ;
+	int fragLen, fragStdev ;
+	int readLen ;
+	bool matePaired ;
+
+	Alignments()
+	{
+		// Members read by Close() and the accessors are given defined values up front.
+		fpSam = NULL ;
+		b = NULL ;
+		fileName[0] = '\0' ;
+		opened = false ;
+		atBegin = true ;
+		atEnd = false ;
+		allowSupplementary = false ;
+		allowClip = true ;
+		hasClipHead = hasClipTail = false ;
+		segmentsSum = 0 ;
+		segCnt = 0 ;
+
+		totalReadCnt = 0 ;
+		fragLen = 0 ;
+		fragStdev = 0 ;
+		readLen = 0 ;
+		matePaired = false ;
+	}
+	~Alignments()
+	{
+		if ( b )
+			bam_destroy1( b ) ;
+	}
+
+	// Remember the path (truncated to fit fileName) and open it.
+	void Open( char *file )
+	{
+		strncpy( fileName, file, sizeof( fileName ) - 1 ) ;
+		fileName[ sizeof( fileName ) - 1 ] = '\0' ;
+		Open() ;
+	}
+
+	// Re-open the file so that the next call to Next() starts from the first record.
+	void Rewind()
+	{
+		Close() ;
+		Open() ;
+
+		atBegin = true ;
+		atEnd = false ;
+	}
+
+	void Close()
+	{
+		// Safe to call when nothing is open.
+		if ( fpSam != NULL )
+			samclose( fpSam ) ;
+		fpSam = NULL ;
+	}
+
+	bool IsOpened()
+	{
+		return opened ;
+	}
+
+	bool IsAtBegin()
+	{
+		return atBegin ;
+	}
+
+	bool IsAtEnd()
+	{
+		return atEnd ;
+	}
+
+	bool HasClip()
+	{
+		return hasClipHead || hasClipTail ;
+	}
+
+	bool HasClipHead()
+	{
+		return hasClipHead ;
+	}
+	bool HasClipTail()
+	{
+		return hasClipTail ;
+	}
+
+	// Advance to the next usable alignment and fill segments/segCnt from its CIGAR.
+	// Returns 1 on success and 0 at the end of the file.
+	int Next()
+	{
+		int i ;
+		int start = 0, len = 0 ;
+		uint32_t *rawCigar ;
+
+		if ( atBegin == true )
+			totalReadCnt = 0 ;
+
+		atBegin = false ;
+		while ( 1 )
+		{
+			while ( 1 )
+			{
+				if ( b )
+					bam_destroy1( b ) ;
+				b = bam_init1() ;
+
+				if ( samread( fpSam, b ) <= 0 )
+				{
+					atEnd = true ;
+					return 0 ;
+				}
+
+				// Records with neither flag bit of 0x900 set are counted as reads.
+				if ( ( b->core.flag & 0x900 ) == 0 )
+					++totalReadCnt ;
+
+				// Skip records with either flag bit of 0xC set.
+				if ( b->core.flag & 0xC )
+					continue ;
+
+				//if ( ( b->core.flag & 0x900 ) == 0 )
+					break ;
+			}
+ + // to many repeat. + if ( bam_aux_get( b, "NH" ) ) + { + if ( bam_aux2i( bam_aux_get( b, "NH" ) ) >= 5 ) + continue ; + } + + // Compute the exons segments from the reads + segCnt = 0 ; + start = b->core.pos ; //+ 1 ; + rawCigar = bam1_cigar( b ) ; + // Check whether the query length is compatible with the read + if ( bam_cigar2qlen( &b->core, rawCigar ) != b->core.l_qseq ) + continue ; + + bool clipMiddle = false ; + int clipSum = 0 ; + hasClipHead = hasClipTail = false ; + len = 0 ; + segmentsSum = 0 ; + for ( i = 0 ; i < b->core.n_cigar ; ++i ) + { + int op = rawCigar[i] & BAM_CIGAR_MASK ; + int num = rawCigar[i] >> BAM_CIGAR_SHIFT ; + + switch ( op ) + { + case BAM_CMATCH: + case BAM_CDEL: + len += num ; break ; + case BAM_CSOFT_CLIP: + case BAM_CHARD_CLIP: + case BAM_CPAD: + { + if ( i == 0 ) + hasClipHead = true ; + else if ( i == b->core.n_cigar - 1 ) + hasClipTail = true ; + else + clipMiddle = true ; + + clipSum += num ; + } + case BAM_CINS: + num = 0 ; break ; + case BAM_CREF_SKIP: + { + segments[ segCnt ].a = start ; + segments[ segCnt ].b = start + len - 1 ; + ++segCnt ; + segmentsSum += len ; + start = start + len + num ; + len = 0 ; + } break ; + default: + len += num ; break ; + } + } + if ( clipMiddle ) // should never happend + continue ; + + if ( clipSum >= 2 && !allowClip ) + continue ; + + if ( len > 0 ) + { + segments[ segCnt ].a = start ; + segments[ segCnt ].b = start + len - 1 ; + ++segCnt ; + segmentsSum += len ; + } + /*if ( !strcmp( bam1_qname( b ), "chr1:109656301-109749401W:ENST00000490758.2:381:1480:1090:1290:X" ) ) + { + for ( i = 0 ; i < segCnt ; ++i ) + printf( "(%d %d) ", segments[i].a, segments[i].b ) ; + printf( "\n" ) ; + }*/ + + // Check whether the mates are compatible + //int mChrId = b->core.mtid ; + int64_t mPos = b->core.mpos ; + + if ( b->core.mtid == b->core.tid ) + { + for ( i = 0 ; i < segCnt - 1 ; ++i ) + { + if ( mPos >= segments[i].b && mPos <= segments[i + 1].a ) + break ; + } + if ( i < segCnt - 1 ) + continue 
;
+			}
+
+			break ;
+		}
+
+		return 1 ;
+	}
+
+
+	int GetChromId()
+	{
+		return b->core.tid ;
+	}
+
+	char* GetChromName( int tid )
+	{
+		return fpSam->header->target_name[ tid ] ;
+	}
+
+	// Id of the reference sequence called s; exits when the name is not in the header.
+	int GetChromIdFromName( const char *s )
+	{
+		std::string ss( s ) ;
+		if ( chrNameToId.find( ss ) == chrNameToId.end() )
+		{
+			printf( "Unknown genome name: %s\n", s ) ;
+			exit( 1 ) ;
+		}
+		return chrNameToId[ss] ;
+	}
+
+	int GetChromLength( int tid )
+	{
+		return fpSam->header->target_len[ tid ] ;
+	}
+
+	int GetChromCount()
+	{
+		return fpSam->header->n_targets ;
+	}
+
+	// Mate coordinates of the current record, or (-1,-1) when flag bit 0x8 is set.
+	void GetMatePosition( int &chrId, int64_t &pos )
+	{
+		if ( b->core.flag & 0x8 )
+		{
+			chrId = -1 ;
+			pos = -1 ;
+		}
+		else
+		{
+			chrId = b->core.mtid ;
+			pos = b->core.mpos ; //+ 1 ;
+		}
+	}
+
+	// Position of the next hit of a multi-mapped read, taken from the CC/CP tags.
+	// Returns 1 and fills chrId/pos on success. Returns 0 with chrId = pos = -1 when either
+	// tag is absent or CC names a sequence that is not in the header.
+	int GetRepeatPosition( int &chrId, int64_t &pos )
+	{
+		// Look at the CC field.
+		if ( !bam_aux_get( b, "CC" ) || !bam_aux_get( b, "CP" ) )
+		{
+			chrId = -1 ;
+			pos = -1 ;
+			return 0 ;
+		}
+
+		std::string s( bam_aux2Z( bam_aux_get(b, "CC" ) ) ) ;
+		if ( s == "=" )
+		{
+			// The SAM specification lets CC be "=" for "same reference as this record".
+			chrId = b->core.tid ;
+		}
+		else if ( chrNameToId.find( s ) == chrNameToId.end() )
+		{
+			// operator[] would silently insert the unknown name and report chromosome 0.
+			chrId = -1 ;
+			pos = -1 ;
+			return 0 ;
+		}
+		else
+			chrId = chrNameToId[ s ] ;
+		pos = bam_aux2i( bam_aux_get( b, "CP" ) ) ;// Possible error for 64bit
+		return 1 ;
+	}
+
+	// Copies the file name into buffer; the caller must provide room for it.
+	void GetFileName( char *buffer )
+	{
+		strcpy( buffer, fileName ) ;
+	}
+
+	int GetReadLength()
+	{
+		return b->core.l_qseq ;
+	}
+
+	int GetRefCoverLength()
+	{
+		return segmentsSum ;
+	}
+
+	bool IsFirstMate()
+	{
+		if ( b->core.flag & 0x40 )
+			return true ;
+		return false ;
+	}
+
+	bool IsReverse()
+	{
+		if ( b->core.flag & 0x10 )
+			return true ;
+		return false ;
+	}
+
+	bool IsMateReverse()
+	{
+		if ( b->core.flag & 0x20 )
+			return true ;
+		return false ;
+	}
+
+	char *GetReadId()
+	{
+		return bam1_qname( b ) ;
+	}
+
+	// False when NH > 1, or when the record is supplementary and carries an XZ tag.
+	bool IsUnique()
+	{
+		if ( bam_aux_get( b, "NH" ) )
+		{
+			if ( bam_aux2i( bam_aux_get( b, "NH" ) ) > 1 )
+				return false ;
+		}
+		if ( IsSupplementary() && GetFieldZ( (char *)"XZ" ) != NULL )
+			return false ;
+		return true ;
+	}
+
+	bool IsPrimary()
+	{
+		if ( ( b->core.flag & 0x900 ) == 0 )
+			return true ;
+		else
+			return false ;
+	}
+
+	
// -1:minus, 0: unknown, 1:plus + int GetStrand() + { + if ( segCnt == 1 ) + return 0 ; + if ( bam_aux_get( b, "XS" ) ) + { + if ( bam_aux2A( bam_aux_get( b, "XS" ) ) == '-' ) + return -1 ; + else + return 1 ; + } + else + return 0 ; + } + + int GetFieldI( char *f ) + { + if ( bam_aux_get( b, f ) ) + { + return bam_aux2i( bam_aux_get( b, f ) ) ; + } + return -1 ; + } + + char *GetFieldZ( char *f ) + { + if ( bam_aux_get( b, f ) ) + { + return bam_aux2Z( bam_aux_get( b, f ) ) ; + } + return NULL ; + } + + int GetNumberOfHits() + { + if ( bam_aux_get( b, "NH" ) ) + { + return bam_aux2i( bam_aux_get( b, "NH" ) ) ; + } + return 1 ; + } + + bool IsSupplementary() + { + if ( ( b->core.flag & 0x800 ) == 0 ) + return false ; + else + return true ; + } + + void SetAllowSupplementary( bool in ) + { + allowSupplementary = in ; + } + + void SetAllowClip( bool in ) + { + allowClip = in ; + } + + bool IsGCRich( bool threshold = 0.9 ) + { + int i = 0 ; + int gcCnt = 0 ; + for ( i = 0 ; i < b->core.l_qseq ; ++i ) + { + int bit = bam1_seqi( bam1_seq( b ), i ) ; + if ( bit == 2 || bit == 4 ) + ++gcCnt ; + } + if ( gcCnt >= threshold * b->core.l_qseq ) + return true ; + } + + void GetGeneralInfo( bool stopEarly = false ) + { + int i, k ; + + const int sampleMax = 1000000 ; + int *lens = new int[sampleMax] ; + int *mateDiff = new int[sampleMax] ; + int lensCnt = 0 ; + int mateDiffCnt = 0 ; + bool end = false ; + + while ( 1 ) + { + while ( 1 ) + { + if ( b ) + bam_destroy1( b ) ; + b = bam_init1() ; + + if ( samread( fpSam, b ) <= 0 ) + { + end = true ; + break ; + } + if ( b->core.flag & 0xC ) + continue ; + + if ( ( b->core.flag & 0x900 ) == 0 ) + break ; + } + if ( end ) + break ; + + if ( lensCnt < sampleMax ) + { + lens[ lensCnt ] = b->core.l_qseq ; + ++lensCnt ; + } + + if ( mateDiffCnt < sampleMax && b->core.tid == b->core.mtid + && b->core.pos < b->core.mpos ) + { + mateDiff[ mateDiffCnt ] = b->core.mpos - b->core.pos ; + ++mateDiffCnt ; + } + ++totalReadCnt ; + if ( 
totalReadCnt >= sampleMax && stopEarly ) + break ; + } + + // Get the read length info and fragment length info + qsort( lens, lensCnt, sizeof( int ), CompInt ) ; + readLen = lens[ lensCnt - 1 ] ; + + if ( mateDiffCnt > 0 ) + { + matePaired = true ; + + qsort( mateDiff, mateDiffCnt, sizeof( int ), CompInt ) ; + long long int sum = 0 ; + long long int sumsq = 0 ; + + for ( i = 0 ; i < mateDiffCnt * 0.7 ; ++i ) + { + sum += ( mateDiff[i] + readLen ); + sumsq += ( mateDiff[i] + readLen ) * ( mateDiff[i] + readLen ) ; + } + k = i ; + fragLen = (int)( sum / k ) ; + fragStdev = (int)sqrt( sumsq / k - fragLen * fragLen ) ; + } + else + { + fragLen = readLen ; + fragStdev = 0 ; + } + //printf( "readLen = %d\nfragLen = %d, fragStdev = %d\n", readLen, fragLen, fragStdev ) ; + delete[] lens ; + delete[] mateDiff ; + } +} ; +#endif diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/blocks.hpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/blocks.hpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,1580 @@ +// The class manage the blocks +// Li Song + +#ifndef _LSONG_CLASSES_BLOCKS_HEADER +#define _LSONG_CLASSES_BLOCKS_HEADER + +#include +#include +#include +#include +#include +#include +#include + +#include "defs.h" + +extern bool VERBOSE ; +extern FILE *fpOut ; + +extern int gMinDepth ; + + +struct _splitSite // means the next position belongs to another block +{ + int64_t pos ; + char strand ; + int chrId ; + int type ; // 1-start of an exon. 2-end of an exon. + + int support ; + int uniqSupport ; + + int mismatchSum ; + + int64_t oppositePos ; // the position of the other sites to form the intron. +} ; + +struct _block +{ + int chrId ; + int contigId ; + int64_t start, end ; + int64_t leftSplice, rightSplice ; // Some information about the left splice site and right splice site. + // record the leftmost and rightmost coordinates of a splice site within this block + //or the length of read alignment support the splice sites. + //Or the number of support. 
+ int64_t depthSum ; + + int leftType, rightType ; //0-soft boundary, 1-start of an exon, 2-end of an exon. + + double leftRatio, rightRatio ; // the adjusted ratio-like value to the left and right anchor subexons. + double ratio ; + + char leftStrand, rightStrand ; + + int *depth ; + int prevCnt, nextCnt ; // Some connection information for the subexons. + int *prev ; + int *next ; +} ; + +// adjacent graph +struct _adj +{ + int ind ; + int info ; + int support ; + int next ; +} ; + +class Blocks +{ + private: + std::map exonBlocksChrIdOffset ; + + int64_t Overlap( int64_t s0, int64_t e0, int64_t s1, int64_t e1, int64_t &s, int64_t &e ) + { + s = e = -1 ; + if ( e0 < s1 || s0 > e1 ) + return 0 ; + s = s0 > s1 ? s0 : s1 ; + e = e0 < e1 ? e0 : e1 ; + return e - s + 1 ; + } + + void Split( const char *s, char delimit, std::vector &fields ) + { + int i ; + fields.clear() ; + if ( s == NULL ) + return ; + + std::string f ; + for ( i = 0 ; s[i] ; ++i ) + { + if ( s[i] == delimit || s[i] == '\n' ) + { + fields.push_back( f ) ; + f.clear() ; + } + else + f.append( 1, s[i] ) ; + } + fields.push_back( f ) ; + f.clear() ; + } + + void BuildBlockChrIdOffset() + { + // Build the map for the offsets of chr id in the exonBlock list. 
+ exonBlocksChrIdOffset[ exonBlocks[0].chrId] = 0 ; + int cnt = exonBlocks.size() ; + for ( int i = 1 ; i < cnt ; ++i ) + { + if ( exonBlocks[i].chrId != exonBlocks[i - 1].chrId ) + exonBlocksChrIdOffset[ exonBlocks[i].chrId ] = i ; + } + } + + bool CanReach( int from, int to, struct _adj *adj, bool *visited ) + { + if ( visited[from] ) + return false ; + visited[from] = true ; + int p ; + p = adj[from].next ; + while ( p != -1 ) + { + if ( adj[p].ind == to ) + return true ; + if ( adj[p].ind < to + && CanReach( adj[p].ind, to, adj, visited ) ) + return true ; + p = adj[p].next ; + } + return false ; + } + + void AdjustAndCreateExonBlocks( int tag, std::vector &newExonBlocks ) + { + int i, j ; + if ( exonBlocks[tag].depth != NULL ) + { + // Convert the increment and decrement into actual depth. + int len = exonBlocks[tag].end - exonBlocks[tag].start + 1 ; + int *depth = exonBlocks[tag].depth ; + for ( i = 1 ; i < len ; ++i ) + depth[i] = depth[i - 1] + depth[i] ; + + struct _block island ; // the portion created by the hollow. + island.start = island.end = -1 ; + island.depthSum = 0 ; + + /*if ( exonBlocks[tag].start == 1562344 ) + { + for ( i = 0 ; i < len ; ++i ) + printf( "%d\n", depth[i] ) ; + }*/ + // Adjust boundary accordingly. + int64_t adjustStart = exonBlocks[tag].start ; + int64_t adjustEnd = exonBlocks[tag].end ; + + if ( exonBlocks[tag].leftType == 0 && exonBlocks[tag].rightType != 0 ) + { + for ( i = len - 1 ; i >= 0 ; --i ) + if ( depth[i] < gMinDepth ) + break ; + ++i ; + if ( exonBlocks[tag].rightType == 2 && i + exonBlocks[tag].start > exonBlocks[tag].rightSplice ) + i = exonBlocks[tag].rightSplice - exonBlocks[tag].start ; + adjustStart = i + exonBlocks[tag].start ; + + if ( adjustStart > exonBlocks[tag].start ) + { + int s, e ; + // firstly [s,e] is the range of depth array. 
+ for ( s = 0 ; s = gMinDepth ) + break ; + for ( e = i - 1 ; e >= s ; -- e ) + if ( depth[e] >= gMinDepth ) + break ; + if ( e >= s ) + { + island = exonBlocks[tag] ; + island.depthSum = 0 ; + island.leftType = island.rightType = 0 ; + for ( j = s ; j <= e ; ++j ) + island.depthSum += depth[j] ; + island.start = s + exonBlocks[tag].start ; // offset the coordinate. + island.end = e + exonBlocks[tag].start ; + } + } + } + else if ( exonBlocks[tag].leftType != 0 && exonBlocks[tag].rightType == 0 ) + { + for ( i = 0 ; i < len ; ++i ) + if ( depth[i] < gMinDepth ) + break ; + --i ; + if ( exonBlocks[tag].leftType == 1 && i + exonBlocks[tag].start < exonBlocks[tag].leftSplice ) + i = exonBlocks[tag].leftSplice - exonBlocks[tag].start ; + adjustEnd = i + exonBlocks[tag].start ; + + if ( adjustEnd < exonBlocks[tag].end ) + { + int s, e ; + // firstly [s,e] is the range of depth array. + for ( s = i + 1 ; s < len ; ++s ) + if ( depth[s] >= gMinDepth ) + break ; + for ( e = len - 1 ; e >= s ; --e ) + if ( depth[e] >= gMinDepth ) + break ; + if ( e >= s ) + { + island = exonBlocks[tag] ; + island.depthSum = 0 ; + island.leftType = island.rightType = 0 ; + for ( j = s ; j <= e ; ++j ) + island.depthSum += depth[j] ; + island.start = s + exonBlocks[tag].start ; // offset the coordinate. + island.end = e + exonBlocks[tag].start ; + } + } + } + else if ( exonBlocks[tag].leftType == 0 && exonBlocks[tag].rightType == 0 ) + { + for ( i = 0 ; i < len ; ++i ) + if ( depth[i] >= gMinDepth ) + break ; + adjustStart = i + exonBlocks[tag].start ; + + for ( i = len - 1 ; i >= 0 ; --i ) + if ( depth[i] >= gMinDepth ) + break ; + adjustEnd = i + exonBlocks[tag].start ; + } + else if ( exonBlocks[tag].leftType == 1 && exonBlocks[tag].rightType == 2 + && exonBlocks[tag].end - exonBlocks[tag].start + 1 >= 1000 ) + { + // The possible merge of two genes or merge of UTRs, if we don't break the low coverage part. 
+ // If we decide to cut, I'll reuse the variable "island" to represent the subexon on right hand side. + int l ; + int gapSize = 30 ; + for ( i = 0 ; i <= len - ( gapSize + 1 ) ; ++i ) + { + if ( depth[i] < gMinDepth ) + { + for ( l = i ; l = gMinDepth ) + break ; + if ( l = ( gapSize + 1 ) - 1 ; --j ) + { + if ( depth[j] < gMinDepth ) + { + for ( l = j ; l >= j - ( gapSize + 1 ) + 1 ; --l ) + if ( depth[l] >= gMinDepth ) + break ; + + if ( l >= j - ( gapSize + 1 ) + 1 ) + { + j = l + 1 ; + continue ; + } + else + break ; + } + } + + if ( j - i + 1 > gapSize ) + { + // we break. + --i ; ++j ; + + adjustEnd = i + exonBlocks[tag].start ; + + if ( j < len - 1 ) + { + island = exonBlocks[tag] ; + island.depthSum = 0 ; + island.leftType = 0 ; + for ( l = j ; l <= len - 1 ; ++l ) + island.depthSum += depth[j] ; + island.start = j + exonBlocks[tag].start ; // offset the coordinate. + island.end = exonBlocks[tag].end ; + } + + exonBlocks[tag].rightType = 0 ; // put it here, so the island can get the right info + //fprintf( stderr, "break %d %d %d %d.\n", (int)exonBlocks[tag].start, (int)adjustEnd, i + (int)exonBlocks[tag].start, j + (int)exonBlocks[tag].start ) ; + } + } + + int lostDepthSum = 0 ; + for ( i = exonBlocks[tag].start ; i < adjustStart ; ++i ) + lostDepthSum += depth[i - exonBlocks[tag].start ] ; + for ( i = adjustEnd + 1 ; i < exonBlocks[tag].end ; ++i ) + lostDepthSum += depth[i - exonBlocks[tag].start ] ; + exonBlocks[tag].depthSum -= lostDepthSum ; + + delete[] exonBlocks[tag].depth ; + exonBlocks[tag].depth = NULL ; + + //if ( exonBlocks[tag].start == 1562344 ) + // printf( "%d %d\n", adjustStart, adjustEnd ) ; + if ( ( len > 1 && adjustEnd - adjustStart + 1 <= 1 ) || ( adjustEnd - adjustStart + 1 <= 0 ) ) + return ; + exonBlocks[tag].start = adjustStart ; + exonBlocks[tag].end = adjustEnd ; + + if ( island.start != -1 && island.end < exonBlocks[tag].start ) + newExonBlocks.push_back( island ) ; + + newExonBlocks.push_back( exonBlocks[tag] ) ; + if ( 
island.start != -1 && island.start > exonBlocks[tag].end )
+					newExonBlocks.push_back( island ) ;
+			}
+		}
+	public:
+		std::vector exonBlocks ;
+
+		Blocks() { }
+		// Frees the prev/next arrays of every block.
+		~Blocks()
+		{
+			const int blockCnt = exonBlocks.size() ;
+			for ( int idx = 0 ; idx < blockCnt ; ++idx )
+			{
+				struct _block &blk = exonBlocks[idx] ;
+				if ( blk.next != NULL )
+					delete[] blk.next ;
+				if ( blk.prev != NULL )
+					delete[] blk.prev ;
+			}
+		}
+
+		// Average depth of a block: its depth sum divided by its length.
+		double GetAvgDepth( const struct _block &block )
+		{
+			const double blockLen = (double)( block.end - block.start + 1 ) ;
+			return block.depthSum / blockLen ;
+		}
+
+		int BuildExonBlocks( Alignments &alignments )
+		{
+			int tag = 0 ;
+			while ( alignments.Next() )
+			{
+				int i, j, k ;
+				int segCnt = alignments.segCnt ;
+				struct _pair *segments = alignments.segments ;
+				int eid = 0 ; // the exonblock id that the segment update
+
+				// locate the first exonblock that beyond the start of the read.
+				while ( tag < (int)exonBlocks.size() && ( exonBlocks[tag].end < segments[0].a - 1
+							|| exonBlocks[tag].chrId != alignments.GetChromId() ) )
+				{
+					++tag ;
+				}
+				// due to the merge of exon blocks, we might need roll back
+				--tag ;
+				while ( tag >= 0 && ( exonBlocks[tag].end >= segments[0].a - 1
+							&& exonBlocks[tag].chrId == alignments.GetChromId() ) )
+				{
+					--tag ;
+				}
+				++tag ;
+
+				for ( i = 0 ; i < segCnt ; ++i )
+				{
+					//if ( i == 0 )
+					//	printf( "hi %d %s %d %d\n", i, alignments.GetReadId(), segments[i].a, segments[i].b ) ;
+					for ( j = tag ; j < (int)exonBlocks.size() ; ++j )
+					{
+						if ( exonBlocks[j].end >= segments[i].a - 1 )
+							break ;
+					}
+					if ( j >= (int)exonBlocks.size() )
+					{
+						// Append a new block
+						struct _block newSeg ;
+						newSeg.chrId = alignments.GetChromId() ;
+						newSeg.start = segments[i].a ;
+						newSeg.end = segments[i].b ;
+
+						newSeg.leftSplice = -1 ;
+						newSeg.rightSplice = -1 ;
+						if ( i > 0 )
+							newSeg.leftSplice = segments[i].a ;
+						if ( i < segCnt - 1 )
+							newSeg.rightSplice = segments[i].b ;
+
+						exonBlocks.push_back( newSeg ) ;
+
+						eid = exonBlocks.size() - 1 ;
+					}
+					else if ( 
exonBlocks[j].end < segments[i].b || + ( exonBlocks[j].start > segments[i].a && exonBlocks[j].start <= segments[i].b + 1 ) ) + { + // If overlaps with a current exon block, so we extend it + if ( exonBlocks[j].end < segments[i].b ) + { + // extends toward right + exonBlocks[j].end = segments[i].b ; + if ( i > 0 && ( exonBlocks[j].leftSplice == -1 || segments[i].a < exonBlocks[j].leftSplice ) ) + exonBlocks[j].leftSplice = segments[i].a ; + if ( i < segCnt - 1 && segments[i].b > exonBlocks[j].rightSplice ) + exonBlocks[j].rightSplice = segments[i].b ; + eid = j ; + + // Merge with next few exon blocks + for ( k = j + 1 ; k < (int)exonBlocks.size() ; ++k ) + { + if ( exonBlocks[k].start <= exonBlocks[j].end + 1 ) + { + if ( exonBlocks[k].end > exonBlocks[j].end ) + exonBlocks[j].end = exonBlocks[k].end ; + + if ( exonBlocks[k].leftSplice != -1 && ( exonBlocks[j].leftSplice == -1 || exonBlocks[k].leftSplice < exonBlocks[j].leftSplice ) ) + exonBlocks[j].leftSplice = exonBlocks[k].leftSplice ; + if ( exonBlocks[k].rightSplice != -1 && exonBlocks[k].rightSplice > exonBlocks[j].rightSplice ) + exonBlocks[j].rightSplice = exonBlocks[k].rightSplice ; + + } + else + break ; + } + + if ( k > j + 1 ) + { + // Remove the merged blocks + int a, b ; + for ( a = j + 1, b = k ; b < (int)exonBlocks.size() ; ++a, ++b ) + exonBlocks[a] = exonBlocks[b] ; + for ( a = 0 ; a < k - ( j + 1 ) ; ++a ) + exonBlocks.pop_back() ; + } + } + else if ( exonBlocks[j].start > segments[i].a && exonBlocks[j].start <= segments[i].b + 1 ) + { + // extends toward left + exonBlocks[j].start = segments[i].a ; + if ( i > 0 && ( exonBlocks[j].leftSplice == -1 || segments[i].a < exonBlocks[j].leftSplice ) ) + exonBlocks[j].leftSplice = segments[i].a ; + if ( i < segCnt - 1 && segments[i].b > exonBlocks[j].rightSplice ) + exonBlocks[j].rightSplice = segments[i].b ; + eid = j ; + + // Merge with few previous exon blocks + for ( k = j - 1 ; k >= 0 ; --k ) + { + if ( exonBlocks[k].end >= exonBlocks[k + 1].start 
- 1 ) + { + if ( exonBlocks[k + 1].start < exonBlocks[k].start ) + { + exonBlocks[k].start = exonBlocks[k + 1].start ; + } + + if ( exonBlocks[k].leftSplice != -1 && ( exonBlocks[j].leftSplice == -1 || exonBlocks[k].leftSplice < exonBlocks[j].leftSplice ) ) + exonBlocks[j].leftSplice = exonBlocks[k].leftSplice ; + if ( exonBlocks[k].rightSplice != -1 && exonBlocks[k].rightSplice > exonBlocks[j].rightSplice ) + exonBlocks[j].rightSplice = exonBlocks[k].rightSplice ; + + } + else + break ; + } + + if ( k < j - 1 ) + { + int a, b ; + eid = k + 1 ; + for ( a = k + 2, b = j + 1 ; b < (int)exonBlocks.size() ; ++a, ++b ) + exonBlocks[a] = exonBlocks[b] ; + for ( a = 0 ; a < ( j - 1 ) - k ; ++a ) + exonBlocks.pop_back() ; + + } + } + } + else if ( exonBlocks[j].start > segments[i].b + 1 ) + { + int size = exonBlocks.size() ; + int a ; + // No overlap, insert a new block + struct _block newSeg ; + newSeg.chrId = alignments.GetChromId() ; + newSeg.start = segments[i].a ; + newSeg.end = segments[i].b ; + + newSeg.leftSplice = -1 ; + newSeg.rightSplice = -1 ; + if ( i > 0 ) + newSeg.leftSplice = segments[i].a ; + if ( i < segCnt - 1 ) + newSeg.rightSplice = segments[i].b ; + + // Insert at position j + exonBlocks.push_back( newSeg ) ; + for ( a = size ; a > j ; --a ) + exonBlocks[a] = exonBlocks[a - 1] ; + exonBlocks[a] = newSeg ; + + eid = a ; + } + else + { + // The segment is contained in j + eid = -1 ; + } + + // Merge the block with the mate if the gap is very small + // Note that since reads are already sorted by coordinate, + // the previous exon blocks is built completely. 
+ /*int64_t matePos ; + int mateChrId ; + alignments.GetMatePosition( mateChrId, matePos ) ; + if ( i == 0 && eid > 0 && mateChrId == exonBlocks[ eid ].chrId + && matePos < segments[0].a + && matePos >= exonBlocks[eid - 1].start + && matePos <= exonBlocks[eid - 1].end + && segments[0].a - matePos + 1 <= 500 + && exonBlocks[eid-1].chrId == exonBlocks[eid].chrId + && exonBlocks[eid].start - exonBlocks[eid - 1].end - 1 <= 30 ) + { + printf( "%d: (%d-%d) (%d-%d). %d %d\n", ( segments[0].a + alignments.readLen - 1 ) - matePos + 1, exonBlocks[eid - 1].start, exonBlocks[eid - 1].end, + exonBlocks[eid].start, + exonBlocks[eid].end, eid, exonBlocks.size() ) ; + exonBlocks[eid - 1].end = exonBlocks[eid].end ; + + if ( exonBlocks[eid].leftSplice != -1 && ( exonBlocks[eid - 1].leftSplice == -1 || exonBlocks[eid].leftSplice < exonBlocks[eid - 1].leftSplice ) ) + exonBlocks[eid - 1].leftSplice = exonBlocks[k].leftSplice ; + if ( exonBlocks[eid].rightSplice != -1 && exonBlocks[eid].rightSplice > exonBlocks[eid - 1].rightSplice ) + exonBlocks[eid - 1].rightSplice = exonBlocks[eid].rightSplice ; + + int size = exonBlocks.size() ; + for ( j = eid ; j < size - 1 ; ++j ) + exonBlocks[j] = exonBlocks[j + 1] ; + exonBlocks.pop_back() ; + }*/ + } + } + /*for ( int i = 0 ; i < (int)exonBlocks.size() ; ++i ) + { + printf( "%d %d\n", exonBlocks[i].start, exonBlocks[i].end ) ; + }*/ + + if ( exonBlocks.size() > 0 ) + { + BuildBlockChrIdOffset() ; + int cnt = exonBlocks.size() ; + for ( int i = 0 ; i < cnt ; ++i ) + { + exonBlocks[i].contigId = exonBlocks[i].chrId ; + + exonBlocks[i].leftType = 0 ; + exonBlocks[i].rightType = 0 ; + exonBlocks[i].depth = NULL ; + exonBlocks[i].nextCnt = 0 ; + exonBlocks[i].prevCnt = 0 ; + exonBlocks[i].leftStrand = '.' ; + exonBlocks[i].rightStrand = '.' 
;
+			}
+		}
+		return exonBlocks.size() ;
+	}
+
+	// Remove split sites of a strand that only "accidentally" shows up inside an exon block.
+	//
+	// For every exon block, the sites whose pos lies inside the block are collected and, per
+	// strand, we record the number of sites and the support of the introns leaving the block
+	// on its left (oppositePos < start) and on its right (oppositePos > end).
+	// A strand is treated as noise in the block when
+	//   - it has no support leaving the block on either side,
+	//   - it contributes exactly two sites,
+	//   - the other strand has support leaving the block on both sides, and
+	//   - at least one of its sites has its partner downstream inside the block
+	//     (pos <= oppositePos <= end) with support <= 2.
+	// All sites of that strand in the block are then marked with support = -1.
+	// Finally every site whose support is <= 0 is erased; the order of the remaining
+	// sites is preserved.
+	//
+	// NOTE(review): the single forward pass assumes that both exonBlocks and sites are
+	// sorted by chromosome id and then by coordinate - confirm against the callers.
+	void FilterSplitSitesInRegions( std::vector<struct _splitSite> &sites )
+	{
+		// Index 0 is the minus strand, index 1 the plus strand.
+		const char strandSymbol[2] = { '-', '+' } ;
+		int i, j, k ;
+		int size = sites.size() ;
+		int bsize = exonBlocks.size() ;
+		int tag = 0 ;
+
+		for ( i = 0 ; i < bsize ; ++i )
+		{
+			// Skip the sites located before this block.
+			while ( tag < size && ( sites[tag].chrId < exonBlocks[i].chrId ||
+				( sites[tag].chrId == exonBlocks[i].chrId && sites[tag].pos < exonBlocks[i].start ) ) )
+			{
+				++tag ;
+			}
+			if ( tag >= size )
+				break ;
+
+			// The next site is already beyond this block: the block holds no site.
+			if ( sites[tag].chrId != exonBlocks[i].chrId || sites[tag].pos > exonBlocks[i].end )
+				continue ;
+
+			for ( j = tag + 1 ; j < size ; ++j )
+				if ( sites[j].chrId != exonBlocks[i].chrId || sites[j].pos > exonBlocks[i].end )
+					break ;
+
+			// [tag,j) holds the split sites in this region.
+			int leftStrandSupport[2] = {0, 0} ;
+			int rightStrandSupport[2] = {0, 0} ;
+			int strandCnt[2] = { 0, 0 } ;
+			for ( k = tag ; k < j ; ++k )
+			{
+				int s ;
+				if ( sites[k].strand == strandSymbol[0] )
+					s = 0 ;
+				else if ( sites[k].strand == strandSymbol[1] )
+					s = 1 ;
+				else
+					continue ; // sites without a strand are not counted
+
+				++strandCnt[s] ;
+				if ( sites[k].oppositePos < exonBlocks[i].start )
+					leftStrandSupport[s] += sites[k].support ;
+				else if ( sites[k].oppositePos > exonBlocks[i].end )
+					rightStrandSupport[s] += sites[k].support ;
+			}
+
+			// Test each strand as the candidate "minor" strand. The two cases are mutually
+			// exclusive, because each one requires the other strand to have support.
+			for ( int minor = 0 ; minor < 2 ; ++minor )
+			{
+				const int major = 1 - minor ;
+				if ( leftStrandSupport[minor] != 0 || rightStrandSupport[minor] != 0
+					|| leftStrandSupport[major] == 0 || rightStrandSupport[major] == 0
+					|| strandCnt[minor] != 2 )
+					continue ;
+
+				// check whether a different strand accidentally show up in this region.
+				bool remove = false ;
+				for ( k = tag ; k < j ; ++k )
+				{
+					if ( sites[k].strand == strandSymbol[minor] &&
+						sites[k].oppositePos >= sites[k].pos && sites[k].oppositePos <= exonBlocks[i].end &&
+						sites[k].support <= 2 )
+					{
+						remove = true ;
+						break ;
+					}
+				}
+
+				if ( remove )
+					for ( k = tag ; k < j ; ++k )
+						if ( sites[k].strand == strandSymbol[minor] )
+							sites[k].support = -1 ;
+				break ;
+			}
+
+			tag = j ;
+		}
+
+		// Compact the list in place, dropping every site without positive support.
+		k = 0 ;
+		for ( i = 0 ; i < size ; ++i )
+			if ( sites[i].support > 0 )
+			{
+				sites[k] = sites[i] ;
+				++k ;
+			}
+		sites.resize( k ) ;
+	}
+
+	// Filter the pair of split sites that is likely merge two genes.
+	// We filter the case like [..]------------------------[..]
+	// [..]--[..] [..]-[...]
+	void FilterGeneMergeSplitSites( std::vector &sites )
+	{
+		int i, j, k ;
+		int bsize = exonBlocks.size() ;
+		int ssize = sites.size() ;
+
+		struct _pair32 *siteToBlock = new struct _pair32[ ssize ] ;
+		struct _adj *adj = new struct _adj[ ssize / 2 + bsize ] ;
+		bool *visited = new bool[bsize] ;
+
+		int adjCnt = 0 ;
+		for ( i = 0 ; i < bsize ; ++i )
+		{
+			adj[i].info = 0 ; // in the header, info means the number of next block
+			adj[i].support = 0 ;
+			adj[i].next = -1 ;
+		}
+		adjCnt = i ;
+		memset( siteToBlock, -1, sizeof( struct _pair32 ) * ssize ) ;
+		memset( visited, false, sizeof( bool ) * bsize ) ;
+
+		// Build the graph.
+ k = 0 ; + for ( i = 0 ; i < bsize ; ++i ) + { + for ( ; k < ssize ; ++k ) + { + if ( sites[k].oppositePos < sites[k].pos ) + continue ; + if ( sites[k].chrId < exonBlocks[i].chrId + || ( sites[k].chrId == exonBlocks[i].chrId && sites[k].pos < exonBlocks[i].start ) ) + continue ; + break ; + } + + for ( ; k < ssize ; ++k ) + { + if ( sites[k].chrId > exonBlocks[i].chrId + || sites[k].pos > exonBlocks[i].end ) + break ; + + if ( sites[k].oppositePos <= exonBlocks[i].end ) // ignore self-loop + continue ; + + for ( j = i + 1 ; j < bsize ; ++j ) + if ( sites[k].oppositePos >= exonBlocks[j].start && sites[k].oppositePos <= exonBlocks[j].end ) + break ; + if ( sites[k].oppositePos >= exonBlocks[j].start && sites[k].oppositePos <= exonBlocks[j].end ) + { + int p ; + p = adj[i].next ; + while ( p != -1 ) + { + if ( adj[p].ind == j ) + { + ++adj[p].info ; + adj[p].support += sites[k].uniqSupport ; + break ; + } + p = adj[p].next ; + } + if ( p == -1 ) + { + adj[ adjCnt ].ind = j ; + adj[ adjCnt ].info = 1 ; + adj[ adjCnt ].support = sites[k].uniqSupport ; + adj[ adjCnt ].next = adj[i].next ; + adj[i].next = adjCnt ; + ++adj[i].info ; + ++adjCnt ; + } + + siteToBlock[k].a = i ; + siteToBlock[k].b = j ; + } + } + } + for ( k = 0 ; k < ssize ; ++k ) + { + if ( sites[k].oppositePos - sites[k].pos + 1 < 20000 || sites[k].uniqSupport >= 30 ) + continue ; + + int from = siteToBlock[k].a ; + int to = siteToBlock[k].b ; + if ( to - from - 1 < 2 || adj[from].info <= 1 ) + continue ; + + memset( &visited[from], false, sizeof( bool ) * ( to - from + 1 ) ) ; + + int cnt = 0 ; + int p = adj[from].next ; + int max = -1 ; + int maxP = 0 ; + while ( p != -1 ) + { + if ( adj[p].support >= 10 && adj[p].ind <= to ) + ++cnt ; + if ( adj[p].support > max || ( adj[p].support == max && adj[p].ind == to ) ) + { + max = adj[p].support ; + maxP = p ; + } + + if ( adj[p].ind == to && adj[p].info > 1 ) + { + cnt = 0 ; + break ; + } + p = adj[p].next ; + } + if ( cnt <= 1 ) + continue ; + + if ( max 
!= -1 && adj[maxP].ind == to ) + continue ; + + // No other path can reach "to" + p = adj[from].next ; + while ( p != -1 ) + { + if ( adj[p].ind != to ) + { + //if ( sites[k].pos ==43917158 ) + // printf( "hi %d %d. (%d %d)\n", from, adj[p].ind, exonBlocks[ adj[p].ind ].start, exonBlocks[ adj[p].ind ].end ) ; + if ( CanReach( adj[p].ind, to, adj, visited ) ) + break ; + } + p = adj[p].next ; + } + if ( p != -1 ) + continue ; + + //if ( sites[k].pos == 34073267 ) + // printf( "hi %d %d. (%d %d)\n", from, to, exonBlocks[to].start, exonBlocks[to].end ) ; + + // There are some blocks between that can reach "to" + for ( i = from + 1 ; i < to ; ++i ) + { + p = adj[i].next ; + while ( p != -1 ) + { + if ( adj[p].ind == to && adj[p].support >= 10 ) + break ; + p = adj[p].next ; + } + if ( p != -1 ) + break ; + } + if ( i >= to ) + continue ; + + // Filter the site + //printf( "filtered: %d %d\n", sites[k].pos, sites[k].oppositePos ) ; + sites[k].support = -1 ; + for ( j = k + 1 ; j < ssize ; ++j ) + { + if ( sites[j].pos == sites[k].oppositePos && sites[j].oppositePos == sites[k].pos ) + sites[j].support = -1 ; + if ( sites[j].pos > sites[k].oppositePos ) + break ; + } + } + + k = 0 ; + for ( i = 0 ; i < ssize ; ++i ) + if ( sites[i].support > 0 ) + { + sites[k] = sites[i] ; + ++k ; + } + sites.resize( k ) ; + + // Filter the intron on the different strand of a gene. 
+ ssize = k ; + k = 0 ; + for ( i = 0 ; i < bsize ; ) + { + int farthest = exonBlocks[i].end ; + + for ( j = i ; j < bsize ; ++j ) + { + if ( exonBlocks[j].start > farthest || exonBlocks[i].chrId != exonBlocks[j].chrId ) + break ; + int p = adj[i].next ; + while ( p != -1 ) + { + if ( exonBlocks[ adj[p].ind ].end > farthest ) + farthest = exonBlocks[ adj[p].ind ].end ; + p = adj[p].next ; + } + } + + for ( ; k < ssize ; ++k ) + { + if ( sites[k].chrId < exonBlocks[i].chrId || + ( sites[k].chrId == exonBlocks[i].chrId && sites[k].pos < exonBlocks[i].start ) ) + continue ; + break ; + } + + if ( sites[k].chrId > exonBlocks[i].chrId || sites[k].pos > farthest ) + { + i = j ; + continue ; + } + //printf( "%d %d. %d %d. %d %d\n", i, j, exonBlocks[i].start, farthest, k, sites[k].pos ) ; + + + int from = k ; + int to ; + for ( to = k ; to < ssize ; ++to ) + if ( sites[to].chrId != exonBlocks[i].chrId || sites[to].pos > farthest ) + break ; + + int strandSupport[2] = {0, 0}; + int strandCount[2] = {0, 0}; + for ( k = from ; k < to ; ++k ) + { + if ( sites[k].oppositePos <= sites[k].pos ) + continue ; + + int tag = ( sites[k].strand == '+' ? 1 : 0 ) ; + strandSupport[tag] += sites[k].support ; + ++strandCount[tag] ; + } + + // Not mixtured. 
+ if ( strandCount[0] * strandCount[1] == 0 ) + { + i = j ; + k = to ; + continue ; + } + + char removeStrand = 0 ; + if ( strandCount[0] == 1 && strandCount[1] >= 3 + && strandSupport[1] >= 20 * strandSupport[0] && strandSupport[0] <= 5 ) + { + removeStrand = '-' ; + } + else if ( strandCount[1] == 1 && strandCount[0] >= 3 + && strandSupport[0] >= 20 * strandSupport[1] && strandSupport[1] <= 5 ) + { + removeStrand = '+' ; + } + + if ( removeStrand == 0 ) + { + i = j ; k = to ; + continue ; + } + + for ( k = from ; k < to ; ++k ) + { + if ( sites[k].strand == removeStrand && sites[k].oppositePos >= exonBlocks[i].start && sites[k].oppositePos <= farthest ) + { + //printf( "filtered: %d %d\n", sites[k].pos, sites[k].oppositePos ) ; + sites[k].support = -1 ; + } + } + + i = j ; + k = to ; + } + + k = 0 ; + for ( i = 0 ; i < ssize ; ++i ) + if ( sites[i].support > 0 ) + { + sites[k] = sites[i] ; + ++k ; + } + sites.resize( k ) ; + + delete[] visited ; + delete[] siteToBlock ; + delete[] adj ; + } + + void SplitBlocks( Alignments &alignments, std::vector< struct _splitSite > &splitSites ) + { + std::vector rawExonBlocks = exonBlocks ; + int i, j ; + int tag = 0 ; + int bsize = rawExonBlocks.size() ; + int ssize = splitSites.size() ; + + // Make sure not overflow + struct _splitSite tmp ; + tmp.pos = -1 ; + splitSites.push_back( tmp ) ; + + exonBlocks.clear() ; + for ( i = 0 ; i < bsize ; ++i ) + { + while ( tag < ssize && ( splitSites[tag].chrId < rawExonBlocks[i].chrId || + ( splitSites[tag].chrId == rawExonBlocks[i].chrId && splitSites[tag].pos < rawExonBlocks[i].start ) ) ) + ++tag ; + + int l ; + for ( l = tag ; l < ssize ; ++l ) + if ( splitSites[l].chrId != rawExonBlocks[i].chrId || splitSites[l].pos > rawExonBlocks[i].end ) + break ; + int64_t start = rawExonBlocks[i].start ; + int64_t end = rawExonBlocks[i].end ; + //printf( "%lld %lld\n", start, end ) ; + //printf( "%s %s: %d %d\n", alignments.GetChromName( splitSites[tag].chrId ), alignments.GetChromName( 
rawExonBlocks[i].chrId ), tag, l ) ; + for ( j = tag ; j <= l ; ++j ) + { + int leftType = 0 ; + int rightType = 0 ; + if ( j > tag ) + { + start = splitSites[j - 1].pos ; // end is from previous stage + leftType = splitSites[j - 1].type ; + } + else + start = rawExonBlocks[i].start ; + + if ( j <= l - 1 ) + { + end = splitSites[j].pos ; + rightType = splitSites[j].type ; + } + else + end = rawExonBlocks[i].end ; + + if ( leftType == 2 ) + ++start ; + if ( rightType == 1 ) + --end ; + + struct _block tmpB ; + tmpB = rawExonBlocks[i] ; + tmpB.start = start ; + tmpB.end = end ; + tmpB.depthSum = 0 ; + tmpB.ratio = 0 ; + //printf( "\t%lld %lld\n", start, end ) ; + tmpB.leftType = leftType ; + tmpB.rightType = rightType ; + tmpB.depth = NULL ; + + exonBlocks.push_back( tmpB ) ; + // handle the soft boundary is the same as the hard boundary case + // or adjacent splice sites + /*if ( j == tag && start == end ) + { + exonBlocks.pop_back() ; + --end ; + } + else if ( j == l && start > end ) + { + exonBlocks.pop_back() ; + }*/ + if ( start > end + //|| ( tmpB.start == rawExonBlocks[i].start && tmpB.end != rawExonBlocks[i].end && tmpB.end - tmpB.start + 1 <= 7 ) + //|| ( tmpB.end == rawExonBlocks[i].end && tmpB.start != rawExonBlocks[i].start && tmpB.end - tmpB.start + 1 <= 7 )) // the case of too short overhang + || ( tmpB.leftType == 0 && tmpB.rightType == 2 && tmpB.end - tmpB.start + 1 <= 9 ) + || ( tmpB.leftType == 1 && tmpB.rightType == 0 && tmpB.end - tmpB.start + 1 <= 9 ) ) + { + exonBlocks.pop_back() ; + } + /*else if ( start == end ) + { + + }*/ + } + } + BuildBlockChrIdOffset() ; + } + + void ComputeDepth( Alignments &alignments ) + { + // Go through the alignment list again to fill in the depthSum; + int i ; + int j ; + int tag = 0 ; + int blockCnt = exonBlocks.size() ; + + std::vector newExonBlocks ; + while ( alignments.Next() ) + { + int segCnt = alignments.segCnt ; + struct _pair *segments = alignments.segments ; + + while ( tag < blockCnt && ( 
exonBlocks[tag].chrId < alignments.GetChromId() || + ( exonBlocks[tag].chrId == alignments.GetChromId() && exonBlocks[tag].end < segments[0].a ) ) ) + { + AdjustAndCreateExonBlocks( tag, newExonBlocks ) ; + ++tag ; + } + for ( i = 0 ; i < segCnt ; ++i ) + { + for ( j = tag ; j < blockCnt && ( exonBlocks[j].chrId == alignments.GetChromId() && exonBlocks[j].start <= segments[i].b ) ; ++j ) + { + int64_t s = -1, e = -1 ; + exonBlocks[j].depthSum += Overlap( segments[i].a, segments[i].b, exonBlocks[j].start, exonBlocks[j].end, s, e ) ; + if ( s == -1 ) + continue ; + + if ( exonBlocks[j].depth == NULL ) + { + int len = exonBlocks[j].end - exonBlocks[j].start + 1 ; + exonBlocks[j].depth = new int[len + 1] ; + memset( exonBlocks[j].depth, 0, sizeof( int ) * ( len + 1 ) ) ; + exonBlocks[j].leftSplice = exonBlocks[j].start ; + exonBlocks[j].rightSplice = exonBlocks[j].end ; + } + int *depth = exonBlocks[j].depth ; + ++depth[s - exonBlocks[j].start] ; + --depth[e - exonBlocks[j].start + 1] ; // notice the +1 here, since the decrease of coverage actually happens at next base. + + // record the longest alignment stretch support the splice site. + if ( exonBlocks[j].leftType == 1 && segments[i].a == exonBlocks[j].start + && e > exonBlocks[j].leftSplice ) + { + exonBlocks[j].leftSplice = e ; + } + if ( exonBlocks[j].rightType == 2 && segments[i].b == exonBlocks[j].end + && s < exonBlocks[j].rightSplice ) + { + exonBlocks[j].rightSplice = s ; + } + } + } + } + + for ( ; tag < blockCnt ; ++tag ) + AdjustAndCreateExonBlocks( tag, newExonBlocks ) ; + exonBlocks.clear() ; + + // Due to multi-alignment, we may skip some alignments that determines leftSplice and + // rightSplice, hence filtered some subexons. As a result, some subexons' anchor subexon will disapper + // and we need to change its boundary type. 
+ blockCnt = newExonBlocks.size() ; + for ( i = 0 ; i < blockCnt ; ++i ) + { + if ( ( i == 0 && newExonBlocks[i].leftType == 2 ) || + ( i > 0 && newExonBlocks[i].leftType == 2 && newExonBlocks[i - 1].end + 1 != newExonBlocks[i].start ) ) + { + newExonBlocks[i].leftType = 0 ; + } + + if ( ( i == blockCnt - 1 && newExonBlocks[i].rightType == 1 ) || + ( i < blockCnt - 1 && newExonBlocks[i].rightType == 1 && + newExonBlocks[i].end + 1 != newExonBlocks[i + 1].start ) ) + newExonBlocks[i].rightType = 0 ; + } + exonBlocks = newExonBlocks ; + } + + // If two blocks whose soft boundary are close to each other, we can merge them. + void MergeNearBlocks() + { + std::vector rawExonBlocks = exonBlocks ; + int i, k ; + int bsize = rawExonBlocks.size() ; + int *newIdx = new int[bsize] ; // used for adjust prev and next. Note that by merging, it won't change the number of prevCnt or nextCnt. + + exonBlocks.clear() ; + exonBlocks.push_back( rawExonBlocks[0] ) ; + k = 0 ; + newIdx[0] = 0 ; + for ( i = 1 ; i < bsize ; ++i ) + { + if ( rawExonBlocks[i].chrId == exonBlocks[k].chrId + && rawExonBlocks[i].leftType == 0 && exonBlocks[k].rightType == 0 + && rawExonBlocks[i].start - exonBlocks[k].end - 1 <= 50 + && rawExonBlocks[i].depthSum / double( rawExonBlocks[i].end - rawExonBlocks[i].start + 1 ) < 3.0 + && exonBlocks[k].depthSum / double( exonBlocks[k].end - exonBlocks[i].start + 1 ) < 3.0 ) + { + exonBlocks[k].end = rawExonBlocks[i].end ; + exonBlocks[k].rightType = rawExonBlocks[i].rightType ; + exonBlocks[k].rightStrand = rawExonBlocks[i].rightStrand ; + exonBlocks[k].depthSum += rawExonBlocks[i].depthSum ; + if ( rawExonBlocks[i].rightSplice != -1 ) + exonBlocks[k].rightSplice = rawExonBlocks[i].rightSplice ; + + if ( exonBlocks[k].nextCnt > 0 ) + delete[] exonBlocks[k].next ; + exonBlocks[k].nextCnt = rawExonBlocks[i].nextCnt ; + exonBlocks[k].next = rawExonBlocks[i].next ; + } + else + { + exonBlocks.push_back( rawExonBlocks[i] ) ; + ++k ; + } + newIdx[i] = k ; + } + 
BuildBlockChrIdOffset() ; + bsize = exonBlocks.size() ; + for ( i = 0 ; i < bsize ; ++i ) + { + int cnt = exonBlocks[i].prevCnt ; + for ( k = 0 ; k < cnt ; ++k ) + exonBlocks[i].prev[k] = newIdx[ exonBlocks[i].prev[k] ] ; + + cnt = exonBlocks[i].nextCnt ; + for ( k = 0 ; k < cnt ; ++k ) + exonBlocks[i].next[k] = newIdx[ exonBlocks[i].next[k] ] ; + } + delete[] newIdx ; + } + + void AddIntronInformation( std::vector &sites, Alignments &alignments ) + { + // Add the connection information, support information and the strand information. + int i, j, k, tag ; + tag = 0 ; + int scnt = sites.size() ; + int exonBlockCnt = exonBlocks.size() ; + bool flag ; + + for ( i = 0 ; i < exonBlockCnt ; ++i ) + { + exonBlocks[i].prevCnt = exonBlocks[i].nextCnt = 0 ; + exonBlocks[i].prev = exonBlocks[i].next = NULL ; + exonBlocks[i].leftStrand = exonBlocks[i].rightStrand = '.' ; + exonBlocks[i].leftSplice = exonBlocks[i].rightSplice = 0 ; + } + + // Now, we add the region id and compute the length of each region, here the region does not contain possible IR event. 
+ int regionId = 0 ; + for ( i = 0 ; i < exonBlockCnt ; ) + { + if ( exonBlocks[i].leftType == 2 && exonBlocks[i].rightType == 1 ) + { + j = i + 1 ; + } + else + for ( j = i + 1 ; j < exonBlockCnt ; ++j ) + if ( exonBlocks[j].start > exonBlocks[j - 1].end + 1 || ( exonBlocks[j].leftType == 2 && exonBlocks[j].rightType == 1 ) ) + break ; + for ( k = i ; k < j ; ++k ) + exonBlocks[k].contigId = regionId ; + ++regionId ; + i = j ; + } + int *regionLength = new int[regionId] ; + memset( regionLength, 0, sizeof( int ) * regionId ) ; + for ( i = 0 ; i < exonBlockCnt ; ++i ) + { + regionLength[ exonBlocks[i].contigId ] += exonBlocks[i].end - exonBlocks[i].start + 1 ; + } + + for ( i = 0 ; i < scnt ; ) + { + for ( j = i ; j < scnt ; ++j ) + { + if ( sites[j].chrId != sites[i].chrId || + sites[j].pos != sites[i].pos || + sites[j].type != sites[i].type ) + break ; + } + // [i,j-1] are the indices of the sites with same coordinate + // It is possible a sites corresponds to 2 strands, then we should only keep one of them. + char strand = '.' 
; + int strandSupport[2] = {0, 0}; + for ( k = i ; k < j ; ++k ) + { + if ( sites[k].strand == '-' ) + strandSupport[0] += sites[k].support ; + else if ( sites[k].strand == '+' ) + strandSupport[1] += sites[k].support ; + } + if ( strandSupport[0] > strandSupport[1] ) + strand = '-' ; + else if ( strandSupport[1] > strandSupport[0] ) + strand = '+' ; + + + int cnt = j - i ; + // Locate the first subexon that can overlap with this site + while ( tag < exonBlockCnt ) + { + if ( ( exonBlocks[tag].chrId == sites[i].chrId && exonBlocks[tag].end >= sites[i].pos ) + || exonBlocks[tag].chrId > sites[i].chrId ) + break ; + ++tag ; + } + flag = false ; + for ( k = tag; k < exonBlockCnt ; ++k ) + { + if ( exonBlocks[tag].start > sites[i].pos || + exonBlocks[tag].chrId > sites[i].chrId ) + break ; + + if ( exonBlocks[tag].start == sites[i].pos || + exonBlocks[tag].end == sites[i].pos ) + { + flag = true ; + break ; + } + } + + if ( !flag ) + { + i = j ; + continue ; + } + + if ( sites[i].type == 1 && exonBlocks[tag].start == sites[i].pos ) + { + exonBlocks[tag].prevCnt = 0 ; + exonBlocks[tag].prev = new int[cnt] ; + exonBlocks[tag].leftStrand = strand ; + + // And we also need to put the "next" here. 
+ // Here we assume the oppositePos sorted in increasing order + k = tag - 1 ; + for ( int l = j - 1 ; l >= i ; --l ) + { + for ( ; k >= 0 ; --k ) + { + if ( exonBlocks[k].end < sites[l].oppositePos || exonBlocks[k].chrId != sites[l].chrId ) + break ; + if ( exonBlocks[k].end == sites[l].oppositePos ) //&& exonBlocks[k].rightType == sites[l].type ) + { + if ( sites[l].strand != strand || + sites[l].strand != exonBlocks[k].rightStrand ) + break ; + exonBlocks[k].next[ exonBlocks[k].nextCnt ] = tag ; + exonBlocks[tag].prev[ exonBlocks[tag].prevCnt] = k ; // Note that the prev are sorted in decreasing order + ++exonBlocks[k].nextCnt ; + ++exonBlocks[tag].prevCnt ; + double factor = 1.0 ; + if ( regionLength[ exonBlocks[k].contigId ] < alignments.readLen ) + { + int left ; + for ( left = k ; left >= 0 ; --left ) + if ( exonBlocks[left].contigId != exonBlocks[k].contigId ) + break ; + ++left ; + + int prevType = 0 ; + // only consider the portion upto k. + for ( int m = left ; m <= k ; ++m ) + if ( exonBlocks[m].leftType == 1 ) + ++prevType ; + + if ( prevType == 0 ) + factor = 2 * (double)alignments.readLen / regionLength[ exonBlocks[k].contigId ] ; + } + if ( regionLength[ exonBlocks[tag].contigId ] < alignments.readLen ) + { + int right ; + for ( right = tag ; right < exonBlockCnt ; ++right ) + if ( exonBlocks[right].contigId != exonBlocks[tag].contigId ) + break ; + --right ; + + int nextType = 0 ; + for ( int m = tag ; m <= right ; ++m ) + if ( exonBlocks[m].rightType == 2 ) + ++nextType ; + + if ( nextType == 0 ) + factor = 2 * (double)alignments.readLen / regionLength[ exonBlocks[tag].contigId ] ; + } + if ( factor > 10 ) + factor = 10 ; + + if ( exonBlocks[k].contigId != exonBlocks[tag].contigId ) // the support of introns between regions. + { + // Adjust the support if one of the anchor exon is too short. + // This avoid the creation of alternative 3'/5' end due to low coverage from too short anchor. 
+ exonBlocks[tag].leftSplice += sites[l].support * factor ; + exonBlocks[k].rightSplice += sites[l].support * factor ; + } + break ; + } + } + } + } + else if ( sites[i].type == 2 && exonBlocks[tag].end == sites[i].pos ) + { + exonBlocks[tag].nextCnt = 0 ; // cnt ; it should reach cnt after putting the ids in + exonBlocks[tag].next = new int[cnt] ; + exonBlocks[tag].rightStrand = strand ; + } + + i = j ; + } + // Reverse the prev list + for ( i = 0 ; i < exonBlockCnt ; ++i ) + { + for ( j = 0, k = exonBlocks[i].prevCnt - 1 ; j < k ; ++j, --k ) + { + int tmp = exonBlocks[i].prev[j] ; + exonBlocks[i].prev[j] = exonBlocks[i].prev[k] ; + exonBlocks[i].prev[k] = tmp ; + } + } + + // If all of the subexon anchored the intron are filtered, we need to change the type of the other anchor. + for ( i = 0 ; i < exonBlockCnt ; ++i ) + { + if ( exonBlocks[i].leftType == 1 && exonBlocks[i].prevCnt == 0 ) + { + exonBlocks[i].leftType = 0 ; + exonBlocks[i].leftStrand = '.' ; + if ( i > 0 && exonBlocks[i - 1].rightType == 1 ) + exonBlocks[i - 1].rightType = 0 ; + } + if ( exonBlocks[i].rightType == 2 && exonBlocks[i].nextCnt == 0 ) + { + exonBlocks[i].rightType = 0 ; + exonBlocks[i].rightStrand = '.' ; + if ( i < exonBlockCnt - 1 && exonBlocks[i + 1].leftType == 2 ) + exonBlocks[i + 1].leftType = 0 ; + } + } + MergeNearBlocks() ; + + delete[] regionLength ; + } + + double PickLeftAndRightRatio( double l, double r ) + { + if ( l >= 0 && r >= 0 ) + return l < r ? 
l : r ; + else if ( l < 0 && r < 0 ) + return -1 ; + else if ( l < 0 ) + return r ; + else + return l ; + } + + double PickLeftAndRightRatio( const struct _block &b ) + { + return PickLeftAndRightRatio( b.leftRatio, b.rightRatio ) ; + } + + void ComputeRatios() + { + int i, j ; + int exonBlockCnt = exonBlocks.size() ; + for ( i = 0 ; i < exonBlockCnt ; ++i ) + { + exonBlocks[i].leftRatio = -1 ; + exonBlocks[i].rightRatio = -1 ; + if ( exonBlocks[i].leftType == 2 && exonBlocks[i].rightType == 1 ) + { + // ]...[ + double anchorAvg = 0 ; + double avgDepth = GetAvgDepth( exonBlocks[i] ) ; + if ( i >= 1 && exonBlocks[i - 1].chrId == exonBlocks[i].chrId ) + { + anchorAvg = GetAvgDepth( exonBlocks[i - 1] ) ; + if ( anchorAvg > 1 && avgDepth > 1 ) + exonBlocks[i].leftRatio = ( avgDepth - 1 ) / ( anchorAvg - 1 ) ; + } + if ( i < exonBlockCnt - 1 && exonBlocks[i + 1].chrId == exonBlocks[i].chrId ) + { + anchorAvg = GetAvgDepth( exonBlocks[i + 1] ) ; + if ( anchorAvg > 1 && avgDepth > 1 ) + exonBlocks[i].rightRatio = ( avgDepth - 1 ) / ( anchorAvg - 1 ) ; + } + } + + if ( exonBlocks[i].leftType == 0 && exonBlocks[i].rightType == 1 ) + { + // (..[ , overhang subexon. + double avgDepth = GetAvgDepth( exonBlocks[i] ) ; + double anchorAvg = GetAvgDepth( exonBlocks[i + 1] ) ; + if ( avgDepth > 1 && anchorAvg > 1 ) + exonBlocks[i].rightRatio = ( avgDepth - 1 ) / ( anchorAvg - 1 ) ; + } + + /*if ( exonBlocks[i].leftType == 1 ) + { + // For the case (...[, the leftRatio is actuall the leftratio of the subexon on its right. 
+ int len = 0 ; + double depthSum = 0 ; + int tag = i ; + + for ( j = 0 ; j < exonBlocks[tag].prevCnt ; ++j ) + { + int k = exonBlocks[tag].prev[j] ; + len += ( exonBlocks[k].end - exonBlocks[k].start + 1 ) ; + depthSum += exonBlocks[k].depthSum ; + } + double otherAvgDepth = depthSum / len ; + double avgDepth = GetAvgDepth( exonBlocks[tag] ) ; + + // adjust avgDepth by looking into the following exon blocks(subexon) in the same region whose left side is not hard end + for ( j = tag + 1 ; j < exonBlockCnt ; ++j ) + { + if ( exonBlocks[j].start != exonBlocks[j - 1].end + 1 || exonBlocks[j].leftType == 1 ) + break ; + double tmp = GetAvgDepth( exonBlocks[j] ) ; + if ( tmp > avgDepth ) + avgDepth = tmp ; + } + + if ( avgDepth < 1 ) + avgDepth = 1 ; + if ( otherAvgDepth < 1 ) + otherAvgDepth = 1 ; + + //exonBlocks[i].leftRatio = pow( log( avgDepth ) / log(2.0), 2.0 / 3.0 ) - pow( log( otherAvgDepth ) / log( 2.0 ), 2.0 / 3.0 ); + exonBlocks[i].leftRatio = sqrt( avgDepth ) - log( avgDepth ) - ( sqrt( otherAvgDepth ) + log( otherAvgDepth ) ) ; + + }*/ + if ( exonBlocks[i].rightType == 0 && exonBlocks[i].leftType == 2 ) + { + // for overhang subexon. 
+ double avgDepth = GetAvgDepth( exonBlocks[i] ) ; + double anchorAvg = GetAvgDepth( exonBlocks[i - 1] ) ; + if ( avgDepth > 1 && anchorAvg > 1 ) + exonBlocks[i].leftRatio = ( avgDepth - 1 ) / ( anchorAvg - 1 ) ; + + } + /*if ( exonBlocks[i].rightType == 2 ) + { + int len = 0 ; + double depthSum = 0 ; + int tag = i ; + + for ( j = 0 ; j < exonBlocks[tag].nextCnt ; ++j ) + { + int k = exonBlocks[tag].next[j] ; + len += ( exonBlocks[k].end - exonBlocks[k].start + 1 ) ; + depthSum += exonBlocks[k].depthSum ; + } + double otherAvgDepth = depthSum / len ; + double avgDepth = GetAvgDepth( exonBlocks[tag] ) ; + + // adjust avgDepth by looking into the earlier exon blocks(subexon) in the same region whose right side is not hard end + for ( j = tag - 1 ; j >= 0 ; --j ) + { + if ( exonBlocks[j].end + 1 != exonBlocks[j + 1].start || exonBlocks[j].rightType == 2 ) + break ; + double tmp = GetAvgDepth( exonBlocks[j] ) ; + if ( tmp > avgDepth ) + avgDepth = tmp ; + } + + if ( avgDepth < 1 ) + avgDepth = 1 ; + if ( otherAvgDepth < 1 ) + otherAvgDepth = 1 ; + + exonBlocks[i].rightRatio = sqrt( avgDepth ) - log( avgDepth ) - ( sqrt( otherAvgDepth ) + log( otherAvgDepth ) ) ; + //pow( log( avgDepth ) / log(2.0), 2.0 / 3.0 ) - pow( log( otherAvgDepth ) / log(2.0), 2.0 / 3.0 ); + }*/ + // The remaining case the islands, (...) + } + + // go through each region of subexons, and only compute the ratio for the first and last hard boundary + for ( i = 0 ; i < exonBlockCnt ; ) + { + // the blocks from [i,j) corresponds to a region, namely consecutive subexons. 
+ for ( j = i + 1 ; j < exonBlockCnt ; ++j ) + if ( exonBlocks[j].contigId != exonBlocks[i].contigId ) + break ; + + int leftSupport = 0 ; + int rightSupport = 0 ; + int leftTag = j, rightTag = i - 1 ; + for ( int k = i ; k < j ; ++k ) + { + if ( exonBlocks[k].leftType == 1 && exonBlocks[k].leftSplice > 0 ) + { + leftSupport += exonBlocks[k].leftSplice ; + if ( k < leftTag ) + leftTag = k ; + } + if ( exonBlocks[k].rightType == 2 && exonBlocks[k].rightSplice > 0 ) + { + rightSupport += exonBlocks[k].rightSplice ; + if ( k > rightTag ) + rightTag = k ; + } + } + + if ( leftSupport > 0 && rightSupport > 0 ) + { + // when the right splice support is much greater than that of the left side, there should be a soft boundary for the left side. + // Wether we want to include this soft boundary or not will be determined in class, when it looked at whether the overhang block + // should be kept or not. + exonBlocks[ leftTag ].leftRatio = sqrt( rightSupport ) - log( ( rightSupport + leftSupport ) * 0.5 ) - ( sqrt( leftSupport ) ) ; //+ log( leftSupport ) ) ; + exonBlocks[ rightTag ].rightRatio = sqrt( leftSupport ) - log( ( rightSupport + leftSupport ) * 0.5 ) - ( sqrt( rightSupport ) ) ; //+ log( rightSupport ) ) ; + } + + i = j ; // don't forget this. + } + } +} ; + +#endif diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/classes.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/classes.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,491 @@ +#include +#include +#include +#include + +#include "alignments.hpp" +#include "SubexonGraph.hpp" +#include "SubexonCorrelation.hpp" +#include "Constraints.hpp" +#include "TranscriptDecider.hpp" + +char usage[] = "./classes [OPTIONS]:\n" + "Required:\n" + "\t-s STRING: path to the subexon file.\n" + "\t-b STRING: path to the BAM file.\n" + "\t\tor\n" + "\t--lb STRING: path to the file of the list of BAM files.\n" + "Optional:\n" + "\t-p INT: number of threads. 
(default: 1)\n" + "\t-o STRING: the prefix of the output file. (default: not used)\n" + "\t-c FLOAT: only use the subexons with classifier score <= than the given number. (default: 0.05)\n" + "\t-f FLOAT: filter the transcript from the gene if its abundance is lower than the given number percent of the most abundant one. (default: 0.05)\n" + "\t-d FLOAT: filter the transcript whose average read depth is less than the given number. (default: 2.5)\n" + "\t--ls STRING: path to the file of the list of single-sample subexon files. (default: not used)\n" + "\t--hasMateIdSuffix: the read id has suffix such as .1, .2 for a mate pair. (default: false)\n" + "\t--maxDpConstraintSize: the maximum number of subexons a constraint can cover in dynamic programming. (default: 7; -1 for inf)\n" + "\t--primaryParalog: use primary alignment to retain paralog genes instead of unique alignments. (default: not used)\n" + ; + +static const char *short_options = "s:b:f:o:d:p:c:h" ; +static struct option long_options[] = + { + { "ls", required_argument, 0, 10000 }, + { "lb", required_argument, 0, 10001 }, + { "hasMateIdSuffix", no_argument, 0, 10002 }, + { "primaryParalog", no_argument, 0, 10003 }, + { "maxDpConstraintSize", required_argument, 0, 10004 }, + { (char *)0, 0, 0, 0} + } ; + +struct _getAlignmentsInfoThreadArg +{ + std::vector *pAlignmentFiles ; + int numThreads ; + int tid ; +} ; + +struct _getConstraintsThreadArg +{ + std::vector *pMultiSampleConstraints ; + int numThreads ; + int tid ; + + struct _subexon *subexons ; + int seCnt ; + int start, end ; +} ; + +void *GetAlignmentsInfo_Thread( void *pArg ) +{ + int i ; + std::vector &alignmentFiles = *( ( (struct _getAlignmentsInfoThreadArg *)pArg )->pAlignmentFiles ) ; + int tid = ( (struct _getAlignmentsInfoThreadArg *)pArg )->tid ; + int numThreads = ( (struct _getAlignmentsInfoThreadArg *)pArg )->numThreads ; + int size = alignmentFiles.size() ; + for ( i = 0 ; i < size ; ++i ) + { + if ( i % numThreads == tid ) + { + 
alignmentFiles[i].GetGeneralInfo( true ) ; + alignmentFiles[i].Rewind() ; + } + } + + pthread_exit( NULL ) ; +} + + +void *GetConstraints_Thread( void *pArg ) +{ + int i ; + struct _getConstraintsThreadArg &arg = *( (struct _getConstraintsThreadArg *)pArg ) ; + std::vector &multiSampleConstraints = *( arg.pMultiSampleConstraints ) ; + int tid = arg.tid ; + int numThreads = arg.numThreads ; + int size = multiSampleConstraints.size() ; + for ( i = 0 ; i < size ; ++i ) + { + if ( i % numThreads == tid ) + multiSampleConstraints[i].BuildConstraints( arg.subexons, arg.seCnt, arg.start, arg.end ) ; + } + pthread_exit( NULL ) ; +} + +// Entry point of "classes": parses the command line, loads the alignment files and the subexon graph, +// then solves the transcripts gene interval by gene interval. +int main( int argc, char *argv[] ) +{ + int i, j ; + int size ; + + if ( argc <= 1 ) + { + printf( "%s", usage ) ; + return 0 ; + } + + int c, option_index ; // For getopt + option_index = 0 ; + FILE *fpSubexon = NULL ; + double FPKMFraction = 0.05 ; + double classifierThreshold ; + double txptMinReadDepth = 2.5 ; + char outputPrefix[1024] = "" ; + int numThreads = 1 ; + bool hasMateReadIdSuffix = false ; + bool usePrimaryAsUnique = false ; + int maxDpConstraintSize = 7 ; + + std::vector alignmentFiles ; + SubexonCorrelation subexonCorrelation ; + + classifierThreshold = 0.05 ; + while ( 1 ) + { + c = getopt_long( argc, argv, short_options, long_options, &option_index ) ; + if ( c == -1 ) + break ; + + if ( c == 's' ) + { + fpSubexon = fopen( optarg, "r" ) ; + } + else if ( c == 'b' ) + { + Alignments a ; + a.Open( optarg ) ; + //a.SetAllowClip( false ) ; + alignmentFiles.push_back( a ) ; + } + else if ( c == 'c' ) + { + classifierThreshold = atof( optarg ) ; + } + else if ( c == 'f' ) + { + FPKMFraction = atof( optarg ) ; + } + else if ( c == 'o' ) + { + // Reject a prefix that does not fit instead of overflowing outputPrefix. + if ( snprintf( outputPrefix, sizeof( outputPrefix ), "%s", optarg ) >= (int)sizeof( outputPrefix ) ) + { + printf( "The output prefix is too long.\n" ) ; + exit( 1 ) ; + } + } + else if ( c == 'd' ) + { + txptMinReadDepth = atof( optarg ) ; + } + else if ( c == 'p' ) + { + numThreads = atoi( optarg ) ; + // The count sizes the thread arrays allocated below; fall back to one thread for a non-positive or unparsable value. + if ( numThreads < 1 ) + numThreads = 1 ; + } + else if ( c == 10000 ) // the list of subexon files. 
+ { + subexonCorrelation.Initialize( optarg ) ; + } + else if ( c == 10001 ) // the list of bam files. + { + FILE *fp = fopen( optarg, "r" ) ; + if ( fp == NULL ) + { + printf( "Cannot open the BAM list file %s.\n", optarg ) ; + exit( 1 ) ; + } + char buffer[1024] ; + while ( fgets( buffer, sizeof( buffer ), fp ) != NULL ) + { + int len = strlen( buffer ) ; + if ( len > 0 && buffer[len - 1] == '\n' ) + { + buffer[len - 1] = '\0' ; + --len ; + + } + if ( len == 0 ) // skip blank lines instead of opening an empty path. + continue ; + Alignments a ; + a.Open( buffer ) ; + alignmentFiles.push_back( a ) ; + } + fclose( fp ) ; + } + else if ( c == 10002 ) // the mate pair read id has suffix. + { + hasMateReadIdSuffix = true ; + } + else if ( c == 10003 ) // do not check unique alignment + { + usePrimaryAsUnique = true ; + } + else if ( c == 10004 ) // maxDpConstraintSize + { + maxDpConstraintSize = atoi(optarg) ; + } + else + { + printf( "%s", usage ) ; + exit( 1 ) ; + } + } + if ( fpSubexon == NULL ) + { + printf( "Cannot find combined subexon file.\n" ) ; + exit( 1 ) ; + } + if ( alignmentFiles.size() < 1 ) + { + printf( "Must use -b option to specify BAM files.\n" ) ; + exit( 1 ) ; + } + + + if ( alignmentFiles.size() < 50 ) + { + size = alignmentFiles.size() ; + for ( i = 0 ; i < size ; ++i ) + { + alignmentFiles[i].GetGeneralInfo( true ) ; + alignmentFiles[i].Rewind() ; + } + } + else + { + struct _getAlignmentsInfoThreadArg *args = new struct _getAlignmentsInfoThreadArg[ numThreads ] ; + pthread_attr_t pthreadAttr ; + pthread_t *threads ; + + pthread_attr_init( &pthreadAttr ) ; + pthread_attr_setdetachstate( &pthreadAttr, PTHREAD_CREATE_JOINABLE ) ; + + threads = new pthread_t[ numThreads ] ; + + for ( i = 0 ; i < numThreads ; ++i ) + { + args[i].pAlignmentFiles = &alignmentFiles ; + args[i].tid = i ; + args[i].numThreads = numThreads ; + + pthread_create( &threads[i], &pthreadAttr, GetAlignmentsInfo_Thread, &args[i] ) ; + } + + for ( i = 0 ; i < numThreads ; ++i ) + { + pthread_join( threads[i], NULL ) ; + } + + pthread_attr_destroy( &pthreadAttr ) ; + delete[] args ; + delete[] threads ; + } + + // Build the subexon graph + SubexonGraph subexonGraph( 
classifierThreshold, alignmentFiles[0], fpSubexon ) ; + subexonGraph.ComputeGeneIntervals() ; + + // Solve gene by gene + int sampleCnt = alignmentFiles.size() ; + std::vector multiSampleConstraints ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + Constraints constraints( &alignmentFiles[i] ) ; + constraints.SetHasMateReadIdSuffix( hasMateReadIdSuffix ) ; + constraints.SetUsePrimaryAsUnique( usePrimaryAsUnique ) ; + multiSampleConstraints.push_back( constraints ) ; + } + MultiThreadOutputTranscript outputHandler( sampleCnt, alignmentFiles[0] ) ; + outputHandler.SetOutputFPs( outputPrefix ) ; + + if ( numThreads <= 1 ) + { + TranscriptDecider transcriptDecider( FPKMFraction, classifierThreshold, txptMinReadDepth, sampleCnt, alignmentFiles[0] ) ; + + transcriptDecider.SetMultiThreadOutputHandler( &outputHandler ) ; + transcriptDecider.SetNumThreads( numThreads ) ; + transcriptDecider.SetMaxDpConstraintSize( maxDpConstraintSize ) ; + + int giCnt = subexonGraph.geneIntervals.size() ; + for ( i = 0 ; i < giCnt ; ++i ) + { + struct _geneInterval gi = subexonGraph.geneIntervals[i] ; + struct _subexon *intervalSubexons = new struct _subexon[ gi.endIdx - gi.startIdx + 1 ] ; + subexonGraph.ExtractSubexons( gi.startIdx, gi.endIdx, intervalSubexons ) ; + printf( "%d: %d %s %d %d\n", i, gi.endIdx - gi.startIdx + 1, + alignmentFiles[0].GetChromName( intervalSubexons[0].chrId ), + gi.start + 1, gi.end + 1 ) ; + fflush( stdout ) ; + + subexonCorrelation.ComputeCorrelation( intervalSubexons, gi.endIdx - gi.startIdx + 1, alignmentFiles[0] ) ; + for ( j = 0 ; j < sampleCnt ; ++j ) + multiSampleConstraints[j].BuildConstraints( intervalSubexons, gi.endIdx - gi.startIdx + 1, gi.start, gi.end ) ; + + transcriptDecider.Solve( intervalSubexons, gi.endIdx - gi.startIdx + 1, multiSampleConstraints, subexonCorrelation ) ; + + for ( j = 0 ; j < gi.endIdx - gi.startIdx + 1 ; ++j ) + { + delete[] intervalSubexons[j].prev ; + delete[] intervalSubexons[j].next ; + } + delete[] intervalSubexons ; + } 
+ } + else // multi-thread case. + { + --numThreads ; // one thread is used for read in the data. + + // Allocate memory + struct _transcriptDeciderThreadArg *pArgs = new struct _transcriptDeciderThreadArg[ numThreads ] ; + pthread_mutex_t ftLock ; + int *freeThreads ; + int ftCnt ; + pthread_cond_t fullWorkCond ; + pthread_attr_t pthreadAttr ; + pthread_t *threads ; + pthread_t *getConstraintsThreads ; + bool *initThreads ; + + pthread_mutex_init( &ftLock, NULL ) ; + pthread_cond_init( &fullWorkCond, NULL ) ; + pthread_attr_init( &pthreadAttr ) ; + pthread_attr_setdetachstate( &pthreadAttr, PTHREAD_CREATE_JOINABLE ) ; + + threads = new pthread_t[ numThreads ] ; + getConstraintsThreads = new pthread_t[ numThreads ] ; + initThreads = new bool[numThreads] ; + freeThreads = new int[ numThreads ] ; + ftCnt = numThreads ; + for ( i = 0 ; i < numThreads ; ++i ) + { + pArgs[i].tid = i ; + pArgs[i].sampleCnt = sampleCnt ; + pArgs[i].numThreads = numThreads ; + pArgs[i].maxDpConstraintSize = maxDpConstraintSize ; + pArgs[i].FPKMFraction = FPKMFraction ; + pArgs[i].classifierThreshold = classifierThreshold ; + pArgs[i].txptMinReadDepth = txptMinReadDepth ; + pArgs[i].alignments = &alignmentFiles[0] ; + //pArgs[i].constraints = new std::vector ; + pArgs[i].outputHandler = &outputHandler ; + + freeThreads[i] = i ; + pArgs[i].freeThreads = freeThreads ; + pArgs[i].ftCnt = &ftCnt ; + pArgs[i].ftLock = &ftLock ; + pArgs[i].fullWorkCond = &fullWorkCond ; + + for ( j = 0 ; j < sampleCnt ; ++j ) + { + Constraints constraints( &alignmentFiles[j] ) ; + pArgs[i].constraints.push_back( constraints ) ; + } + + initThreads[i] = false ; + } + + + // Read in and distribute the work + int giCnt = subexonGraph.geneIntervals.size() ; + for ( i = 0 ; i < giCnt ; ++i ) + { + struct _geneInterval gi = subexonGraph.geneIntervals[i] ; + struct _subexon *intervalSubexons = new struct _subexon[ gi.endIdx - gi.startIdx + 1 ] ; + subexonGraph.ExtractSubexons( gi.startIdx, gi.endIdx, intervalSubexons ) 
; + subexonCorrelation.ComputeCorrelation( intervalSubexons, gi.endIdx - gi.startIdx + 1, alignmentFiles[0] ) ; + printf( "%d: %d %s %d %d. Free threads: %d/%d\n", i, gi.endIdx - gi.startIdx + 1, + alignmentFiles[0].GetChromName( intervalSubexons[0].chrId ), + gi.start + 1, gi.end + 1, ftCnt, numThreads + 1 ) ; + fflush( stdout ) ; + + int gctCnt = ftCnt ; + if ( gctCnt > 1 && sampleCnt > 1 ) + { + gctCnt = ( gctCnt < sampleCnt ? gctCnt : sampleCnt ) ; + struct _getConstraintsThreadArg *args = new struct _getConstraintsThreadArg[ gctCnt ] ; + for ( j = 0 ; j < gctCnt ; ++j ) + { + args[j].pMultiSampleConstraints = &multiSampleConstraints ; + args[j].numThreads = gctCnt ; + args[j].tid = j ; + args[j].subexons = intervalSubexons ; + args[j].seCnt = gi.endIdx - gi.startIdx + 1 ; + args[j].start = gi.start ; args[j].end = gi.end ; + pthread_create( &getConstraintsThreads[j], &pthreadAttr, GetConstraints_Thread, &args[j] ) ; + } + + + + for ( j = 0 ; j < gctCnt ; ++j ) + pthread_join( getConstraintsThreads[j], NULL ) ; + + delete[] args ; + } + else + { + for ( j = 0 ; j < sampleCnt ; ++j ) + multiSampleConstraints[j].BuildConstraints( intervalSubexons, gi.endIdx - gi.startIdx + 1, gi.start, gi.end ) ; + } + + // Search for the free queue. + int tag = -1 ; // get the working thread. + pthread_mutex_lock( &ftLock ) ; + // pthread_cond_wait may return spuriously, so re-check the predicate in a loop; + // otherwise freeThreads[ ftCnt - 1 ] below could be read with ftCnt == 0. + while ( ftCnt == 0 ) + { + pthread_cond_wait( &fullWorkCond, &ftLock ) ; + } + tag = freeThreads[ ftCnt - 1 ] ; + --ftCnt ; + pthread_mutex_unlock( &ftLock ) ; + + if ( initThreads[tag] ) + pthread_join( threads[tag], NULL ) ; // Make sure the chosen thread exits. + + // Assign the subexons, the constraints and correlation content. 
+ pArgs[tag].subexons = new struct _subexon[gi.endIdx - gi.startIdx + 1] ; + pArgs[tag].seCnt = gi.endIdx - gi.startIdx + 1 ; + for ( j = 0 ; j < pArgs[tag].seCnt ; ++j ) + { + pArgs[tag].subexons[j] = intervalSubexons[j] ; + int cnt = intervalSubexons[j].prevCnt ; + pArgs[tag].subexons[j].prev = new int[cnt] ; + memcpy( pArgs[tag].subexons[j].prev, intervalSubexons[j].prev, sizeof( int ) * cnt ) ; + cnt = intervalSubexons[j].nextCnt ; + pArgs[tag].subexons[j].next = new int[cnt] ; + memcpy( pArgs[tag].subexons[j].next, intervalSubexons[j].next, sizeof( int ) * cnt ) ; + } + + for ( j = 0 ; j < sampleCnt ; ++j ) + { + pArgs[tag].constraints[j].Assign( multiSampleConstraints[ j ] ) ; + } + pArgs[tag].subexonCorrelation.Assign( subexonCorrelation ) ; + pthread_create( &threads[tag], &pthreadAttr, TranscriptDeciderSolve_Wrapper, &pArgs[tag] ) ; + initThreads[tag] = true ; + for ( j = 0 ; j < gi.endIdx - gi.startIdx + 1 ; ++j ) + { + delete[] intervalSubexons[j].prev ; + delete[] intervalSubexons[j].next ; + } + delete[] intervalSubexons ; + } + + for ( i = 0 ; i < numThreads ; ++i ) + { + if ( initThreads[i] ) + pthread_join( threads[i], NULL ) ; + } + + // Release memory + for ( i = 0 ; i < numThreads ; ++i ) + { + std::vector().swap( pArgs[i].constraints ) ; + } + delete []pArgs ; + pthread_attr_destroy( &pthreadAttr ) ; + pthread_mutex_destroy( &ftLock ) ; + pthread_cond_destroy( &fullWorkCond ) ; + + delete[] threads ; + delete[] getConstraintsThreads ; + delete[] initThreads ; + delete[] freeThreads ; + } // end of else for multi-thread. + + outputHandler.OutputCommandInfo( argc, argv ) ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + char buffer[1024] ; + alignmentFiles[i].GetFileName( buffer ) ; + int separator = -1 ; + + // deprive the directory. 
+ for ( j = 0 ; buffer[j] ; ++j ) + { + if ( buffer[j] == '/' ) + separator = j ; + } + if ( separator != -1 ) + { + ++separator ; + for ( j = separator ; buffer[j] ; ++j ) + buffer[j - separator] = buffer[j] ; + buffer[j - separator] = '\0' ; + } + outputHandler.OutputCommentToSampleGTF( i, buffer ) ; + } + outputHandler.ComputeFPKMTPM( alignmentFiles ) ; + outputHandler.Flush() ; + + for ( i = 0 ; i < sampleCnt ; ++i ) + alignmentFiles[i].Close() ; + fclose( fpSubexon ) ; + return 0 ; +} diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/defs.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/defs.h Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,24 @@ +#ifndef _LSONG_CLASSES_DEFS_HEADER +#define _LSONG_CLASSES_DEFS_HEADER + +#include + +#define MAX_SEG_COUNT 127 + +struct _pair +{ + int64_t a, b ; +} ; + +struct _pair32 +{ + int a, b ; +} ; + +/*char nucToNum[26] = { 0, -1, 1, -1, -1, -1, 2, + -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, 3, + -1, -1, -1, -1, -1, -1 } ; + +char numToNuc[26] = {'A', 'C', 'G', 'T'} ;*/ +#endif diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/example/s1.bam Binary file PsiCLASS-1.0.2/example/s1.bam has changed diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/example/s2.bam Binary file PsiCLASS-1.0.2/example/s2.bam has changed diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/example/slist --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/example/slist Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,2 @@ +example/s1.bam +example/s2.bam diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/gamma.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/gamma.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,201 @@ +#include "gamma.hpp" + +/** The digamma function in long double precision. +* @param x the real value of the argument +* @return the value of the digamma (psi) function at that point +* @author Richard J. 
Mathar +* @since 2005-11-24 +*/ +long double digammal(long double x) +{ + /* force into the interval 1..3 */ + if( x < 0.0L ) + return digammal(1.0L-x)+M_PIl/tanl(M_PIl*(1.0L-x)) ; /* reflection formula */ + else if( x < 1.0L ) + return digammal(1.0L+x)-1.0L/x ; + else if ( x == 1.0L) + return -M_GAMMAl ; + else if ( x == 2.0L) + return 1.0L-M_GAMMAl ; + else if ( x == 3.0L) + return 1.5L-M_GAMMAl ; + else if ( x > 3.0L) + /* duplication formula */ + return 0.5L*(digammal(x/2.0L)+digammal((x+1.0L)/2.0L))+M_LN2l ; + else + { + /* Just for your information, the following lines contain + * the Maple source code to re-generate the table that is + * eventually becoming the Kncoe[] array below + * interface(prettyprint=0) : + * Digits := 63 : + * r := 0 : + * + * for l from 1 to 60 do + * d := binomial(-1/2,l) : + * r := r+d*(-1)^l*(Zeta(2*l+1) -1) ; + * evalf(r) ; + * print(%,evalf(1+Psi(1)-r)) ; + *o d : + * + * for N from 1 to 28 do + * r := 0 : + * n := N-1 : + * + * for l from iquo(n+3,2) to 70 do + * d := 0 : + * for s from 0 to n+1 do + * d := d+(-1)^s*binomial(n+1,s)*binomial((s-1)/2,l) : + * od : + * if 2*l-n > 1 then + * r := r+d*(-1)^l*(Zeta(2*l-n) -1) : + * fi : + * od : + * print(evalf((-1)^n*2*r)) ; + *od : + *quit : + */ + static long double Kncoe[] = { .30459198558715155634315638246624251L, + .72037977439182833573548891941219706L, -.12454959243861367729528855995001087L, + .27769457331927827002810119567456810e-1L, -.67762371439822456447373550186163070e-2L, + .17238755142247705209823876688592170e-2L, -.44817699064252933515310345718960928e-3L, + .11793660000155572716272710617753373e-3L, -.31253894280980134452125172274246963e-4L, + .83173997012173283398932708991137488e-5L, -.22191427643780045431149221890172210e-5L, + .59302266729329346291029599913617915e-6L, -.15863051191470655433559920279603632e-6L, + .42459203983193603241777510648681429e-7L, -.11369129616951114238848106591780146e-7L, + .304502217295931698401459168423403510e-8L, 
-.81568455080753152802915013641723686e-9L, + .21852324749975455125936715817306383e-9L, -.58546491441689515680751900276454407e-10L, + .15686348450871204869813586459513648e-10L, -.42029496273143231373796179302482033e-11L, + .11261435719264907097227520956710754e-11L, -.30174353636860279765375177200637590e-12L, + .80850955256389526647406571868193768e-13L, -.21663779809421233144009565199997351e-13L, + .58047634271339391495076374966835526e-14L, -.15553767189204733561108869588173845e-14L, + .41676108598040807753707828039353330e-15L, -.11167065064221317094734023242188463e-15L } ; + + register long double Tn_1 = 1.0L ; /* T_{n-1}(x), started at n=1 */ + register long double Tn = x-2.0L ; /* T_{n}(x) , started at n=1 */ + register long double resul = Kncoe[0] + Kncoe[1]*Tn ; + + x -= 2.0L ; + int n ; + + for( n = 2 ; n < sizeof(Kncoe)/sizeof(long double) ;n++) + { + const long double Tn1 = 2.0L * x * Tn - Tn_1 ; /* Chebyshev recursion, Eq. 22.7.4 Abramowitz-Stegun */ + resul += Kncoe[n]*Tn1 ; + Tn_1 = Tn ; + Tn = Tn1 ; + } + return resul ; + } +} + + + +double trigamma ( double x, int *ifault ) + +//**************************************************************************** +// purpose: +// +// trigamma calculates trigamma(x) = d**2 log(gamma(x)) / dx**2 +// +// licensing: +// +// this code is distributed under the gnu lgpl license. +// +// modified: +// +// 19 january 2008 +// +// author: +// +// original fortran77 version by be schneider. +// c++ version by john burkardt. +// +// reference: +// +// be schneider, +// algorithm as 121: +// trigamma function, +// applied statistics, +// volume 27, number 1, pages 97-99, 1978. +// +// parameters: +// +// input, double x, the argument of the trigamma function. +// 0 < x. +// +// output, int *ifault, error flag. +// 0, no error. +// 1, x <= 0. +// +// output, double trigamma, the value of the trigamma function at x. 
+// +{ + double a = 0.0001; + double b = 5.0; + double b2 = 0.1666666667; + double b4 = -0.03333333333; + double b6 = 0.02380952381; + double b8 = -0.03333333333; + double value; + double y; + double z; + // + // check the input. + // + if ( x <= 0.0 ) + { + *ifault = 1; + value = 0.0; + return value; + } + + *ifault = 0; + z = x; + // + // use small value approximation if x <= a. + // + if ( x <= a ) + { + value = 1.0 / x / x; + return value; + } + // + // increase argument to ( x + i ) >= b. + // + value = 0.0; + + while ( z < b ) + { + value = value + 1.0 / z / z; + z = z + 1.0; + } + // + // apply asymptotic formula if argument is b or greater. + // + y = 1.0 / z / z; + + value = value + 0.5 * + y + ( 1.0 + y * ( b2+ y * ( b4 + y * ( b6+ y * b8 )))) / z; + + return value; +} + + +double LogGammaDensity( double x, double k, double theta ) +{ + return -k * log( theta ) + ( k - 1 ) * log( x ) - x / theta - lgamma( k ) ; +} + +double MixtureGammaAssignment( double x, double pi, double* k, double *theta ) +{ + if ( pi == 1 ) + return 0 ; + else if ( pi == 0 ) + return 1 ; + + double lf0 = LogGammaDensity( x, k[0], theta[0] ) ; + double lf1 = LogGammaDensity( x, k[1], theta[1] ) ; + + return (double)1.0 / ( 1.0 + exp( lf1 + log( 1 - pi ) - lf0 - log( pi ) ) ) ; +} diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/gamma.hpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/gamma.hpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,34 @@ +#ifndef _LSONG_GAMMA +#define _LSONG_GAMMA + +#include +#include + +#ifndef M_PIl +/** The constant Pi in high precision */ +#define M_PIl 3.1415926535897932384626433832795029L +#endif +#ifndef M_GAMMAl +/** Euler's constant in high precision */ +#define M_GAMMAl 0.5772156649015328606065120900824024L +#endif +#ifndef M_LN2l +/** the natural logarithm of 2 in high precision */ +#define M_LN2l 0.6931471805599453094172321214581766L +#endif + +/** The digamma function in long double precision. 
+* @param x the real value of the argument +* @return the value of the digamma (psi) function at that point +* @author Richard J. Mathar +* @since 2005-11-24 +*/ +long double digammal(long double x) ; + +//https://people.sc.fsu.edu/~jburkardt/cpp_src/asa121/asa121.hpp +double trigamma ( double x, int *ifault ); + + +double LogGammaDensity( double x, double k, double theta ) ; +double MixtureGammaAssignment( double x, double pi, double* k, double *theta ) ; +#endif diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/grader.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/grader.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,840 @@ +// Compare the reference and the predictions +// Format: ./a.out ref.gtf prediction.gtf +#include +#include +#include + +#define MAX_TXPT 2000000 + +#define DEBUG 0 + +bool flagMultiExonOnly = false ; +bool flagValidChr = true ; +int relaxWidth = 20 ; + +struct _pair +{ + int a, b ; +} ; + +struct _transcript +{ + char tid[50] ; + char chrom[50] ; + char strand ; + struct _pair *exons ; + int ecnt ; +} ; + +struct _info +{ + double coverage ; + int byId ; // It gets this coverage by comparing with byId. 
+} ; + +struct _intron +{ + char chrom[50] ; + int start, end ; + int tindex ; // The index to the corresponding transcripts +} ; + +struct _exon +{ + char chrom[50] ; + int start, end ; + //int tindex ; + int soft ; // left-bit: left side is soft, right-bit: right side is soft + bool matched ; +} ; + +int TranscriptComp( const void *p1, const void *p2 ) +{ + const struct _transcript *a = (struct _transcript *)p1 ; + const struct _transcript *b = ( struct _transcript *)p2 ; + int tmp = strcmp( a->chrom, b->chrom ) ; + if ( tmp != 0 ) + return tmp ; + return a->exons[0].a - b->exons[0].a ; +} + +int TranscriptComp_ByIntron( const void *p1, const void *p2 ) +{ + const struct _transcript *a = (struct _transcript *)p1 ; + const struct _transcript *b = ( struct _transcript *)p2 ; + if ( a->strand != b->strand ) + return a->strand - b->strand ; + int tmp = strcmp( a->chrom, b->chrom ) ; + if ( tmp != 0 ) + return tmp ; + if ( a->ecnt != b->ecnt ) + return a->ecnt - b->ecnt ; + int i ; + for ( i = 0 ; i < a->ecnt - 1 ; ++i ) + { + if ( a->exons[i].b != b->exons[i].b ) + return a->exons[i].b - b->exons[i].b ; + if ( a->exons[i + 1].a != b->exons[i + 1].a ) + return a->exons[i + 1].a - b->exons[i + 1].a ; + } + + return 0 ; //strcmp( a->tid, b->tid ) ; +} + +bool validChrom( char *chrom ) +{ + if ( !flagValidChr ) + return true ; + if ( chrom[0] != 'c' || chrom[1] != 'h' || chrom[2] != 'r' ) + return false ; + // only consider chr1-22,x,y,z + if ( chrom[3]=='x' || chrom[3]=='X' + || chrom[3] == 'y' || chrom[3] == 'Y' + || chrom[3] == 'm' || chrom[3] == 'M' + || ( chrom[3] >= '3' && chrom[3] <= '9' ) ) + { + if ( chrom[4] == '\0' ) + return true ; + else + return false ; + } + if ( chrom[3] == '1' ) + { + if ( chrom[4] == '\0' ) + return true ; + if ( chrom[4] >= '0' && chrom[4] <= '9' && chrom[5] == '\0' ) + return true ; + return false ; + } + if ( chrom[3] == '2' ) + { + if ( chrom[4] == '\0' ) + return true ; + if ( chrom[4] >= '0' && chrom[4] <= '2' && chrom[5] == '\0' 
) + return true ; + return false ; + } + return false ; +} + +void ReverseExonList( struct _pair *exons, int ecnt ) +{ + if ( ecnt < 2 || ( exons[0].a < exons[1].a ) ) + return ; + int i, j ; + struct _pair tmp ; + + for ( i = 0, j = ecnt - 1 ; i < j ; ++i, --j ) + { + tmp = exons[i] ; + exons[i] = exons[j] ; + exons[j] = tmp ; + } +} + +/** + Merge exons that are next to each other. Showed up in scripture. +*/ +int CleanExonList( struct _pair *exons, int ecnt ) +{ + int i, k ; + for ( i = 0 ; i < ecnt - 1 ; ++i ) + if ( exons[i + 1].a - exons[i].b - 1 < 20 ) + { + exons[i + 1].a = exons[i].a ; + exons[i].a = -1 ; + } + k = 0 ; + for ( i = 0 ; i < ecnt ; ++i ) + { + if ( exons[i].a == -1 ) + continue ; + exons[k] = exons[i] ; + ++k ; + } + return k ; +} + +/** + Remove the duplicated intron chain in the list +*/ +int RemoveDuplicateIntronChain( struct _transcript *t, int tcnt ) +{ + int i, j, k ; + + qsort( t, tcnt, sizeof( *t ), TranscriptComp_ByIntron ) ; + for ( i = 1 ; i < tcnt ; ++i ) + { + k = i - 1 ; + while ( t[k].ecnt == -1 ) + --k ; + + if ( t[i].ecnt != t[k].ecnt || strcmp( t[i].chrom, t[k].chrom ) || t[i].strand != t[k].strand ) + continue ; + for ( j = 0 ; j < t[i].ecnt - 1 ; ++j ) + if ( t[i].exons[j].b != t[k].exons[j].b || + t[i].exons[j + 1].a != t[k].exons[j + 1].a ) + break ; + if ( j >= t[i].ecnt - 1 && t[i].ecnt != 1 ) + t[i].ecnt = -1 ; + if ( t[i].ecnt == 1 && t[i].exons[0].a == t[k].exons[0].a && + t[i].exons[0].b == t[k].exons[0].b ) + t[i].ecnt = -1 ; + /*if ( t[i].ecnt == -1 ) + { + printf( "%s <- %s\n", t[i].tid, t[k].tid ) ; + }*/ + } + + k = 0 ; + for ( i = 0 ; i < tcnt ; ++i ) + { + if ( t[i].ecnt == -1 ) + { + free( t[i].exons ) ; + continue ; + } + t[k] = t[i] ; + ++k ; + } + tcnt = k ; + return tcnt ; +} + + +double CompareTranscripts( const struct _transcript &ref, const struct _transcript &pred ) +{ + int i, j ; + struct _pair *refExons = ref.exons ; + int refECnt = ref.ecnt ; + struct _pair *predExons = pred.exons ; + int 
predECnt = pred.ecnt ; + + // Prediction must be a "subset" of the reference + if ( refECnt < predECnt ) + return -1 ; + + // single exon case + if ( refECnt == 1 ) + { + if ( refExons[0].b < predExons[0].a || refExons[0].a > predExons[0].b ) + return -1 ; + else + return 1 ; + } + + if ( predECnt == 1 ) + { + for ( i = 0 ; i < refECnt ; ++i ) + { + if ( refExons[i].b < predExons[0].a || refExons[i].a > predExons[0].b ) + continue ; + else + { + /*if ( i == 0 && predExons[0].a >= refExons[0].a - relaxWidth ) + return 0 ; + else if ( i == refECnt - 1 && predExons[0].b <= refExons[i].b + relaxWidth ) + return 0 ; + else if ( i > 0 && i < refECnt - 1 && predExons[0].a >= refExons[i].a - relaxWidth && + predExons[0].b <= refExons[i].b + relaxWidth ) + return 0 ; + + return -1 ;*/ + return 0 ; + } + } + return -1 ; + } + + if ( predECnt > 0 && ref.strand != pred.strand ) + return -1 ; + + // Test the intron chain + for ( i = 0 ; i < refECnt - 1 ; ++i ) + if ( refExons[i].b == predExons[0].b ) + break ; + if ( i >= refECnt - 1 ) + return -1 ; + //if ( i != 0 && predExons[0].a < refExons[i].a - relaxWidth ) + // return -1 ; + + for ( j = 0 ; i < refECnt - 1 && j < predECnt - 1 ; ++i, ++j ) + { + if ( refExons[i].b != predExons[j].b || refExons[i + 1].a != predExons[j + 1].a ) + break ; + } + if ( j >= predECnt - 1 ) + { + /*if ( i == refECnt - 1 || + predExons[ predECnt - 1 ].b <= refExons[i].b + relaxWidth ) + return (double)( predECnt - 1 ) / (double)( refECnt - 1 ) ; + else + return -1 ;*/ + return (double)( predECnt - 1 ) / (double)( refECnt - 1 ) ; + } + else + return -1 ; +} + +bool WithinRange( int a, int b, int w = 10 ) +{ + if ( a - b >= -w && a - b <= w ) + return true ; + return false ; +} + +int GetIntrons( struct _transcript *transcripts, int tcnt, struct _intron *introns ) +{ + int i, j, k, tag ; + int cnt = 0 ; + for ( i = 0 ; i < tcnt ; ++i ) + { + for ( j = 0 ; j < transcripts[i].ecnt - 1 ; ++j ) + { + bool flag = false ; + + tag = 0 ; + for ( k = cnt - 1 
; k >= 0 ; --k ) + { + if ( strcmp( introns[k].chrom, transcripts[i].chrom ) ) + { + //printf( "hi\n" ) ; + tag = k + 1 ; + break ; + } + if ( introns[k].start < transcripts[i].exons[j].b ) + { + tag = k + 1 ; + break ; + } + else if ( introns[k].start == transcripts[i].exons[j].b && + introns[k].end == transcripts[i].exons[j + 1].a ) + { + flag = true ; + break ; + } + } + + if ( !flag ) + { + for ( k = cnt ; k > tag ; --k ) + introns[k] = introns[k - 1] ; + strcpy( introns[k].chrom, transcripts[i].chrom ) ; + introns[k].start = transcripts[i].exons[j].b ; + introns[k].end = transcripts[i].exons[j + 1].a ; + introns[k].tindex = i ; + ++cnt ; + } + } + } + return cnt ; +} + +int GetExons( struct _transcript *transcripts, int tcnt, struct _exon *exons ) +{ + int i, j, k, tag ; + int cnt = 0 ; + for ( i = 0 ; i < tcnt ; ++i ) + { + for ( j = 0 ; j < transcripts[i].ecnt ; ++j ) + { + bool flag = false ; + int soft = 0 ; + if ( j == 0 ) + soft = soft | 2 ; + if ( j == transcripts[i].ecnt - 1 ) + soft = soft | 1 ; + + tag = 0 ; + for ( k = cnt - 1 ; k >= 0 ; --k ) + { + if ( strcmp( exons[k].chrom, transcripts[i].chrom ) ) + { + //printf( "hi\n" ) ; + tag = k + 1 ; + break ; + } + + if ( exons[k].start < transcripts[i].exons[j].a ) + { + tag = k + 1 ; + break ; + } + else if ( exons[k].start == transcripts[i].exons[j].a && + exons[k].end == transcripts[i].exons[j].b && + exons[k].soft == soft ) + { + flag = true ; + break ; + } + } + + if ( !flag ) + { + for ( k = cnt ; k > tag ; --k ) + exons[k] = exons[k - 1] ; + strcpy( exons[k].chrom, transcripts[i].chrom ) ; + exons[k].start = transcripts[i].exons[j].a ; + exons[k].end = transcripts[i].exons[j].b ; + //exons[k].tindex = i ; + exons[k].soft = soft ; + exons[k].matched = false ; + ++cnt ; + } + } + } + return cnt ; +} + +//chr1 HAVANA transcript 320162 324461 . + . 
gene_id "ENSG00000237094.6"; transcript_id "ENST00000423728.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP4-669L17.10"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP4-669L17.10-002"; level 2; havana_gene "OTTHUMG00000156968.4"; havana_transcript "OTTHUMT00000346879.1"; +int main( int argc, char *argv[] ) +{ + FILE *fp ; + struct _transcript *ref, *pred ; + struct _info *refInfo, *predInfo ; + + int refCnt, predCnt ; + char line[10000], filename[10000] ; + struct _pair tmpExons[10000] ; + int tmpECnt = 0 ; + + char chrom[50], tool[20], type[40], strand[3] ; + char tid[50] ; + char buffer[50] ; + int start, end ; + + int i, j, k, tag ; + + if ( argc == 1 ) + { + printf( "Compare the reference transcripts and predicted transcripts.\n" + "Format: ./grader ref.gtf prediction.gtf\n" ) ; + exit( 1 ) ; + } + // Parse the arguments + for ( i = 3 ; i < argc ; ++i ) + { + if ( !strcmp( argv[i], "-M" ) ) + flagMultiExonOnly = true ; + else if ( !strcmp( argv[i], "-ac" ) ) + flagValidChr = false ; + else + { + printf( "Unknown argument: %s\n", argv[i] ) ; + exit( 1 ) ; + } + } + + /*======================================================================================== + Stage 1: Read the transcripts from the reference and prediction's GTF files. 
+ ========================================================================================*/ + fp = NULL ; + fp = fopen( argv[1], "r" ) ; + if ( fp == NULL ) + { + printf( "Could not open file %s.\n", argv[1] ) ; + exit( 1 ) ; + } + refCnt = 0 ; + ref = ( struct _transcript *)malloc( MAX_TXPT * sizeof( struct _transcript ) ) ; + + strcpy( ref[0].tid, "tmp_-1" ) ; + while ( fgets( line, sizeof( line ), fp ) != NULL ) + { + if ( line[0] == '#' ) + continue ; + + sscanf( line, "%s %s %s %d %d %s %s", chrom, tool, type, &start, &end, buffer, strand ) ; + + if ( strcmp( type, "exon" ) || !validChrom( chrom ) ) + continue ; + + char *p = strstr( line, "transcript_id" ) ; + for ( ; *p != ' ' ; ++p ) + ; + p += 2 ; + sscanf( p, "%s", tid ) ; + //printf( "+%s %d\n", tid, strlen( tid ) ) ; + p = tid + strlen( tid ) ; + while ( *p != '\"' ) + --p ; + *p = '\0' ; + //printf( "%s\n", tid ) ; + if ( strcmp( tid, ref[ refCnt ].tid ) && tmpECnt ) + { + ReverseExonList( tmpExons, tmpECnt ) ; + tmpECnt = CleanExonList( tmpExons, tmpECnt ) ; + if ( tmpECnt > 1 || !flagMultiExonOnly ) + { + ref[ refCnt ].ecnt = tmpECnt ; + ref[ refCnt ].exons = ( struct _pair * )malloc( sizeof( struct _pair ) * tmpECnt ) ; + memcpy( ref[ refCnt ].exons, tmpExons, sizeof( tmpExons[0] ) * tmpECnt ) ; + ++refCnt ; + } + tmpECnt = 0 ; + } + + tmpExons[ tmpECnt ].a = start ; + tmpExons[ tmpECnt ].b = end ; + ++tmpECnt ; + + ref[ refCnt ].strand = strand[0] ; + strcpy( ref[ refCnt ].chrom, chrom ) ; + strcpy( ref[ refCnt ].tid, tid ) ; + } + if ( tmpECnt != 0 ) + { + ReverseExonList( tmpExons, tmpECnt ) ; + tmpECnt = CleanExonList( tmpExons, tmpECnt ) ; + if ( tmpECnt > 1 || !flagMultiExonOnly ) + { + ref[ refCnt ].ecnt = tmpECnt ; + ref[ refCnt ].exons = ( struct _pair * )malloc( sizeof( struct _pair ) * tmpECnt ) ; + memcpy( ref[ refCnt ].exons, tmpExons, sizeof( tmpExons[0] ) * tmpECnt ) ; + ++refCnt ; + } + tmpECnt = 0 ; + } + fclose( fp ) ; + + fp = NULL ; + fp = fopen( argv[2], "r" ) ; + if ( fp == 
NULL ) + { + printf( "Could not open file %s.\n", argv[2] ) ; + exit( 1 ) ; + } + predCnt = 0 ; + pred = ( struct _transcript *)malloc( MAX_TXPT * sizeof( struct _transcript ) ) ; + + strcpy( pred[0].tid, "tmp_-1" ) ; + tmpECnt = 0 ; + while ( fgets( line, sizeof( line ), fp ) != NULL ) + { + if ( line[0] == '#' ) + continue ; + sscanf( line, "%s %s %s %d %d %s %s", chrom, tool, type, &start, &end, buffer, strand ) ; + + if ( strcmp( type, "exon" ) || !validChrom( chrom ) ) + continue ; + + char *p = strstr( line, "transcript_id" ) ; + //char *p = strstr( line, "gene_name" ) ; + for ( ; *p != ' ' ; ++p ) + ; + p += 2 ; + sscanf( p, "%s", tid ) ; + //tid[ strlen( tid ) - 2 ] = '\0' ; + p = tid + strlen( tid ) ; + while ( *p != '\"' ) + --p ; + *p = '\0' ; + + if ( strcmp( tid, pred[ predCnt ].tid ) && tmpECnt ) + { + ReverseExonList( tmpExons, tmpECnt ) ; + tmpECnt = CleanExonList( tmpExons, tmpECnt ) ; + if ( tmpECnt > 1 || !flagMultiExonOnly ) + { + pred[ predCnt ].ecnt = tmpECnt ; + pred[ predCnt ].exons = ( struct _pair * )malloc( sizeof( struct _pair ) * tmpECnt ) ; + memcpy( pred[ predCnt ].exons, tmpExons, sizeof( tmpExons[0] ) * tmpECnt ) ; + ++predCnt ; + } + tmpECnt = 0 ; + } + + tmpExons[ tmpECnt ].a = start ; + tmpExons[ tmpECnt ].b = end ; + ++tmpECnt ; + + pred[ predCnt ].strand = strand[0] ; + strcpy( pred[ predCnt ].chrom, chrom ) ; + strcpy( pred[ predCnt ].tid, tid ) ; + } + if ( tmpECnt > 0 ) + { + ReverseExonList( tmpExons, tmpECnt ) ; + tmpECnt = CleanExonList( tmpExons, tmpECnt ) ; + if ( tmpECnt > 1 || !flagMultiExonOnly ) + { + pred[ predCnt ].ecnt = tmpECnt ; + pred[ predCnt ].exons = ( struct _pair * )malloc( sizeof( struct _pair ) * tmpECnt ) ; + memcpy( pred[ predCnt ].exons, tmpExons, sizeof( tmpExons[0] ) * tmpECnt ) ; + ++predCnt ; + } + tmpECnt = 0 ; + } + + refCnt = RemoveDuplicateIntronChain( ref, refCnt ) ; + //printf( "predCnt = %d\n", predCnt ) ; + predCnt = RemoveDuplicateIntronChain( pred, predCnt ) ; + //printf( "predCnt = 
%d\n", predCnt ) ; + qsort( ref, refCnt, sizeof( ref[0] ), TranscriptComp ) ; + qsort( pred, predCnt, sizeof( pred[0] ), TranscriptComp ) ; + + + /*======================================================================================== + Stage 2: Compute the recall and precision. + ========================================================================================*/ + refInfo = ( struct _info * )malloc( refCnt * sizeof( *refInfo ) ) ; + predInfo = ( struct _info * )malloc( predCnt * sizeof( *predInfo ) ) ; + for ( i = 0 ; i < refCnt ; ++i ) + { + refInfo[i].coverage = -1 ; + refInfo[i].byId = -1 ; +#if DEBUG + printf( "=%d %s %s %d\n", i, ref[i].chrom, ref[i].tid, ref[i].ecnt ) ; + for ( j = 0 ; j < ref[i].ecnt ; ++j ) + printf( "%d %d\n", ref[i].exons[j].a, ref[i].exons[j].b ) ; +#endif + + } + for ( i = 0 ; i < predCnt ; ++i ) + { + predInfo[i].coverage = -1 ; + predInfo[i].byId = -1 ; +#if DEBUG + printf( "#%d %s %s %d\n", i, pred[i].chrom, pred[i].tid, pred[i].ecnt ) ; + for ( j = 0 ; j < pred[i].ecnt ; ++j ) + printf( "%d %d\n", pred[i].exons[j].a, pred[i].exons[j].b ) ; +#endif + } + tag = 0 ; + for ( i = 0 ; i < predCnt ; ++i ) + { + while ( tag < refCnt && + ( ( strcmp( ref[tag].chrom, pred[i].chrom ) < 0 ) || + ( !strcmp( ref[tag].chrom, pred[i].chrom ) && + ref[tag].exons[ ref[tag].ecnt - 1 ].b < pred[i].exons[0].a - relaxWidth ) ) ) + ++tag ; + + // if ( i == 16474 ) + // printf( "%d %d %d", i, tag, refCnt ) ; + for ( j = tag ; j < refCnt && ref[j].exons[0].a <= pred[i].exons[ pred[i].ecnt - 1 ].b + relaxWidth && !strcmp( ref[j].chrom, pred[i].chrom ); ++j ) + { + /*if ( !strcmp( pred[i].tid, "CUFF.4256.1" ) ) + { + printf( "%s ? 
%s\n", pred[i].tid, ref[j].tid ) ; + }*/ + // if ( i == 16474 ) + // { + // printf( "%d %d\n", i, j ) ; + // } + double coverage = CompareTranscripts( ref[j], pred[i] ) ; + if ( coverage > refInfo[j].coverage ) + { + refInfo[j].coverage = coverage ; + refInfo[j].byId = i ; + } + if ( coverage > predInfo[i].coverage ) + { + predInfo[i].coverage = coverage ; + predInfo[i].byId = j ; + } + } + } + + /*======================================================================================== + Stage 3: Dump the information into output files. + ========================================================================================*/ + //sprintf( filename, "grader.%s.recall", argv[1] ) ; + sprintf( filename, "grader.recall" ) ; + fp = fopen( filename, "w" ) ; + for ( i = 0 ; i < refCnt ; ++i ) + { + fprintf( fp, "%s\t%d\t%lf\t%s\n", ref[i].tid, ref[i].ecnt, refInfo[i].coverage, + refInfo[i].byId == -1 ? "-" : pred[ refInfo[i].byId ].tid ) ; + } + fclose( fp ) ; + + //sprintf( filename, "grader.%s.precision", argv[2] ) ; + sprintf( filename, "grader.precision" ) ; + fp = fopen( filename, "w" ) ; + for ( i = 0 ; i < predCnt ; ++i ) + { + fprintf( fp, "%s\t%d\t%lf\t%s\n", pred[i].tid, pred[i].ecnt, predInfo[i].coverage, + predInfo[i].byId == -1 ? 
"-" : ref[predInfo[i].byId].tid ) ; + } + fclose( fp ) ; + + // print the summary to the stdout + const int binCnt = 20 ; + int bins[binCnt + 1] ; + memset( bins, 0, sizeof( bins ) ) ; + k = 0 ; + for ( i = 0 ; i < refCnt ; ++i ) + { + if ( refInfo[i].coverage == -1 ) + continue ; + ++bins[ (int)( refInfo[i].coverage * binCnt ) ] ; + ++k ; + } + for ( i = 0 ; i <= binCnt ; ++i ) + { + printf( "recall %lf %lf %d/%d\n", (double)i / binCnt, (double)k / refCnt, k, refCnt ) ; + k -= bins[i] ; + } + + memset( bins, 0, sizeof( bins ) ) ; + k = 0 ; + for ( i = 0 ; i < predCnt ; ++i ) + { + if ( predInfo[i].coverage == -1 ) + continue ; + ++bins[ (int)( predInfo[i].coverage * binCnt ) ] ; + ++k ; + } + for ( i = 0 ; i <= binCnt ; ++i ) + { + printf( "precision %lf %lf %d/%d\n", (double)i / binCnt, (double)k / predCnt, k, predCnt ) ; + k -= bins[i] ; + } + + /*======================================================================================== + Stage 4: Evaluations for introns. + ========================================================================================*/ + struct _intron *refIntrons = ( struct _intron * )malloc( sizeof( struct _intron ) * MAX_TXPT ) ; + int riCnt = GetIntrons( ref, refCnt, refIntrons ) ; + struct _intron *predIntrons = ( struct _intron * )malloc( sizeof( struct _intron ) * MAX_TXPT ) ; + int piCnt = GetIntrons( pred, predCnt, predIntrons ) ; + int matchedIntron = 0 ; + /*for ( i = 0 ; i < riCnt ; ++i ) + { + printf( "%d %s %d %d\n", i, refIntrons[i].chrom, refIntrons[i].start, refIntrons[i].end ) ; + }*/ + + fp = fopen( "grader.intron", "w" ) ; + tag = 0 ; + for ( i = 0 ; i < piCnt ; ++i ) + { + bool flag = false ; + while ( 1 ) + { + if ( tag >= riCnt ) + break ; + int tmp = strcmp( refIntrons[tag].chrom, predIntrons[i].chrom ) ; + if ( tmp < 0 || ( tmp == 0 && refIntrons[tag].start < predIntrons[i].start ) ) + ++tag ; + else + break ; + } + //printf( "%d (%s %d %d) %d=>", i, predIntrons[i].chrom, predIntrons[i].start, 
predIntrons[i].end, tag ) ; + /*if ( predIntrons[i].start == 1613457 ) + printf( "%d %d\n", tag, riCnt ) ;*/ + for ( j = tag ; j < riCnt ; ++j ) + { + if ( strcmp( refIntrons[j].chrom, predIntrons[i].chrom ) > 0 || + refIntrons[j].start > predIntrons[i].start /*+ 10*/ ) + break ; + if ( refIntrons[j].start == predIntrons[i].start && + refIntrons[j].end == predIntrons[i].end ) + //if ( WithinRange( refIntrons[j].start, predIntrons[i].start ) && + // WithinRange( refIntrons[j].end, predIntrons[i].end ) ) + { + ++matchedIntron ; + flag = true ; + break ; + } + } + //printf( "%d\n", j ) ; + if ( flag ) + fprintf( fp, "Y\t%s\t%d\t%d\t%s\n", predIntrons[i].chrom, predIntrons[i].start, predIntrons[i].end, pred[ predIntrons[i].tindex ].tid ) ; + else + fprintf( fp, "N\t%s\t%d\t%d\t%s\n", predIntrons[i].chrom, predIntrons[i].start, predIntrons[i].end, pred[ predIntrons[i].tindex ].tid ) ; + } + fclose( fp ) ; + printf( "\n" ) ; + printf( "Intron evaluation\n") ; + //printf( "\tmatched\t%d\n", matchedIntron ) ; + printf( "\trecall\t%lf\t%d/%d\n", matchedIntron / (double)riCnt, matchedIntron, riCnt ) ; + printf( "\tprecision\t%lf\t%d/%d\n", matchedIntron / (double)piCnt, matchedIntron, piCnt ) ; + + /*======================================================================================== + Stage 4: Evaluations for exons. 
+ ========================================================================================*/ + struct _exon *refExons = ( struct _exon * )malloc( sizeof( struct _exon ) * MAX_TXPT ) ; + int reCnt = GetExons( ref, refCnt, refExons ) ; + struct _exon *predExons = ( struct _exon * )malloc( sizeof( struct _exon ) * MAX_TXPT ) ; + int peCnt = GetExons( pred, predCnt, predExons ) ; + int matchedExons = 0, matchedInternalExons = 0 ; + int refInternalExons = 0, predInternalExons = 0 ; + + for ( i = 0 ; i < reCnt ; ++i ) + { + //printf( "%d %d\n", refExons[i].start, refExons[i].end ) ; + if ( refExons[i].soft == 0 ) + ++refInternalExons ; + //if ( refExons[i].soft == 3 ) + // printf( "hi\n" ) ; + } + for ( i = 0 ; i < peCnt ; ++i ) + if ( predExons[i].soft == 0 ) + ++predInternalExons ; + tag = 0 ; + for ( i = 0 ; i < peCnt ; ++i ) + { + bool flag = false ; + while ( 1 ) + { + if ( tag >= reCnt ) + break ; + int tmp = strcmp( refExons[tag].chrom, predExons[i].chrom ) ; + if ( tmp < 0 || ( tmp == 0 && refExons[tag].end < predExons[i].start ) ) + ++tag ; + else + break ; + } + for ( j = tag ; j < reCnt ; ++j ) + { + if ( strcmp( refExons[j].chrom, predExons[i].chrom ) > 0 || + refExons[j].start > predExons[i].end /*+ 10*/ ) + break ; + if ( refExons[j].soft != predExons[i].soft + || refExons[j].end < predExons[i].start ) + continue ; + + if ( refExons[j].start == predExons[i].start && + refExons[j].end == predExons[i].end && predExons[i].soft == 0 ) + //if ( WithinRange( refIntrons[j].start, predIntrons[i].start ) && + // WithinRange( refIntrons[j].end, predIntrons[i].end ) ) + { + refExons[j].matched = true ; + predExons[i].matched = true ; + ++matchedInternalExons ; + flag = true ; + break ; + } + else if ( ( refExons[j].start == predExons[i].start || ( predExons[i].soft & 2 ) ) + && ( refExons[j].end == predExons[i].end || (predExons[i].soft & 1 ) ) ) + { + refExons[j].matched = true ; + predExons[i].matched = true ; + flag = true ; + //break ; + } + } + //printf( "%d\n", 
j ) ; + } + + printf( "\n" ) ; + printf( "Exon evaluation\n" ) ; + //printf( "\tmatched\t%d\n", matchedExons ) ; + matchedExons = 0 ; + for ( i = 0 ; i < reCnt ; ++i ) + if ( refExons[i].matched ) + ++matchedExons ; + printf( "\trecall\t%lf\t%d/%d\n", (double)matchedExons / reCnt, matchedExons, reCnt ) ; + matchedExons = 0 ; + for ( i = 0 ; i < peCnt ; ++i ) + if ( predExons[i].matched ) + ++matchedExons ; + printf( "\tprecision\t%lf\t%d/%d\n", (double)matchedExons / peCnt, matchedExons, peCnt ) ; + + printf( "\n" ) ; + printf( "Internal exon evaluation\n" ) ; + //printf( "\tmatched\t%d\n", matchedInternalExons ) ; + printf( "\trecall\t%lf\t%d/%d\n", (double)matchedInternalExons / refInternalExons, matchedInternalExons, refInternalExons ) ; + printf( "\tprecision\t%lf\t%d/%d\n", (double)matchedInternalExons / predInternalExons, matchedInternalExons, predInternalExons ) ; + return 0 ; +} diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/psiclass --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/psiclass Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,411 @@ +#!/usr/bin/env perl + +use strict ; +use warnings ; + +use Cwd 'cwd' ; +use Cwd 'abs_path' ; +use File::Basename; +use threads ; +use threads::shared ; + +die "Usage: ./psiclass [OPTIONS]\n". + "Required:\n". + "\t-b STRING: paths to the alignment BAM files. Use comma to separate multiple BAM files\n". + "\t\tor\n". + "\t--lb STRING: path to the file listing the alignments BAM files\n". + "Optional:\n". + "\t-s STRING: path to the trusted splice file (default: not used)\n". + "\t-o STRING: prefix of output files (default: ./psiclass)\n". + "\t-p INT: number of processes/threads (default: 1)\n". + "\t-c FLOAT: only use the subexons with classifier score <= than the given number (default: 0.05)\n". + "\t--sa FLOAT: the minimum average number of supported read for retained introns (default: 0.5)\n". + "\t--vd FLOAT : the minimum average coverage depth of a transcript to be reported (defaults: 1.0)\n". 
#!/usr/bin/env perl

# psiclass: driver script of the PsiCLASS pipeline. It runs the helper
# programs located next to this script in up to five stages (see --stage):
#   0  junc / trust-splice / FilterSplice.pl : splice sites for each BAM file
#   1  subexon-info                          : subexon file for each BAM file
#   2  combine-subexons                      : combine subexons across samples
#   3  classes                               : assemble transcripts per sample
#   4  vote-transcripts                      : consensus transcripts
# Each external command is echoed before it runs and the script dies as soon
# as one of them fails.

use strict ;
use warnings ;

use Cwd 'cwd' ;
use Cwd 'abs_path' ;
use File::Basename;
use threads ;
use threads::shared ;

die "Usage: ./psiclass [OPTIONS]\n".
	"Required:\n".
	"\t-b STRING: paths to the alignment BAM files. Use comma to separate multiple BAM files\n".
	"\t\tor\n".
	"\t--lb STRING: path to the file listing the alignments BAM files\n".
	"Optional:\n".
	"\t-s STRING: path to the trusted splice file (default: not used)\n".
	"\t-o STRING: prefix of output files (default: ./psiclass)\n".
	"\t-p INT: number of processes/threads (default: 1)\n".
	"\t-c FLOAT: only use the subexons with classifier score <= than the given number (default: 0.05)\n".
	"\t--sa FLOAT: the minimum average number of supported read for retained introns (default: 0.5)\n".
	"\t--vd FLOAT : the minimum average coverage depth of a transcript to be reported (defaults: 1.0)\n".
	"\t--maxDpConstraintSize: the number of subexons a constraint can cover in DP. (default: 7. -1 for inf)\n".
	"\t--bamGroup STRING: path to the file listing the group id of BAMs in the --lb file (default: not used)\n".
	"\t--primaryParalog: use primary alignment to retain paralog genes (default: use unique alignments)\n".
	#"\t--mateIdx INT: the read id has suffix such as .1, .2 for a mate pair. (default: auto)\n".
	"\t--version: print version and exit\n".
	"\t--stage INT: (default: 0)\n".
	"\t\t0-start from beginning - building splice sites for each sample\n".
	"\t\t1-start from building subexon files for each sample\n".
	"\t\t2-start from combining subexon files across samples\n".
	"\t\t3-start from assembling the transcripts for each sample\n".
	"\t\t4-start from voting the consensus transcripts across samples\n"
	if ( @ARGV == 0 ) ;

# Directory of this script; the helper programs are expected there.
my $WD = dirname( abs_path( $0 ) ) ;

my $i ;
my $cmd ;
my $prefix = "psiclass_" ;
my $numThreads = 1 ;

# Echo a shell command, run it, and stop the pipeline if it fails.
sub system_call
{
	print $_[0], "\n" ;
	system( $_[0] ) == 0 or die "Terminated\n" ;
}

# Process the arguments
my @bamFiles : shared ;
my $spliceFile = "" ;
my $bamFileList = "" ;
my $stage = 0 ;
my $classesOpt = "" ;
my $juncOpt = "" ;
my $trustSpliceOpt = "" ;
my $voteOpt = "" ;
my $outdir = "." ;
my $mateIdx = -1 ;
my $spliceAvgSupport = 0.5 ;
my $bamGroup = "" ;
for ( $i = 0 ; $i < @ARGV ; ++$i )
{
	if ( $ARGV[$i] eq "--lb" )
	{
		$bamFileList = $ARGV[$i + 1] ;
		open( FP1, "<", $bamFileList ) or die "Could not open file $bamFileList: $!\n" ;
		while ( <FP1> )
		{
			chomp ;
			push @bamFiles, $_ ;
		}
		close FP1 ;
		++$i ;
	}
	elsif ( $ARGV[$i] eq "-b" )
	{
		push @bamFiles, ( split /,/, $ARGV[$i + 1] ) ;
		++$i ;
	}
	elsif ( $ARGV[$i] eq "-s" )
	{
		$spliceFile = $ARGV[$i + 1] ;
		++$i ;
	}
	elsif ( $ARGV[$i] eq "-o" )
	{
		my $o = $ARGV[$i + 1] ;
		# Parse the -o option to get the folder and prefix
		if ( $o =~ /\// )
		{
			if ( substr( $o, -1 ) ne "/" ) # file prefix is in -o
			{
				my @cols = split /\/+/, $o ;
				$prefix = $cols[-1] ;
				$outdir = join( "/", @cols[0..($#cols-1)] ) ;
			}
			else # -o only specifies path
			{
				$outdir = substr( $o, 0, length( $o ) - 1 ) ;
			}
		}
		else
		{
			$prefix = $o ;
		}

		if ( substr( $prefix, -1 ) ne "_" )
		{
			$prefix .= "_" ;
		}

		++$i ;
	}
	elsif ( $ARGV[$i] eq "--stage" )
	{
		$stage = $ARGV[$i + 1] ;
		++$i ;
	}
	elsif ( $ARGV[ $i ] eq "-p" )
	{
		$numThreads = $ARGV[$i + 1] ;
		$classesOpt .= " -p $numThreads" ;
		++$i ;
	}
	elsif ( $ARGV[ $i ] eq "-c" )
	{
		$classesOpt .= " -c ".$ARGV[ $i + 1 ] ;
		++$i ;
	}
	elsif ( $ARGV[ $i ] eq "--mateIdx" )
	{
		$mateIdx = $ARGV[ $i + 1 ] ;
		if ( $mateIdx == 1 )
		{
			$classesOpt .= " --hasMateIdSuffix" ;
			$juncOpt .= " --hasMateIdSuffix" ;
		}
		++$i ;
	}
	elsif ( $ARGV[$i] eq "--bamGroup" )
	{
		$bamGroup = $ARGV[$i + 1] ;
		++$i ;
	}
	elsif ( $ARGV[ $i ] eq "--sa" )
	{
		$trustSpliceOpt .= " -a ".$ARGV[$i + 1] ;
		++$i ;
	}
	elsif ( $ARGV[ $i ] eq "--vd" )
	{
		$voteOpt .= " -d ".$ARGV[$i + 1] ;
		++$i ;
	}
	elsif ( $ARGV[$i] eq "--primaryParalog" )
	{
		$classesOpt .= " --primaryParalog" ;
	}
	elsif ( $ARGV[$i] eq "--maxDpConstraintSize" )
	{
		$classesOpt .= " --maxDpConstraintSize ".$ARGV[$i + 1] ;
		++$i ;
	}
	elsif ( $ARGV[$i] eq "--version" )
	{
		die "PsiCLASS v1.0.1\n" ;
	}
	else
	{
		die "Unknown argument: ", $ARGV[$i], "\n" ;
	}
}
if ( scalar( @bamFiles ) == 0 )
{
	# Both -b and --lb fill @bamFiles, so name both in the message.
	die "Must use option -b or --lb to specify the bam files.\n" ;
}

# Without an explicit --mateIdx, look at the first 1000 alignments of the first
# BAM file: if every read id ends in .1, .2, /1 or /2, assume the ids carry a
# mate index suffix.
if ( $mateIdx == -1 )
{
	open( FPsam, "$WD/samtools-0.1.19/samtools view ".$bamFiles[0]."| head -1000 |" )
		or die "Could not run samtools on ".$bamFiles[0].": $!\n" ;
	my $flag = 0 ;
	while ( <FPsam> )
	{
		my @cols = split /\t/ ;
		my $id = $cols[0] ;

		if ( !( $id =~ /[\.|\/][1|2]$/ ) )
		{
			$flag = 1 ;
			last ;
		}
	}
	close FPsam ;

	if ( $flag == 0 )
	{
		print "Found mate read id index suffix(.1 or /1). Calling \"--mateIdx 1\" option. If this is a false calling, please use \"--mateIdx 0\".\n" ;
		$classesOpt .= " --hasMateIdSuffix" ;
		$juncOpt .= " --hasMateIdSuffix" ;
	}
}

mkdir $outdir if ( !-d $outdir ) ;
mkdir "$outdir/splice" if ( !-d "$outdir/splice" ) ;
mkdir "$outdir/subexon" if ( !-d "$outdir/subexon" ) ;


my $threadLock : shared ;
my @sharedFiles : shared ;

# Start $numThreads workers running $worker->( $index ), index 0..$numThreads-1,
# and wait for all of them. The index is passed explicitly so a worker's slice
# of the BAM list does not depend on the ids Perl assigns to threads.
sub runInThreads
{
	my ( $worker ) = @_ ;
	my @workers ;
	for ( my $t = 0 ; $t < $numThreads ; ++$t )
	{
		push @workers, threads->create( $worker, $t ) ;
	}
	foreach my $w ( @workers )
	{
		$w->join() ;
	}
}

# Worker: run junc on every BAM file whose index is congruent to $tid.
sub threadRunSplice
{
	my ( $tid ) = @_ ;
	my $i ;
	for ( $i = 0 ; $i < scalar( @bamFiles ) ; ++$i )
	{
		next if ( ( $i % $numThreads ) != $tid ) ;
		system_call( "$WD/junc ".$bamFiles[$i]." -a $juncOpt > $outdir/splice/${prefix}bam_$i.raw_splice" ) ;
	}
}

# Generate the splice file for each bam file.
if ( $stage <= 0 )
{
	if ( $numThreads == 1 )
	{
		for ( $i = 0 ; $i < @bamFiles ; ++$i )
		{
			system_call( "$WD/junc ".$bamFiles[$i]." -a $juncOpt > $outdir/splice/${prefix}bam_$i.raw_splice" ) ;
		}
	}
	else
	{
		runInThreads( \&threadRunSplice ) ;
	}

	if ( $bamGroup eq "" || $spliceFile ne "" )
	{
		my $listFile = "$outdir/splice/${prefix}splice.list" ;
		open( FPls, ">", $listFile ) or die "Could not open file $listFile: $!\n" ;
		for ( $i = 0 ; $i < @bamFiles ; ++$i )
		{
			print FPls "$outdir/splice/${prefix}bam_$i.raw_splice\n" ;
		}
		close FPls ;

		if ( $spliceFile ne "" )
		{
			# A user-supplied trusted splice file filters every sample.
			for ( $i = 0 ; $i < @bamFiles ; ++$i )
			{
				system_call( "perl $WD/FilterSplice.pl $outdir/splice/${prefix}bam_$i.raw_splice $spliceFile > $outdir/splice/${prefix}bam_$i.splice" ) ;
			}
		}
		else
		{
			system_call( "$WD/trust-splice $outdir/splice/${prefix}splice.list ". $bamFiles[0] ." $trustSpliceOpt > $outdir/splice/${prefix}bam.trusted_splice" ) ;
			for ( $i = 0 ; $i < @bamFiles ; ++$i )
			{
				system_call( "perl $WD/FilterSplice.pl $outdir/splice/${prefix}bam_$i.raw_splice $outdir/splice/${prefix}bam.trusted_splice > $outdir/splice/${prefix}bam_$i.splice" ) ;
			}
		}
	}
	else
	{
		# One group name per line, in the same order as the BAM files; names
		# are mapped to consecutive group ids in order of first appearance.
		my @bamToGroupId ;
		my %groupNameToId ;
		open( FPbg, "<", $bamGroup ) or die "Could not open file $bamGroup: $!\n" ;
		my $groupUsed = 0 ;
		while ( <FPbg> )
		{
			chomp ;
			if ( !defined $groupNameToId{$_} )
			{
				$groupNameToId{$_} = $groupUsed ;
				++$groupUsed ;
			}
			push @bamToGroupId, $groupNameToId{$_} ;
		}
		close FPbg ;

		# Process each group one by one
		my $group ;
		for ( $group = 0 ; $group < $groupUsed ; ++$group )
		{
			my $listFile = "$outdir/splice/${prefix}splice_$group.list" ;
			open( FPls, ">", $listFile ) or die "Could not open file $listFile: $!\n" ;
			for ( $i = 0 ; $i < @bamFiles ; ++$i )
			{
				next if ( $bamToGroupId[$i] != $group ) ;
				print FPls "$outdir/splice/${prefix}bam_$i.raw_splice\n" ;
			}
			close FPls ;

			system_call( "$WD/trust-splice $outdir/splice/${prefix}splice_$group.list ". $bamFiles[0] ." $trustSpliceOpt > $outdir/splice/${prefix}bam_$group.trusted_splice" ) ;
			for ( $i = 0 ; $i < @bamFiles ; ++$i )
			{
				next if ( $bamToGroupId[$i] != $group ) ;
				system_call( "perl $WD/FilterSplice.pl $outdir/splice/${prefix}bam_$i.raw_splice $outdir/splice/${prefix}bam_$group.trusted_splice > $outdir/splice/${prefix}bam_$i.splice" ) ;
			}
		}
	}
}


# Get subexons from each bam file
# Worker: run subexon-info on every BAM file whose index is congruent to $tid.
sub threadRunSubexonInfo
{
	my ( $tid ) = @_ ;
	my $i ;
	for ( $i = 0 ; $i < scalar( @bamFiles ) ; ++$i )
	{
		next if ( ( $i % $numThreads ) != $tid ) ;
		system_call( "$WD/subexon-info ".$bamFiles[$i]." $outdir/splice/${prefix}bam_$i.splice > $outdir/subexon/${prefix}subexon_$i.out" ) ;
	}
}


if ( $stage <= 1 )
{
	if ( $numThreads == 1 )
	{
		for ( $i = 0 ; $i < @bamFiles ; ++$i )
		{
			system_call( "$WD/subexon-info ".$bamFiles[$i]." $outdir/splice/${prefix}bam_$i.splice > $outdir/subexon/${prefix}subexon_$i.out" ) ;
		}
	}
	else
	{
		runInThreads( \&threadRunSubexonInfo ) ;
	}

	my $listFile = "$outdir/subexon/${prefix}subexon.list" ;
	open( FPls, ">", $listFile ) or die "Could not open file $listFile: $!\n" ;
	for ( $i = 0 ; $i < @bamFiles ; ++$i )
	{
		print FPls "$outdir/subexon/${prefix}subexon_$i.out\n" ;
	}
	close FPls ;
}

# combine the subexons.
if ( $stage <= 2 )
{
	$cmd = "$WD/combine-subexons --ls $outdir/subexon/${prefix}subexon.list > $outdir/subexon/${prefix}subexon_combined.out" ;
	system_call( "$cmd" ) ;
}

# Run classes
if ( $stage <= 3 )
{
	# classes receives the prefix without its trailing underscore.
	my $trimPrefix = substr( $prefix, 0, -1 ) ;
	my $bamPath = "" ;
	if ( $bamFileList ne "" )
	{
		$bamPath = " --lb $bamFileList " ;
	}
	else
	{
		foreach my $b (@bamFiles)
		{
			$bamPath .= " -b $b " ;
		}
	}
	$cmd = "$WD/classes $classesOpt $bamPath -s $outdir/subexon/${prefix}subexon_combined.out -o $outdir/${trimPrefix} > $outdir/${prefix}classes.log" ;
	system_call( "$cmd" ) ;
}

# Run voting
if ( $stage <= 4 )
{
	my $gtfList = "$outdir/${prefix}gtf.list" ;
	open( FPgtflist, ">", $gtfList ) or die "Could not open file $gtfList: $!\n" ;
	for ( $i = 0 ; $i < @bamFiles ; ++$i )
	{
		print FPgtflist "$outdir/${prefix}sample_${i}.gtf\n" ;
	}
	close FPgtflist ;
	$cmd = "$WD/vote-transcripts --lg $outdir/${prefix}gtf.list $voteOpt > $outdir/${prefix}vote.gtf" ;
	system_call( "$cmd" ) ;
}
source codes +of SAMtools and various converters. + +Bob Handsaker from the Broad Institute is a major contributor to the +SAM/BAM specification. He designed and implemented the BGZF format, the +underlying indexable compression format for the BAM format. BGZF does +not support arithmetic between file offsets. + +Jue Ruan for the Beijing Genome Institute designed and implemented the +RAZF format, an alternative indexable compression format. RAZF supports +arithmetic between file offsets, at the cost of increased index file +size and the full compatibility with gzip. RAZF is optional and only +used in `faidx' for indexing RAZF compressed fasta files. + +Colin Hercus updated novo2sam.pl to support gapped alignment by +novoalign. + +Petr Danecek contributed the header parsing library sam_header.c and +sam2vcf.pl script and added knet support to the RAZF library. + diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/samtools-0.1.19/COPYING --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/COPYING Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,21 @@ +The MIT License + +Copyright (c) 2008-2009 Genome Research Ltd. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. \ No newline at end of file diff -r 000000000000 -r 903fc43d6227 PsiCLASS-1.0.2/samtools-0.1.19/ChangeLog.old --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/ChangeLog.old Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,3875 @@ +commit db2ad3e19068cbafde72ecde75d0638bbb3598ba +Author: Heng Li +Date: Thu Feb 16 14:45:17 2012 -0500 + + removed downsample.c + +commit 6c55c576903992c6fef148fe3b606fbc8bd10655 +Author: Heng Li +Date: Thu Feb 16 14:45:06 2012 -0500 + + print to output + +commit db1044a34e6049c87eaa63c39ed6e56f03e7d4c1 +Author: Heng Li +Date: Thu Feb 16 14:39:34 2012 -0500 + + removed sample + + Downsampling already exists in "view". View also keeps pairing while "sample" does not. + +commit ffdeed3e5d4a530bfdf6f9ba97fff0ba7add6cba +Merge: 2daad7b accf026 +Author: Heng Li +Date: Thu Feb 16 14:22:15 2012 -0500 + + Merge branch 'master' of github.com:lh3/samtools + +commit accf0260fd1117e10047344345d40b31a9ec31bb +Merge: 9134e0d c554160 +Author: Heng Li +Date: Thu Feb 16 11:21:14 2012 -0800 + + Merge pull request #8 from nh13/master + + Patches + +commit c554160df16ec7748cfdda4c7b54c641be7b809f +Author: Nils Homer +Date: Thu Feb 16 14:06:52 2012 -0500 + + * more README.md work + +commit 2a81ffe349208d917666808fbc9f3041e0cb57de +Author: Nils Homer +Date: Thu Feb 16 14:06:10 2012 -0500 + + * more README work + +commit fb3125f732715f62cded8685a23a002a96ce009b +Author: Nils Homer +Date: Thu Feb 16 14:05:19 2012 -0500 + + * more README work + +commit 444d41002c37e1c3d0f9208b4a88126c47276386 +Author: Nils Homer +Date: Thu Feb 16 14:02:13 2012 -0500 + + * updating README + +commit dec53cb1043fe7efadfde75fa2fd39b76de22e54 +Author: Nils Homer +Date: Thu Feb 16 13:55:01 2012 -0500 + 
+ updating the README for markdown syntax + +commit 798da18c346dca8ec6005582a0ddb1d5420b04ca +Author: Nils Homer +Date: Thu Feb 16 13:48:35 2012 -0500 + + adding a README with the current differences between this repository and + the official one + +commit 4d22d86c0f28636662f2144a88cd168e104c4275 +Author: Nils Homer +Date: Thu Feb 16 13:35:03 2012 -0500 + + adding "samtools sample" to the main + +commit 893c25a37c21005dc42f45d45e9ad78ddc5f29bb +Author: Nils Homer +Date: Thu Feb 16 13:33:51 2012 -0500 + + * removing some compile flags to work with OS X + +commit 7ac22f72fdc32edd5c24af6baebfa7db5faf8e7b +Author: Jonathan Manning +Date: Thu Feb 16 10:47:14 2012 -0500 + + Check write filehandle after opening for write. tamw/tamr is a union type, so change is only semantic. + + Signed-off-by: Nils Homer + +commit fef53330416631690f60fdff42b6e43d764170dc +Author: Jonathan Manning +Date: Thu Feb 16 10:44:59 2012 -0500 + + Catch and report invalid BAM header, instead of segfaulting later on. + + Signed-off-by: Nils Homer + +commit 5cc013fe4930bf9b6e7963aab1cd4a3c94f695bc +Author: Jonathan Manning +Date: Thu Feb 16 10:44:16 2012 -0500 + + Add downsample to examples. + + Signed-off-by: Nils Homer + +commit b3fa9e7071532905a81dc7aa48eadc24b8c8846b +Author: Jonathan Manning +Date: Thu Feb 16 10:43:48 2012 -0500 + + Adjust for leading hard clip on colorspace reads. + + Signed-off-by: Nils Homer + +commit 1a9296c1389469d1c1db5b8069f0e11ffcc8abb2 +Author: Jonathan Manning +Date: Thu Feb 16 10:42:52 2012 -0500 + + Add samtools sample command, contributed by Davide Cittaro . + + Signed-off-by: Nils Homer + +commit 2a804f3379748aeba944f1dec306dd726ff3235e +Author: Jonathan Manning +Date: Thu Feb 16 10:42:07 2012 -0500 + + Add samtools qa command, contributed by Roman Valls Guimera . 
+ + Signed-off-by: Nils Homer + +commit 0f3207fe8fd93e44d40fcf57204079c8c06d24a6 +Author: Jonathan Manning +Date: Thu Feb 16 10:39:08 2012 -0500 + + Makefile cleanup - allow CC, CFLAGS, LDFLAGS to be passed on make command line. Use LDFLAGS in samtools compile. + + Signed-off-by: Nils Homer + +commit 6e7df604025f6a86881bf7f4a16f30e15d31538a +Author: Jonathan Manning +Date: Thu Feb 16 10:31:15 2012 -0500 + + Allow max_mem for sort to be specified with units. + + Signed-off-by: Nils Homer + +commit f12ebcaf6e60d34180a27d70e09b743cef140b98 +Author: Jonathan Manning +Date: Thu Feb 16 10:29:11 2012 -0500 + + Allow user defined [lowercase] tags in header elements. + + Signed-off-by: Nils Homer + +commit 50b931fa3312dc109537a4260698ddecd0f06a05 +Author: Jonathan Manning +Date: Thu Feb 16 10:27:11 2012 -0500 + + Check lowerbound in text entry box to avoid segfault in tview. Remove redundant call to bam_aux_get. + + Signed-off-by: Nils Homer + +commit 5e729da5190949a813d20d329eab7ddb661816bd +Author: Nils Homer +Date: Thu Feb 16 10:31:48 2012 -0500 + + * fixing overflow/underflow in integer parsing + +commit fa50a4330b9abedaf07c26e13d31f05e57f1d319 +Author: Nils Homer +Date: Thu Feb 16 10:30:40 2012 -0500 + + * updating help message for samtools depth + +commit 79e52c9624b6dd3bdfdf439f4b4bc6f774c230a4 +Author: Nils Homer +Date: Thu Feb 16 10:29:32 2012 -0500 + + * adding support for outputting a circos histogram file in "samtools depth". Use + the "-c/-B" options. 
+ +commit 2daad7b52daa86561c0fb65fe366691fad9f5ed3 +Author: Heng Li +Date: Thu Feb 16 09:31:57 2012 -0500 + + bugfix: wrong SP; missing DV in the VCF hdr + +commit 9134e0d5047c281ef3bd53da91771d4814a5131c +Author: Heng Li +Date: Wed Feb 8 11:19:12 2012 -0500 + + missing support of DV + +commit 34ebf12078c1d1015a0b8b9a9221243a60b22893 +Author: Heng Li +Date: Wed Feb 8 11:08:56 2012 -0500 + + new BCF DV format: number of variant reads + +commit 9589d3312fa2d076f48bdd68e2a5edd419c8070c +Author: Heng Li +Date: Tue Jan 10 10:30:27 2012 -0500 + + scale depth to quality (hidden option) + +commit 704473e14668333ecaca5fb7b238af405c43e3b1 +Author: Heng Li +Date: Tue Jan 10 10:18:17 2012 -0500 + + really nothing + +commit 01b307fd287962372bbf07461c88b54f41636817 +Author: Heng Li +Date: Wed Dec 7 13:07:42 2011 -0500 + + added an example containing 'B' + +commit c678791f0451ceb9205c1ab5c52c84641863c99a +Author: Heng Li +Date: Sat Dec 3 12:10:30 2011 -0500 + + 'B' now moves backward w.r.t. the query + +commit 152119bc06a073933ca830e8e1407538e44626cc +Author: Heng Li +Date: Fri Dec 2 10:50:12 2011 -0500 + + better consensus; a little more robust + +commit 454da4754ac503edda5b1329b67757d797e46e07 +Author: Heng Li +Date: Fri Dec 2 00:20:22 2011 -0500 + + in pileup call remove_B() + +commit ff2bcac1cc078ba1879f18c89cfae314439d7086 +Author: Heng Li +Date: Fri Dec 2 00:17:32 2011 -0500 + + working on a few toy examples + +commit 745ca7260158d6df7897b52598033ffb055a9e4f +Author: Heng Li +Date: Thu Dec 1 22:55:39 2011 -0500 + + bam_remove_B(); not tested + +commit 07e4cdc7300abfcc82e03105b4689f95cab551cd +Author: Heng Li +Date: Thu Nov 10 12:58:55 2011 -0500 + + baseQ threshold on plain pileup; removed -E + +commit 322ebf2082dfa91df44b3a996d26c85357e5d5a2 +Author: Heng Li +Date: Wed Oct 19 09:28:04 2011 -0400 + + fixed two gcc warnings + +commit a632457b4c4adc50d833b56b5a5231feafaf8193 +Author: Heng Li +Date: Tue Oct 4 10:13:23 2011 -0400 + + change size_t to uint32_t in bam_header_t + 
+ This may cause issues on 64-bit big-endian machines. Reported and fixed by Paolo Emilio Mazzon. + +commit af31bf5a78aea03baf6eb90fe50076549d499f6e +Author: Heng Li +Date: Mon Sep 26 20:17:57 2011 -0400 + + rename pad2unpad to depad + +commit 77b198b73dfad1048e5d1c5a64aa75ee7b90f596 +Author: Heng Li +Date: Fri Sep 23 01:22:40 2011 -0400 + + convert padded BAM to unpadded BAM + +commit adb9e2342b7b7501d9527d3c23afab10469ae2c6 +Author: Heng Li +Date: Wed Sep 7 11:40:50 2011 -0400 + + generate template cigar with "fixmate" + +commit 46e5ab445a0fe880216cbc0daf1225725b569d7a +Author: Heng Li +Date: Fri Sep 2 12:50:18 2011 -0400 + + update kseq.h to the latest version + +commit 68e9e4a73eb91405bb3e56bf0cdaf12d1b487abb +Author: Heng Li +Date: Fri Sep 2 12:44:45 2011 -0400 + + Release samtools-0.1.18 + +commit aa06bdadb2d109a79f927f478102f96a1f5fd258 +Author: Heng Li +Date: Fri Sep 2 12:14:17 2011 -0400 + + updated the revision number + +commit 267e1e1b6e54c0ab24f94cd9aee9cbd2d1923f9f +Merge: 19ff1d3 aebab30 +Author: Heng Li +Date: Fri Sep 2 12:13:08 2011 -0400 + + Merge https://github.com/lh3/samtools into reduce + + Conflicts: + bam_md.c + + Fixed a few typos in the merge + +commit aebab302399c24eaa6c5ab79d13d6bd5e2e9ea9a +Merge: c2c63d0 da62663 +Author: Heng Li +Date: Fri Sep 2 09:03:49 2011 -0700 + + Merge pull request #4 from peterjc/x_equals2 + + Implement basic support for =/X CIGAR operations + +commit 19ff1d3d7f47d7e61b121292aefe5a74bb8a18d2 +Author: Heng Li +Date: Thu Aug 25 16:38:12 2011 -0400 + + reduce BAM size (experimental) + +commit da626630fd98fd4e07ceb4d58c5c9a42d312a85d +Author: peterjc +Date: Mon Aug 22 06:58:08 2011 +0100 + + Support =/X CIGAR operations (treated like M) + +commit 461d8003529db77a4d5ecbd108312e868b051a3d +Author: peterjc +Date: Mon Aug 22 05:52:56 2011 +0100 + + Define CIGAR equals and X operations (7 and 8) + +commit c2c63d067113baab41f3bc35fb28f4f00578accb +Merge: 7ab3ef3 9a0ed9a +Author: Heng Li +Date: Thu Aug 18 17:21:54 2011 
-0700 + + Merge pull request #3 from peterjc/x_equals + + Accept SAM files using = in CIGAR (treats X and = as M) + +commit 9a0ed9a6b85c7981465f459300208dbd93e3c6f5 +Author: peterjc +Date: Thu Aug 18 19:28:52 2011 +0100 + + Accept SAM files using = in CIGAR (treats X and = as M) + +commit 7ab3ef388c1eb34d7912fd70cc5656c955240263 +Author: Heng Li +Date: Mon Aug 8 10:22:22 2011 -0400 + + bugfix: indexing takes huge memory + + This happens when an unmapped mate has coordinate 1. Thank Joel Martin for the fix. + +commit a3f6738593e944354a8f75306687d8b3acf08bf1 +Merge: a8bdca9 bc67ea2 +Author: Heng Li +Date: Mon Aug 8 09:52:26 2011 -0400 + + Merge branch 'master' of github.com:lh3/samtools + +commit bc67ea225da653f36a70b38382d6111dd494f659 +Author: Petr Danecek +Date: Thu Jul 28 20:03:16 2011 +0100 + + Variant Distance Bias + +commit deb578f0c49d0b7d8c3bc6be220b4d67e2e7dfdf +Author: Petr Danecek +Date: Tue Jul 26 09:57:37 2011 +0100 + + If there is only one RG tag present in the header and reads are not annotated, don't refuse to work but use the tag instead. 
+ +commit a8bdca9cf482a637b89ee4f98469a93e0ab5e69b +Author: Heng Li +Date: Mon Jul 25 10:10:55 2011 -0400 + + bugfix: LRT2=nan + +commit 0afe33137d046a3e849eeb4a54590f27cbad4228 +Author: Heng Li +Date: Fri Jul 22 21:55:38 2011 -0400 + + fixed a bug/typo + +commit 62d5849658c10222d40308c6b53ab4f99a448494 +Author: Heng Li +Date: Fri Jul 15 16:04:19 2011 -0400 + + allow to set seed in subsampling + +commit 5f46243824cc9435b167973e1d51e13128794ea1 +Author: Heng Li +Date: Fri Jul 15 15:54:47 2011 -0400 + + support subsampling + +commit 5e55b6f34fc86cba7cf98d52ccaed405c3ffabbc +Author: Heng Li +Date: Fri Jul 15 15:53:38 2011 -0400 + + support indels + +commit f31c162926d6f43e8b60171789a258d02e1f9be5 +Author: Heng Li +Date: Thu Jul 7 17:02:33 2011 -0400 + + do not count indel with "view -Y" + +commit e412dae587883b4c17e5fbf4b7c33f38bfa8458a +Author: Heng Li +Date: Thu Jul 7 00:35:25 2011 -0400 + + for WIN32 compatibility + +commit 70a52501bcfa63824749893a5ab8ed3c38e34958 +Author: Heng Li +Date: Thu Jul 7 00:32:46 2011 -0400 + + for WIN32 compatibility + +commit 00438f14ed5984f08e8f7645a9b95644a812f969 +Author: Heng Li +Date: Wed Jul 6 23:41:45 2011 -0400 + + fixed an uninitialized variable + +commit 7609c4a01059c326544b3d0142dfe9c4229d68c6 +Author: Heng Li +Date: Wed Jul 6 23:39:31 2011 -0400 + + fixed an uninitialized variable + +commit cec7189a412f80ccb068a73bd28528915c16b0bf +Author: Heng Li +Date: Wed Jul 6 22:53:19 2011 -0400 + + Release samtools-0.1.17 + +commit 93c06a249de3bb666029bf07b66de5e8e5e314fa +Author: Heng Li +Date: Wed Jul 6 09:46:09 2011 -0400 + + bugfix: incorrect idxstats for the last seq + + Again, this bug is caused by 3rd-party code for the sorting order checking. 
+ +commit 84f6ca62db6e27b8c4c711e7b5f3ca704bf27b4f +Author: Heng Li +Date: Tue Jul 5 23:30:23 2011 -0400 + + output mapping quality in the old pileup format + +commit 362e05fd670886acaede69b864903d730b9db3ca +Author: Heng Li +Date: Tue Jul 5 21:59:22 2011 -0400 + + added a brief description of the VCF format + +commit e690a696468205e0cc4560016361c997660dd496 +Author: Heng Li +Date: Tue Jul 5 16:23:10 2011 -0400 + + improved samtools manual page + +commit 362b4a1408ef3c32311d638aa8d85ce39c1c7b2d +Author: Heng Li +Date: Tue Jul 5 15:58:29 2011 -0400 + + merge bcftools.1 to samtools.1 + +commit 643e0e61ba7266efbc9e5bfcb8e41f369ba2ce0a +Author: Heng Li +Date: Tue Jul 5 13:39:02 2011 -0400 + + mpileup: when region set, set reference properly + +commit 613e4d67624a94f62563935fbd5cc294df69605a +Author: Heng Li +Date: Mon Jul 4 23:29:02 2011 -0400 + + compute the min PL diff + +commit 5b7d5d3f52b97ca42c8500eede808dab88a46a53 +Author: Heng Li +Date: Mon Jul 4 22:57:48 2011 -0400 + + rename trio.c to mut.c + +commit 84fe96ad64b0365ead93a4115d1684b9bebb98fc +Author: Heng Li +Date: Sun Jul 3 15:38:51 2011 -0400 + + added pair caller interface; not tested + +commit 2f2867b87b84c35319cc416d6173819d5c8a4e8c +Author: Heng Li +Date: Sun Jul 3 15:24:23 2011 -0400 + + initial implementation of a pair caller + +commit e97653cf2ad653c95886933c42a2b5492ccab5ff +Author: Heng Li +Date: Sun Jul 3 00:06:28 2011 -0400 + + convert bam to single-end fastq + +commit e8013e11f7a8db0a8d18c60d130169cca39bf2bd +Author: Heng Li +Date: Sat Jul 2 14:39:18 2011 -0400 + + improve BED parsing + +commit 1025714325fdc636aeee47a76db8dafbbbfde64b +Author: Heng Li +Date: Fri Jul 1 14:19:54 2011 -0400 + + update the manual page + +commit 8022d0039dff47b1c11b2421357d510c1f28ae15 +Author: Heng Li +Date: Fri Jul 1 14:17:03 2011 -0400 + + output the best constrained genotypes in trio + +commit 18c87295e12f5bebafdcae00d52000fb94c8a566 +Author: Heng Li +Date: Fri Jul 1 11:18:14 2011 -0400 + + added documentations for 
view -T + +commit daf7a8d96bd495296bf7c7d99cddb808a3ced7d5 +Author: Heng Li +Date: Thu Jun 30 22:45:20 2011 -0400 + + fixed a bug in writing SP + +commit e5c32bf9b28c6e3e861db88de56b5dbe11058b61 +Author: Heng Li +Date: Thu Jun 30 22:35:25 2011 -0400 + + optionally output read positions in mpileup + +commit 1008051155ec994c1901e18f3eb03ea32a62e5d7 +Author: Heng Li +Date: Thu Jun 30 22:17:25 2011 -0400 + + make faidx works with <2GB lines + +commit 2daebb63762425dd3074ddf71582ad189001e394 +Author: Heng Li +Date: Thu Jun 30 17:28:58 2011 -0400 + + fixed an issue in the trio caller and the indel caller + +commit 9fdd52cf0716fb342a94946433d564b28b230835 +Author: Heng Li +Date: Thu Jun 30 13:34:01 2011 -0400 + + Added trio caller; NOT tested yet + +commit ea22a8ed83625e9c82382b56acc42a2d9cfd17e5 +Author: Heng Li +Date: Thu Jun 30 11:42:29 2011 -0400 + + convert PL to 10-likelihood GL + +commit 10d7065267b0d12c2bfcb6c70204fb6944cd395d +Author: Heng Li +Date: Thu Jun 30 10:49:05 2011 -0400 + + fix a compatibility issue with the new bcftools + +commit d340f01f609c61b719d38a6a55629a3fc899e1cd +Author: Heng Li +Date: Sun Jun 26 23:41:20 2011 -0400 + + allow to ignore RG + +commit d6321faf98ebfe899b9409fb23c90a4aa8c6b542 +Author: Heng Li +Date: Sun Jun 5 23:05:21 2011 -0400 + + fixed a bug in SO checking due to a recent change + +commit bc995abf666d0c9ab4258f6c1b3518a45a89209f +Author: Heng Li +Date: Fri Jun 3 14:45:36 2011 -0400 + + update the version number + +commit 9e7cd83a08383858d008e0ccb2238a2b93831d6c +Author: Heng Li +Date: Fri Jun 3 14:43:12 2011 -0400 + + smarter way to parse a region string + +commit e58a90a0fde54053dac65352b34c13c3fea815fc +Author: Heng Li +Date: Wed Jun 1 14:36:22 2011 -0400 + + output LRT2 instead of LRT1 + +commit 08f78c9af3e5661f04f80bef424232de721dba03 +Author: Heng Li +Date: Wed Jun 1 14:02:28 2011 -0400 + + genotype test, but assuming 1-degree + +commit 587b852340d7e60f6f7cf474a92ef77aeab46018 +Author: Heng Li +Date: Wed Jun 1 12:55:19 2011 
-0400 + + perform 2-degree test by default + +commit 3d38e403c5c830478b7eb157a484776997440501 +Author: Heng Li +Date: Wed Jun 1 12:44:34 2011 -0400 + + fixed a typo; but the result is still not good + +commit 06291624f7dcc57445676f3be25d0bc355dd7110 +Author: Heng Li +Date: Wed Jun 1 12:24:18 2011 -0400 + + fixed a typo + +commit 63b98aa33636b0d82a435bf49153c8c1502e7d42 +Author: Heng Li +Date: Wed Jun 1 12:23:37 2011 -0400 + + added HWE+F<0 filter + +commit 37d926e8999999b593d0637ab7dc379dbd3d6006 +Author: Heng Li +Date: Wed May 4 10:11:59 2011 -0400 + + improved sorting order checking in index + + Patches from Jonathan Manning + +commit 1c2dc6762c5f7cd946046b53346513f2f9761dbf +Author: Heng Li +Date: Tue May 3 23:09:05 2011 -0400 + + added r^2 estimate; added Brent's method + +commit c2d3bcd8f98e31668b5f1321222fbc6fd6336e75 +Author: Heng Li +Date: Sun May 1 23:45:23 2011 -0400 + + combine several utilities into vcfutils.lua + +commit be2e7362d7593ea4d03fb33cdb6af2aa096ca6c4 +Author: Heng Li +Date: Wed Apr 27 21:09:22 2011 -0400 + + minor warning + +commit 683ef0443860813d743cf84fa86dda9bfaf5445a +Author: Heng Li +Date: Wed Apr 27 10:10:38 2011 -0400 + + added versioning + +commit ed72f25ec85671f7646dbc92fa7b5b1dda427f7d +Author: Heng Li +Date: Wed Apr 27 10:04:02 2011 -0400 + + Output ML allele count + +commit 2a9e36d2d6c405b2411ca47458f028ada8fe1000 +Author: Heng Li +Date: Tue Apr 26 16:14:20 2011 -0400 + + use ar -s + +commit 7a4f54e6dbcd7c94acbb3f1050a93f94b8a07949 +Author: Heng Li +Date: Sat Apr 23 01:22:31 2011 -0400 + + added another type of LRT + +commit b9c5e84762a4aacce3a3771b51ea80967c79a2e5 +Author: Heng Li +Date: Fri Apr 22 16:00:31 2011 -0400 + + added version + +commit 8fad6677c5952efd67391581d64e67e02e7f6e68 +Author: Heng Li +Date: Fri Apr 22 00:30:19 2011 -0400 + + remove the pileup command + +commit 3a962fb6ebf779de70f9e6effb2d8701a9aa3dd9 +Author: Heng Li +Date: Thu Apr 21 23:10:45 2011 -0400 + + Release 0.1.16 (r963:234) + +commit 
b4d683cffbd98c43f05aff8610b37d63dd7e54aa +Author: Heng Li +Date: Thu Apr 21 12:44:44 2011 -0400 + + fixed a bug when coordinate-less reads are on the reverse strand + +commit c5ec45a128f409debc6a56a798024f53004037dc +Author: Heng Li +Date: Wed Apr 20 11:36:52 2011 -0400 + + added option '-f' to merge to avoid overwriting + +commit 68d431531370d24907c01a27f166f2341d7c4d35 +Author: Heng Li +Date: Wed Apr 20 10:26:58 2011 -0400 + + do not print a warning + +commit 32922607e51ad2260c337eb022b9e4aedacb049f +Author: Heng Li +Date: Wed Apr 20 10:21:06 2011 -0400 + + Added ldpair to compute LD between requested pairs + +commit b8d6fa71b91678fa02338257e0707d1e5ca098dd +Author: Heng Li +Date: Sun Apr 17 21:51:43 2011 -0400 + + On a toy sample, type "B" seems to be accepted + +commit 0e7ee9a6bb4029184202aa6e6738105ba0c0510b +Author: Heng Li +Date: Sun Apr 17 21:21:20 2011 -0400 + + added type "B"; not tested yet + +commit a513dfad0ac0062b03871eb6ecf26cb8d18dc895 +Author: Heng Li +Date: Sun Apr 17 19:25:54 2011 -0400 + + fixed a bug in bedidx.c: input BED not sorted + +commit de1e192bb0a8a762a54a6eee81d882fab01c3d32 +Author: Heng Li +Date: Sun Apr 17 18:51:08 2011 -0400 + + by default, always perform posterior chi^2 + +commit df6e0d1099895fc6cd7a19dc89fba95ed6654d35 +Author: Heng Li +Date: Sat Apr 16 12:33:28 2011 -0400 + + added debugging + +commit 8ce52e024dc2ef361dbd5399c232163055057e70 +Author: Heng Li +Date: Sat Apr 16 00:59:05 2011 -0400 + + avoid a segfault given wrong input + +commit e66b6684fc9a397f91ec29fdeecae9f8eb986a55 +Author: Heng Li