Mercurial > repos > lsong10 > psiclass
changeset 0:903fc43d6227 draft default tip
Uploaded
line wrap: on
line diff
// File: PsiCLASS-1.0.2/AddGeneName.cpp
//
// Reads an annotation GTF and a list of GTF files, maps each transcript_id /
// gene_id in the listed files onto a gene_name from the annotation (first by
// a shared intron, then by an overlapping exon), and writes a copy of every
// listed file with a gene_name attribute appended to each record.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#include <string>
#include <map>
#include <vector>
#include <algorithm>

char usage[] = "Usage: ./add-genename annotation.gtf gtflist [OPTIONS]\n"
		"\t-o: the directory for output gtf files (default: ./)\n" ;

// A feature on a chromosome. For exons start/end are the exon coordinates;
// for introns they hold the end of the left exon and the start of the right exon.
struct _interval
{
	char chrom[97] ;
	char geneName[97] ;

	int start, end ;
} ;

char line[10000] ;
char buffer[10000], buffer2[10000], buffer3[10000] ;

// Three-way comparison by (chrom, start, end). Returns <0, 0 or >0.
int CompInterval( const struct _interval &a, const struct _interval &b )
{
	int chromCmp = strcmp( a.chrom, b.chrom ) ;
	if ( chromCmp )
		return chromCmp ;
	else if ( a.start != b.start )
		return a.start - b.start ;
	else
		return a.end - b.end ;
}

// Strict-weak-ordering wrapper around CompInterval for std::sort.
bool Comp( const struct _interval &a, const struct _interval &b )
{
	return CompInterval( a, b ) < 0 ;
}

// Sorts the intervals and drops entries whose (chrom, start, end) duplicates
// the previously kept entry.
void SortAndCleanIntervals( std::vector<struct _interval> &it )
{
	std::sort( it.begin(), it.end(), Comp ) ;

	int i, k ;
	int size = it.size() ;
	if ( size == 0 )
		return ;

	k = 1 ;
	for ( i = 1 ; i < size ; ++i )
	{
		if ( CompInterval( it[k - 1], it[i] ) )
		{
			it[k] = it[i] ;
			++k ;
		}
	}
	it.resize( k ) ;
}

// Extracts the value of attribute `fid` from a GTF record.
//   ret:     destination buffer; receives the value without its quotes.
//   line:    the GTF record.
//   fid:     the attribute key to search for (substring match).
//   retSize: capacity of ret in bytes; the value is truncated to fit.
// Returns 1 when the key was found, 0 otherwise. The value is the
// whitespace-delimited token after the key, cut at its last double quote;
// when that token holds no quote the result is an empty string.
int GetGTFField( char *ret, char *line, const char *fid, size_t retSize = 97 )
{
	if ( retSize == 0 )
		return 0 ;

	const char *p = strstr( line, fid ) ;
	if ( p == NULL )
		return 0 ;

	// Move to the blank that separates the key from its value; stop at the
	// terminator so a key at the very end of the record cannot run off the buffer.
	while ( *p != '\0' && *p != ' ' )
		++p ;
	if ( *p == '\0' )
		return 0 ;

	// Skip the blank and the opening quote.
	++p ;
	if ( *p != '\0' )
		++p ;

	// Same token rule as sscanf's %s: skip leading whitespace, then read up to
	// the next whitespace.
	while ( *p == ' ' || *p == '\t' || *p == '\n' || *p == '\r' || *p == '\v' || *p == '\f' )
		++p ;
	const char *tokBegin = p ;
	while ( *p != '\0' && *p != ' ' && *p != '\t' && *p != '\n' && *p != '\r' && *p != '\v' && *p != '\f' )
		++p ;

	// Cut at the last quote of the token (empty result when there is none).
	const char *cut = p ;
	while ( cut != tokBegin && *cut != '\"' )
		--cut ;

	size_t len = (size_t)( cut - tokBegin ) ;
	if ( len >= retSize )
		len = retSize - 1 ;
	memcpy( ret, tokBegin, len ) ;
	ret[len] = '\0' ;
	return 1 ;
}

// Tries to assign an annotated gene name to one transcript.
//   transcript: the exons of the transcript, in file order.
//   tid:        the transcript id used as key in txptIdToAnnoId.
//   exonTag / intronTag: index in exons / introns where the search starts.
//   exons / introns: sorted annotation features.
// A transcript intron identical to an annotated intron wins; otherwise the
// first annotated exon overlapping any transcript exon is used. On success
// both maps are updated (the gene map is keyed by transcript[0].geneName).
void UpdateIdToAnnoId( std::vector<struct _interval> &transcript, char *tid, int exonTag, int intronTag, std::vector<struct _interval> &exons, std::vector<struct _interval> &introns,
		std::map<std::string, std::string> &txptIdToAnnoId, std::map<std::string, std::string> &geneIdToAnnoId )
{
	int i, k ;
	bool flag = false ;
	int ecnt = transcript.size() ;
	int intronSize = introns.size() ;
	int exonSize = exons.size() ;

	if ( ecnt == 0 )
		return ;

	// First, try whether intron works.
	struct _interval itron ;
	strcpy( itron.chrom, transcript[0].chrom ) ;
	itron.geneName[0] = '\0' ;
	k = intronTag ;
	for ( i = 0 ; i < ecnt - 1 && !flag ; ++i )
	{
		itron.start = transcript[i].end ;
		itron.end = transcript[i + 1].start ;
		for ( ; k < intronSize ; ++k )
		{
			if ( strcmp( introns[k].chrom, itron.chrom ) )
				break ;

			int cmp = CompInterval( introns[k], itron ) ;
			if ( cmp == 0 )
			{
				txptIdToAnnoId[ std::string( tid ) ] = introns[k].geneName ;
				geneIdToAnnoId[ std::string( transcript[0].geneName ) ] = introns[k].geneName ;
				flag = true ;
				break ;
			}
			else if ( cmp > 0 )
				break ;
		}
	}

	// Next, try whether exon works
	k = exonTag ;
	for ( i = 0 ; i < ecnt && !flag ; ++i )
	{
		for ( ; k < exonSize ; ++k )
		{
			if ( strcmp( exons[k].chrom, transcript[i].chrom ) )
				break ;

			if ( exons[k].end < transcript[i].start )
				continue ;
			else if ( exons[k].start > transcript[i].end )
				break ;
			else
			{
				txptIdToAnnoId[ std::string( tid ) ] = exons[k].geneName ;
				geneIdToAnnoId[ std::string( transcript[0].geneName ) ] = exons[k].geneName ;
				flag = true ;
				break ;
			}
		}
	}
}

// Opens a file or terminates the program with a message naming the path.
static FILE *OpenOrDie( const char *path, const char *mode )
{
	FILE *fp = fopen( path, mode ) ;
	if ( fp == NULL )
	{
		fprintf( stderr, "Could not open file %s\n", path ) ;
		exit( 1 ) ;
	}
	return fp ;
}

// Writes "<outputDir>/<name>" into out, where <name> is inputPath from its
// last '/' onward (the whole path when it has no '/').
static void BuildOutputPath( char *out, size_t outSize, const char *outputDir, const char *inputPath )
{
	const char *slash = strrchr( inputPath, '/' ) ;
	snprintf( out, outSize, "%s/%s", outputDir, slash != NULL ? slash : inputPath ) ;
}

// Index of the first feature of `chrom` in a sorted feature list, or 0 when
// the chromosome has no features.
static int ChromOffset( const std::map<std::string, int> &chromIdMap, const char *chrom )
{
	std::map<std::string, int>::const_iterator it = chromIdMap.find( std::string( chrom ) ) ;
	return it == chromIdMap.end() ? 0 : it->second ;
}

// Re-points the search offsets at the start of `chrom` whenever they are out
// of range or currently sit on a different chromosome. The exon offset is
// checked against the exon list and the exon index, and the intron offset
// against the intron list and the intron index.
static void SyncTagsToChrom( const char *chrom,
		const std::vector<struct _interval> &exons, const std::vector<struct _interval> &introns,
		const std::map<std::string, int> &chromIdMapExons, const std::map<std::string, int> &chromIdMapIntrons,
		int &exonTag, int &intronTag )
{
	if ( intronTag < 0 || intronTag >= (int)introns.size() || strcmp( chrom, introns[intronTag].chrom ) )
		intronTag = ChromOffset( chromIdMapIntrons, chrom ) ;
	if ( exonTag < 0 || exonTag >= (int)exons.size() || strcmp( chrom, exons[exonTag].chrom ) )
		exonTag = ChromOffset( chromIdMapExons, chrom ) ;
}

int main( int argc, char *argv[] )
{
	int i ;
	FILE *fp ;
	FILE *fpList ;
	char outputPath[1024] = "./" ;
	char prevTid[97] = "" ;

	std::vector<struct _interval> introns, exons ;
	std::map<std::string, std::string> txptIdToAnnoId, geneIdToAnnoId ;
	std::map<std::string, int> chromIdMapIntrons, chromIdMapExons ; // the index for the starting of a chrom.

	if ( argc < 3 )
	{
		fprintf( stderr, "%s", usage ) ;
		exit( 1 ) ;
	}

	for ( i = 3 ; i < argc ; ++i )
	{
		if ( !strcmp( argv[i], "-o" ) )
		{
			if ( i + 1 >= argc )
			{
				fprintf( stderr, "Option -o requires a directory.\n%s", usage ) ;
				exit( 1 ) ;
			}
			if ( strlen( argv[i + 1] ) >= sizeof( outputPath ) )
			{
				fprintf( stderr, "The output directory path is too long: %s\n", argv[i + 1] ) ;
				exit( 1 ) ;
			}
			strcpy( outputPath, argv[i + 1 ] ) ;
			++i ;
		}
		else
		{
			fprintf( stderr, "Unknown argument: %s\n", argv[i] ) ;
			exit( 1 ) ;
		}
	}

	// Get the exons, introns from the annotation.
	fp = OpenOrDie( argv[1], "r" ) ;
	while ( fgets( line, sizeof( line ), fp ) != NULL )
	{
		if ( line[0] == '#' )
			continue ;
		char tid[97] ;
		char gname[97] ;
		char chrom[97] ;
		char type[50] ;
		int start, end ;

		// Records that do not carry the five leading columns are skipped.
		if ( sscanf( line, "%96s %9999s %49s %d %d", chrom, buffer, type, &start, &end ) != 5 )
			continue ;
		if ( strcmp( type, "exon" ) )
			continue ;
		if ( GetGTFField( gname, line, "gene_name", sizeof( gname ) ) )
		{
			struct _interval ne ;
			strcpy( ne.chrom, chrom ) ;
			strcpy( ne.geneName, gname ) ;
			ne.start = start ; ne.end = end ;

			exons.push_back( ne ) ;
		}
		else
		{
			fprintf( stderr, "%s has no field of gene_name.\n", line ) ;
			exit( 1 ) ;
		}

		if ( GetGTFField( tid, line, "transcript_id", sizeof( tid ) ) )
		{
			if ( !strcmp( tid, prevTid ) )
			{
				// Same transcript as the previous exon record: the gap between
				// the two most recent exons is an intron.
				struct _interval ni ;

				strcpy( ni.chrom, chrom ) ;
				strcpy( ni.geneName, gname ) ;

				int size = exons.size() ;
				if ( size < 2 )
					continue ;

				struct _interval &e1 = exons[ size - 2 ] ;
				struct _interval &e2 = exons[ size - 1 ] ;
				if ( e1.start < e2.start )
				{
					ni.start = e1.end ;
					ni.end = e2.start ;
				}
				else
				{
					ni.start = e2.end ;
					ni.end = e1.start ;
				}

				introns.push_back( ni ) ;
			}
			else
				strcpy( prevTid, tid ) ;
		}
		else
		{
			fprintf( stderr, "%s has no field of transcript_id.\n", line ) ;
			exit( 1 ) ;
		}
	}
	fclose( fp ) ;
	SortAndCleanIntervals( exons ) ;
	SortAndCleanIntervals( introns ) ;
	int exonSize = exons.size() ;
	int intronSize = introns.size() ;

	// Determine the offset for each chrom on the list of features.
	if ( exonSize )
	{
		chromIdMapExons[ std::string( exons[0].chrom ) ] = 0 ;
		for ( i = 1 ; i < exonSize ; ++i )
		{
			if ( strcmp( exons[i].chrom, exons[i - 1].chrom ) )
				chromIdMapExons[ std::string( exons[i].chrom ) ] = i ;
		}
	}

	if ( intronSize )
	{
		chromIdMapIntrons[ std::string( introns[0].chrom ) ] = 0 ;
		for ( i = 1 ; i < intronSize ; ++i )
		{
			if ( strcmp( introns[i].chrom, introns[i - 1].chrom ) )
				chromIdMapIntrons[ std::string( introns[i].chrom ) ] = i ;
		}
	}

	// Go through all the GTF files to find the map between PsiCLASS's gene_id to annotation's gene name
	fpList = OpenOrDie( argv[2], "r" ) ;
	std::vector<struct _interval> transcript ;
	int exonTag = 0 ;
	int intronTag = 0 ;

	while ( fscanf( fpList, "%9999s", line ) == 1 )
	{
		// Set the output file
		BuildOutputPath( buffer, sizeof( buffer ), outputPath, line ) ;

		// Test whether this will overwrite the input gtf file.
		if ( realpath( buffer, buffer2 ) != NULL )
		{
			if ( realpath( line, buffer3 ) && !strcmp( buffer2, buffer3 ) )
			{
				fprintf( stderr, "Output will overwrite the input files. Please use -o to specify a different output directory.\n" ) ;
				exit( 1 ) ;
			}
		}

		fp = OpenOrDie( line, "r" ) ;
		transcript.resize( 0 ) ; // hold the exons in the transcript.
		prevTid[0] = '\0' ;
		exonTag = 0 ;
		intronTag = 0 ;
		int farthest = 0 ;

		while ( fgets( line, sizeof( line ), fp ) )
		{
			char tid[97] ;
			char chrom[97] ;
			char type[50] ;
			int start, end ;

			if ( line[0] == '#' )
				continue ;

			if ( sscanf( line, "%96s %9999s %49s %d %d", chrom, buffer, type, &start, &end ) != 5 )
				continue ;
			if ( strcmp( type, "exon" ) )
				continue ;

			if ( !GetGTFField( tid, line, "transcript_id", sizeof( tid ) ) )
			{
				fprintf( stderr, "Could not find transcript_id field in GTF file: %s", line ) ;
				exit( 1 ) ;
			}

			struct _interval ne ;
			strcpy( ne.chrom, chrom ) ;
			ne.start = start ;
			ne.end = end ;
			ne.geneName[0] = '\0' ; // stays empty when the record has no gene_id
			GetGTFField( ne.geneName, line, "gene_id", sizeof( ne.geneName ) ) ;

			if ( !strcmp( tid, prevTid ) )
			{
				transcript.push_back( ne ) ;
				if ( end > farthest )
					farthest = end ;
				continue ;
			}

			// A new transcript starts here; resolve the one just completed.
			if ( transcript.size() > 0 )
			{
				// the non-existed chrom will be put to offset 0, and that's fine
				SyncTagsToChrom( transcript[0].chrom, exons, introns, chromIdMapExons, chromIdMapIntrons, exonTag, intronTag ) ;

				UpdateIdToAnnoId( transcript, prevTid, exonTag, intronTag, exons, introns, txptIdToAnnoId, geneIdToAnnoId ) ;

				// Adjust the offset if we are done with a gene cluster
				// We don't need to worry about the case of changing chrom here.
				// NOTE(review): the test below fires when the gene_id is the SAME as
				// the finished transcript's, which reads oddly next to the comment
				// above - confirm the intended condition. It only moves the search
				// start, so it is left unchanged.
				if ( !strcmp( ne.geneName, transcript[0].geneName ) &&
					start > farthest ) // Use farthest to avoid inteleaved gene.
				{
					while ( intronTag < intronSize && !strcmp( introns[intronTag ].chrom, ne.chrom )
						&& introns[intronTag].end < ne.start )
						++intronTag ;

					while ( exonTag < exonSize && !strcmp( exons[ exonTag ].chrom, ne.chrom )
						&& exons[ exonTag ].end < ne.start )
						++exonTag ;

					farthest = end ;
				}
			}

			// Start collecting the new transcript with its first exon (this also
			// covers the very first transcript of the file).
			transcript.resize( 0 ) ;
			transcript.push_back( ne ) ;
			strcpy( prevTid, tid ) ;
		}

		if ( transcript.size() > 0 )
		{
			SyncTagsToChrom( transcript[0].chrom, exons, introns, chromIdMapExons, chromIdMapIntrons, exonTag, intronTag ) ;

			UpdateIdToAnnoId( transcript, prevTid, exonTag, intronTag, exons, introns, txptIdToAnnoId, geneIdToAnnoId ) ;
		}
		fclose( fp ) ;
	}
	fclose( fpList ) ;

	// Add the gene_name field.
	fpList = OpenOrDie( argv[2], "r" ) ;
	int novelCnt = 0 ;
	while ( fscanf( fpList, "%9999s", line ) == 1 )
	{
		FILE *fpOut ;
		// Set the output file
		BuildOutputPath( buffer, sizeof( buffer ), outputPath, line ) ;

		// Open the input first so that a missing input does not leave an
		// empty output file behind.
		fp = OpenOrDie( line, "r" ) ;
		fpOut = OpenOrDie( buffer, "w" ) ;

		transcript.resize( 0 ) ;
		prevTid[0] = '\0' ;
		while ( fgets( line, sizeof( line ), fp ) )
		{
			char gname[97] ;
			if ( line[0] == '#' )
			{
				fprintf( fpOut, "%s", line ) ;
				continue ;
			}

			// Prefer the name resolved for this exact transcript.
			if ( GetGTFField( buffer, line, "transcript_id" ) )
			{
				std::map<std::string, std::string>::const_iterator hit = txptIdToAnnoId.find( std::string( buffer ) ) ;
				if ( hit != txptIdToAnnoId.end() )
				{
					int len = strlen( line ) ;
					if ( len > 0 && line[len - 1] == '\n' )
						line[len - 1] = '\0' ;
					fprintf( fpOut, "%s gene_name \"%s\";\n", line, hit->second.c_str() ) ;
					continue ;
				}
			}

			// Otherwise fall back to the gene-level name, minting novel_<n> for
			// a gene_id seen for the first time.
			if ( GetGTFField( gname, line, "gene_id", sizeof( gname ) ) )
			{
				int len = strlen( line ) ;
				if ( len > 0 && line[len - 1] == '\n' )
					line[len - 1] = '\0' ;

				std::map<std::string, std::string>::const_iterator hit = geneIdToAnnoId.find( std::string( gname ) ) ;
				if ( hit != geneIdToAnnoId.end() )
				{
					fprintf( fpOut, "%s gene_name \"%s\";\n", line, hit->second.c_str() ) ;
				}
				else
				{
					sprintf( buffer, "novel_%d", novelCnt ) ;
					geneIdToAnnoId[ std::string( gname ) ] = std::string( buffer ) ;
					fprintf( fpOut, "%s gene_name \"%s\";\n", line, buffer ) ;
					++novelCnt ;
				}
			}
			else
				fprintf( fpOut, "%s", line ) ;
		}

		fclose( fp ) ;
		fclose( fpOut ) ;
	}
	fclose( fpList ) ;

	return 0 ;
}
// File: PsiCLASS-1.0.2/AddXS.cpp
//
// Reads a SAM stream on stdin and writes it to stdout, appending an XS:A
// strand tag to spliced alignments that lack one. The strand is taken from
// the splice-site dinucleotides in the reference FASTA; when no known motif
// is found the tag is XS:A:? together with a YS:i bit score describing why
// the alignment is suspicious.
#include <stdio.h>
#include <stdint.h>

#include <string.h>
#include <stdlib.h>
#include <string>
#include <vector>
#include <map>
#include <fstream>

char usage[] =
	"./addXS ref.fa [OPTIONS] < in.sam > out.sam\n"
	"From bam to bam: samtools view -h in.bam | ./addXS ref.fa | samtools view -bS - > out.bam\n" ;
char samLine[65537] ;
char seq[65537] ;

// Indexed by (upper-case letter - 'A'): 2-bit code of A/C/G/T, -1 otherwise.
// signed char so that -1 is representable where plain char is unsigned.
signed char nucToNum[26] = { 0, -1, 1, -1, -1, -1, 2,
	-1, -1, -1, -1, -1, -1, 0, // Regard 'N' as 'A'
	-1, -1, -1, -1, -1, 3,
	-1, -1, -1, -1, -1, -1 } ;
char numToNuc[26] = {'A', 'C', 'G', 'T'} ;

struct _pair
{
	int a, b ;
} ;

// 2-bit code of a base in either case, or -1 when the character is not a
// letter or is a letter without a code in nucToNum.
static int NucCode( char c )
{
	if ( c >= 'a' && c <= 'z' )
		c = c - 'a' + 'A' ;
	if ( c < 'A' || c > 'Z' )
		return -1 ;
	return nucToNum[ c - 'A' ] ;
}

// A nucleotide sequence packed 16 bases per 32-bit word (2 bits per base).
class BitSequence
{
private:
	int len ;
	std::vector<uint32_t> sequence ;

public:
	BitSequence() : len( 0 ) {}
	BitSequence( int l ) : len( 0 ), sequence( l / 16 + 1, 0 ) {}

	// Discards the content and reserves room for l bases.
	void SetLength( int l )
	{
		len = 0 ;
		sequence.assign( l / 16 + 1, 0 ) ;
	}

	int GetLength()
	{
		return len ;
	}

	// Appends one base, growing the storage when the reserved room is used up.
	void Append( char c )
	{
		const size_t word = (size_t)( len / 16 ) ;
		if ( word >= sequence.size() )
			sequence.resize( word + 1, 0 ) ;
		else if ( ( len & 15 ) == 0 )
			sequence[ word ] = 0 ;
		++len ;
		Set( c, len - 1 ) ;
	}

	// pos is 0-based coordinate
	// notice that the order within one 32 bit butcket is reversed
	// The bits are OR-ed in, so the slot must still be zero. Positions outside
	// [0, len) and non-letter characters are ignored.
	void Set( char c, int pos )
	{
		if ( pos < 0 || pos >= len )
			return ;

		if ( c >= 'a' && c <= 'z' )
		{
			c = c - 'a' + 'A' ;
		}
		if ( c == 'N' )
			c = 'A' ;
		if ( c < 'A' || c > 'Z' )
			return ;

		int ind = pos >> 4 ;
		int offset = pos & 15 ;
		// Shift in unsigned arithmetic: offset 15 reaches the top bit of the word.
		uint32_t mask = ( (uint32_t)( nucToNum[c - 'A'] & 3 ) ) << ( 2 * offset ) ;
		sequence[ind] = sequence[ind] | mask ;
	}

	// Returns the base at the 0-based position, or 'N' outside [0, len).
	char Get( int pos )
	{
		if ( pos < 0 || pos >= len )
			return 'N' ;

		int ind = pos >> 4 ;
		int offset = pos & 15 ;
		return numToNuc[ ( ( sequence[ind] >> ( 2 * offset ) ) & 3 ) ] ;
	}

	void Print()
	{
		int i ;
		for ( i = 0 ; i < len ; ++i )
			printf( "%c", Get( i ) ) ;
		printf( "\n" ) ;
	}

	// Prints [start, end]; with rc set, prints the reverse complement instead.
	void Print( FILE *fp, int start, int end, bool rc )
	{
		if ( !rc )
		{
			for ( int i = start ; i <= end ; ++i )
				fprintf( fp, "%c", Get( i ) ) ;
		}
		else
		{
			for ( int i = end ; i >= start ; --i )
			{
				char c = Get( i ) ;
				if ( c == 'A' )
					c = 'T' ;
				else if ( c == 'C' )
					c = 'G' ;
				else if ( c == 'G' )
					c = 'C' ;
				else //if ( c == 'T' )
					c = 'A' ;
				fprintf( fp, "%c", c ) ;
			}
		}
	}
} ;

// Complement of one base; characters without a code become 'N'.
static char ComplementBase( char c )
{
	const int code = NucCode( c ) ;
	return code < 0 ? 'N' : numToNuc[ 3 - code ] ;
}

// Reverse-complements the first l characters of s in place.
void ReverseComplement( char *s, int l )
{
	int u = 0, v = l - 1 ;
	for ( ; u < v ; ++u, --v )
	{
		const char left = s[u] ;
		s[u] = ComplementBase( s[v] ) ;
		s[v] = ComplementBase( left ) ;
	}
	// The middle base of an odd-length sequence is complemented exactly once.
	if ( u == v )
		s[u] = ComplementBase( s[u] ) ;
}

// Sequence name of a FASTA header line: the text after '>' up to the first
// blank, tab or carriage return.
static std::string FastaName( const std::string &header )
{
	size_t i = 1 ;
	while ( i < header.length() && header[i] != ' ' && header[i] != '\t' && header[i] != '\r' )
		++i ;
	return header.substr( 1, i - 1 ) ;
}

// 8-bit code of the four bases flanking an intron: the two bases after the
// 1-based exon end leftEnd and the two bases before the 1-based exon start
// rightStart.
static int SpliceMotif( BitSequence &chrSeq, int leftEnd, int rightStart )
{
	char m[4] ;
	m[0] = chrSeq.Get( leftEnd + 1 - 1 ) ;
	m[1] = chrSeq.Get( leftEnd + 2 - 1 ) ;
	m[2] = chrSeq.Get( rightStart - 2 - 1 ) ;
	m[3] = chrSeq.Get( rightStart - 1 - 1 ) ;
	int motif = 0 ;
	for ( int j = 0 ; j < 4 ; ++j )
		motif = ( motif << 2 ) + ( nucToNum[ m[j] - 'A' ] & 3 ) ;
	return motif ;
}


int main( int argc, char *argv[] )
{
	int i, j, k ;

	std::vector<BitSequence> genome ;
	std::map<std::string, int> chrToChrId ;

	// Every token of a SAM line fits in a buffer as large as the line buffer.
	// (fgets hands over lines longer than samLine in several pieces.)
	static char readid[65537], chrom[65537], cigar[65537], mateChrom[65537] ;
	char mapq[10] ;
	int start, mstart, flag, tlen ; // read start and mate read start
	int seqLen ;
	int chrCnt = 0 ;
	int chrId = -1 ;
	const int width = 7 ;
	int score ; // 0-bit: multiple aligned, 1-bit can shift the intron, 2-bit can shift the alignment, 3-bit contain sequencing error

	if ( argc < 2 )
	{
		printf( "%s", usage ) ;
		return 0 ;
	}

	std::ifstream fpRef ;
	fpRef.open( argv[1] ) ;
	if ( !fpRef.is_open() )
	{
		fprintf( stderr, "Could not open reference file %s\n", argv[1] ) ;
		return 1 ;
	}
	std::string line ;

	// motifStrand[code] is 1 for a plus-strand motif, 0 for a minus-strand
	// motif and -1 for anything else.
	int motifStrand[1024] ;
	memset( motifStrand, -1, sizeof( motifStrand )) ;
	motifStrand[ 0xb2 ] = 1 ; // GT/AG
	motifStrand[ 0x71 ] = 0 ; // CT/AC
	motifStrand[ 0x92 ] = 1 ;// GC/AG
	motifStrand[ 0x79 ] = 0 ; // CT/GC
	motifStrand[ 0x31 ] = 1 ; // AT/AC
	motifStrand[ 0xb3 ] = 0 ; // GT/AT

	// First pass over the FASTA: register the sequences and size their storage.
	k = 0 ;
	while ( getline( fpRef, line ) )
	{
		int len = line.length() ;
		if ( line[0] == '>' )
		{
			if ( chrCnt > 0 )
			{
				genome[ chrCnt - 1 ].SetLength( k ) ;
			}
			chrToChrId[ FastaName( line ) ] = chrCnt ;
			++chrCnt ;

			BitSequence bs ;
			genome.push_back( bs ) ;

			k = 0 ;
		}
		else
		{
			k += len ;
		}
	}
	if ( chrCnt > 0 )
		genome[ chrCnt - 1 ].SetLength( k ) ;
	fpRef.close() ;

	// Second pass: store the bases.
	fpRef.clear() ;
	fpRef.open( argv[1] ) ;
	while ( getline( fpRef, line ) )
	{
		int len = line.length() ;
		if ( line[0] == '>' )
		{
			chrId = chrToChrId[ FastaName( line ) ] ;
		}
		else if ( chrId >= 0 ) // ignore sequence text that precedes the first header
		{
			BitSequence &bs = genome[chrId] ;
			for ( i = 0 ; i < len ; ++i )
			{
				if ( ( line[i] >= 'A' && line[i] <= 'Z' ) ||
					( line[i] >= 'a' && line[i] <= 'z' ) )
					bs.Append( line[i] ) ;
			}
		}
	}
	fpRef.close() ;

	std::vector<struct _pair> segments ;
	std::vector<struct _pair> seqSegments ; // the segments with respect to the sequence.
	std::string refBuf ;
	std::string pattern ;

	while ( fgets( samLine, sizeof( samLine ), stdin ) != NULL )
	{
		if ( samLine[0] == '@' )
		{
			printf( "%s", samLine ) ;
			continue ;
		}

		// Lines without the ten leading SAM fields are passed through untouched.
		if ( sscanf( samLine, "%65536s %d %65536s %d %9s %65536s %65536s %d %d %65536s", readid, &flag, chrom, &start, mapq, cigar, mateChrom, &mstart, &tlen, seq ) != 10 )
		{
			printf( "%s", samLine ) ;
			continue ;
		}

		int num = 0 ;
		int len = 0 ;

		score = 0 ;

		// Split the alignment into reference segments at every N operation.
		// I, S, H and P do not advance on the reference.
		segments.clear() ;
		for ( i = 0 ; cigar[i] ; ++i )
		{
			if ( cigar[i] >= '0' && cigar[i] <= '9' )
				num = num * 10 + cigar[i] - '0' ;
			else if ( cigar[i] == 'I' || cigar[i] == 'S' || cigar[i] == 'H'
				|| cigar[i] == 'P' )
			{
				num = 0 ;
			}
			else if ( cigar[i] == 'N' )
			{
				struct _pair seg ;
				seg.a = start ;
				seg.b = start + len - 1 ;
				segments.push_back( seg ) ;

				start = start + len + num ;
				len = 0 ;
				num = 0 ;
			}
			else
			{
				len += num ;
				num = 0 ;
			}
		}

		if ( len > 0 )
		{
			struct _pair seg ;
			seg.a = start ;
			seg.b = start + len - 1 ;
			segments.push_back( seg ) ;
		}
		const int segCnt = (int)segments.size() ;

		if ( segCnt <= 1 || strstr( samLine, "XS:A:" ) != NULL )
		{
			printf( "%s", samLine ) ;
			continue ;
		}

		// A reference name that is absent from the FASTA gives nothing to
		// inspect; pass the record through instead of reading another sequence.
		std::map<std::string, int>::const_iterator chrIt = chrToChrId.find( std::string( chrom ) ) ;
		if ( chrIt == chrToChrId.end() )
		{
			printf( "%s", samLine ) ;
			continue ;
		}
		BitSequence &chrSeq = genome[ chrIt->second ] ;
		int strand = -1 ;

		// Drop the line break only when there is one.
		len = strlen( samLine ) ;
		if ( len > 0 && samLine[len - 1] == '\n' )
			samLine[len - 1] = '\0' ;

		for ( i = 0 ; i < segCnt - 1 ; ++i )
		{
			strand = motifStrand[ SpliceMotif( chrSeq, segments[i].b, segments[i + 1].a ) ] ;
			if ( strand != -1 )
				break ;
		}
		if ( strand == 1 )
		{
			printf( "%s\tXS:A:+\n", samLine ) ;
			continue ;
		}
		else if ( strand == 0 )
		{
			printf( "%s\tXS:A:-\n", samLine ) ;
			continue ;
		}

		char *p ;
		if ( ( p = strstr( samLine, "NH:i:" ) ) != NULL )
		{
			p += 5 ;
			num = 0 ;
			while ( *p >= '0' && *p <= '9' )
			{
				num = num * 10 + *p - '0' ;
				++p ;
			}
			if ( num > 1 )
				score |= 1 ;
		}

		seqLen = strlen( seq ) ;
		for ( i = 0 ; i < seqLen ; ++i )
			if ( seq[i] == 'N' )
			{
				score |= 1 ;
				break ;
			}

		// The same split expressed in 0-based read coordinates. D, H and P do
		// not advance on the read; a leading soft clip does, because SEQ
		// contains the clipped bases.
		num = 0 ;
		len = 0 ;
		start = 0 ;
		seqSegments.clear() ;
		for ( i = 0 ; cigar[i] ; ++i )
		{
			if ( cigar[i] >= '0' && cigar[i] <= '9' )
				num = num * 10 + cigar[i] - '0' ;
			else if ( cigar[i] == 'S' )
			{
				if ( seqSegments.empty() && len == 0 )
					start += num ;
				num = 0 ;
			}
			else if ( cigar[i] == 'D' || cigar[i] == 'H' || cigar[i] == 'P' )
			{
				num = 0 ;
			}
			else if ( cigar[i] == 'N' )
			{
				struct _pair seg ;
				seg.a = start ;
				seg.b = start + len - 1 ;
				seqSegments.push_back( seg ) ;

				start = start + len ;
				len = 0 ;
				num = 0 ;
			}
			else
			{
				len += num ;
				num = 0 ;
			}
		}
		if ( len > 0 )
		{
			struct _pair seg ;
			seg.a = start ;
			seg.b = start + len - 1 ;
			seqSegments.push_back( seg ) ;
		}
		const int seqSegCnt = (int)seqSegments.size() ;

		// Check whether we can shift the intron to a canonical splice site
		bool shifted = false ;
		for ( i = 0 ; i < segCnt - 1 && !shifted ; ++i )
		{
			for ( j = -width ; j < width ; ++j )
			{
				if ( segments[i].b + j < segments[i].a || segments[i+1].a + j > segments[i+1].b )
					continue ;
				// Look for the splice signal.
				strand = motifStrand[ SpliceMotif( chrSeq, segments[i].b + j, segments[i + 1].a + j ) ] ;
				if ( strand == -1 )
					continue ;

				// We found a signal, then test whether we can shift the intron.
				// Extract the sequence from the reference genome.
				refBuf.clear() ;
				if ( j < 0 )
					for ( k = segments[i + 1].a + j ; k <= segments[i + 1].a - 1 ; ++k )
						refBuf.push_back( chrSeq.Get( k - 1 ) ) ;
				else
					for ( k = segments[i].b + 1 ; k <= segments[i].b + j ; ++k )
						refBuf.push_back( chrSeq.Get( k - 1 ) ) ;
				const int l = (int)refBuf.size() ;

				// Get the coordinate on the sequence.
				// For j < 0 the bases come from the end of read segment i,
				// otherwise from the beginning of read segment i + 1.
				const int tag = ( j < 0 ) ? i : i + 1 ;
				const bool useBegin = !( j < 0 ) ;

				//TODO: it seems star already fliped the sequence for reads with
				// flag 0x10. is it true for other aligners?

				// The reference and read splits can disagree for unusual CIGARs.
				if ( tag >= seqSegCnt )
					continue ;

				int from, to ;
				if ( useBegin )
				{
					from = seqSegments[tag].a ;
					to = seqSegments[tag].a + l - 1 ;
				}
				else
				{
					from = seqSegments[tag].b - l + 1 ;
					to = seqSegments[tag].b ;
				}
				// Never read outside the stored read sequence.
				if ( from < 0 || to >= seqLen )
					continue ;

				for ( k = 0 ; k < l ; ++k )
				{
					if ( seq[from + k] != refBuf[k] )
						break ;
				}

				if ( k >= l )
				{
					// We can shift the intron
					score |= 2 ;
					shifted = true ;
					break ;
				}
			}
		}

		// Check whether the sequence is low-complexity. To do this, we test whether we can shift the seqSegments and directly inspect the sequence.
		for ( i = 0 ; i < seqSegCnt ; ++i )
		{
			// NOTE(review): seqSegments holds read coordinates, yet they index the
			// chromosome here; the reference coordinates in `segments` look like
			// the intended window - confirm before changing, since it alters YS.
			refBuf.clear() ;
			for ( j = seqSegments[i].a - width ; j <= seqSegments[i].b + width ; ++j )
				refBuf.push_back( chrSeq.Get( j ) ) ;

			pattern.clear() ;
			for ( j = seqSegments[i].a ; j <= seqSegments[i].b && j < seqLen ; ++j )
			{
				if ( j >= 0 )
					pattern.push_back( seq[j] ) ;
			}
			const int l = (int)pattern.size() ;
			if ( l == 0 )
				continue ;

			// It seems the aligner already flipped the read in its output?

			// Count the (possibly overlapping) occurrences of the read segment.
			int cnt = 0 ;
			size_t at = 0 ;
			while ( ( at = refBuf.find( pattern, at ) ) != std::string::npos )
			{
				++cnt ;
				++at ;
			}
			if ( cnt > 1 )
			{
				score |= 4 ;
				break ;
			}

			// A segment dominated by a single base is low-complexity as well.
			// Slot 4 collects characters that have no base code.
			int count[5] = { 0, 0, 0, 0, 0 } ;
			int max = 0 ;
			for ( j = 0 ; j < l ; ++j )
			{
				const int code = NucCode( pattern[j] ) ;
				++count[ code < 0 ? 4 : code ] ;
			}

			for ( j = 0 ; j < 5 ; ++j )
			{
				if ( count[j] > max )
					max = count[j] ;
			}
			if ( max > 0.8 * l )
			{
				score |= 4 ;
				break ;
			}
		}

		if ( ( p = strstr( samLine, "NM:i:" ) ) != NULL || ( p = strstr( samLine, "nM:i:" ) ) != NULL )
		{
			int nm = atoi( p + 5 ) ;
			if ( nm > 0 )
			{
				score |= 8 ;
			}
		}
		else
		{
			// TODO: check the nm by ourself
		}

		// Check whether the alignment is concordant.
		if ( ( flag & 0x1 ) != 0 && ( flag & 0x4 ) == 0 )
		{
			start = segments[0].a ;
			if ( strcmp( mateChrom, "=" )
				|| ( flag & 0x10 ) == ( flag & 0x20 )
				|| ( ( flag & 0x10 ) == 0 && ( mstart < start || mstart > start + 2000000 ) )
				|| ( ( flag & 0x10 ) != 0 && ( mstart > start || mstart < start - 2000000 ) ) )
			{
				score |= 16 ;
			}
		}

		printf( "%s\tXS:A:?\tYS:i:%d\n", samLine, score ) ;
	}

	return 0 ;
}
// File: PsiCLASS-1.0.2/BitTable.hpp
/**
  Use an 64bit integer to represent bit table.

  Li Song
  July 24, 2012

  - Modified again from May 27, 2017.
*/


#ifndef _BIT_TABLE_HEADER
#define _BIT_TABLE_HEADER

#include <stdio.h>

#include <vector>

typedef unsigned long long int UINT64 ;
#define UNIT_SIZE (sizeof( UINT64 ) * 8 )
#define UNIT_MASK ((UINT64)63)

// A fixed-size table of bits stored in 64-bit words.
//
// The destructor deliberately does not free the storage: copies of a table
// share the same array, and the owner calls Release() by hand.
class BitTable
{
private:
	UINT64 *tab ; // The bits
	int size ; // The size of the table
	int asize ; // The size of the array (size/64).

	// Allocates zeroed storage for s bits; s == 0 is treated as one full word.
	void Allocate( int s )
	{
		if ( s == 0 )
			s = UNIT_SIZE ;

		if ( s & UNIT_MASK )
			asize = (int)( s / UNIT_SIZE + 1 ) ;
		else
			asize = (int)( s / UNIT_SIZE ) ;
		tab = new UINT64[ asize ] ;

		size = s ;
		Reset() ;
	}
public:
	BitTable() // Intialize a bit table.
	{
		tab = NULL ;
		size = 0 ;
		asize = 0 ; // so that the word loops are no-ops on an empty table
	}

	BitTable( int s ) // Initalize a bit table with size s.
	{
		tab = NULL ;
		Allocate( s ) ;
	}

	~BitTable()
	{
		// Manually release the memory.
	}

	void Init( int s ) // Initialize a bit table with size s.
	{
		if ( tab != NULL )
			delete [] tab ;
		Allocate( s ) ;
	}

	void Release()
	{
		if ( tab != NULL )
			delete[] tab ;
		tab = NULL ;
	}

	void Reset() // Make every value 0.
	{
		for ( int i = 0 ; i < asize ; ++i )
			tab[i] = 0 ;
	}

	void Set( int i ) // Set the ith bit.
	{
		int ind, offset ;
		ind = i / UNIT_SIZE ;
		offset = i & UNIT_MASK ;
		tab[ind] |= ( (UINT64)1 << offset ) ;
	}

	void Unset( int i ) // Unset the ith bit
	{
		int ind, offset ;
		ind = i / UNIT_SIZE ;
		offset = i & UNIT_MASK ;

		tab[ind] &= ~( (UINT64)1 << offset ) ;
	}

	void Flip( int i ) // Flip the ith bit. Same as xor 1.
	{
		int ind, offset ;
		ind = i / UNIT_SIZE ;
		offset = i & UNIT_MASK ;

		tab[ind] ^= ( (UINT64)1 << offset ) ;
	}

	bool Test( unsigned int i ) const // Test the ith bit.
	{
		if ( i >= (unsigned int)size )
			return false ;

		unsigned int ind, offset ;
		ind = i / UNIT_SIZE ;
		offset = i & UNIT_MASK ;

		return ( tab[ind] & ( (UINT64)1 << offset ) ) != 0 ;
	}

	void Not() // Not all the bits.
	{
		int i ;
		for ( i = 0 ; i < asize ; ++i )
		{
			tab[i] = ~tab[i] ;
		}
	}

	void And( const BitTable &in ) // Do the "and" on each bits.
	{
		if ( asize != in.asize )
			return ;
		int i ;
		for ( i = 0 ; i < asize ; ++i )
			tab[i] &= in.tab[i] ;
	}

	void Or( const BitTable &in ) // Do the "or" on each bits.
	{
		if ( asize != in.asize )
			return ;
		int i ;
		for ( i = 0 ; i < asize ; ++i )
			tab[i] |= in.tab[i] ;
	}

	void Xor( const BitTable &in )
	{
		if ( asize != in.asize )
			return ;
		int i ;
		for ( i = 0 ; i < asize ; ++i )
			tab[i] ^= in.tab[i] ;
	}

	// Unset all the bits outside [s,e]
	void MaskRegionOutside( unsigned int s, unsigned int e )
	{
		if ( tab == NULL || size <= 0 )
			return ;
		// The kept region starts beyond the table: nothing survives.
		if ( s >= (unsigned int)size )
		{
			Reset() ;
			return ;
		}

		int i ;
		// mask out [0, s-1].
		int ind, offset ;
		if ( s > 0 )
		{
			ind = (s - 1) / UNIT_SIZE ;
			offset = ( s - 1 ) & UNIT_MASK ;
			for ( i = 0 ; i <= ind - 1 ; ++i )
				tab[i] = 0 ;
			// A shift by 64 is undefined, so a fully masked word is cleared directly.
			if ( offset + 1 >= 64 )
				tab[i] = 0 ;
			else
				tab[i] = ( tab[i] >> (UINT64)( offset + 1 ) ) << (UINT64)( offset + 1 ) ;
		}

		// mask out [e+1, size-1]
		if ( e < ( (unsigned int)size - 1 ) )
		{
			ind = ( e + 1 ) / UNIT_SIZE ;
			offset = ( e + 1 ) & UNIT_MASK ;
			for ( i = ind + 1 ; i < asize ; ++i )
				tab[i] = 0 ;

			if ( UNIT_SIZE - offset >= 64 )
				tab[ind] = 0 ;
			else
				tab[ind] = ( tab[ind] << (UINT64)( UNIT_SIZE - offset ) ) >> (UINT64)( UNIT_SIZE - offset ) ;
		}
	}

	// Given further information about the position of first and last 1's
	// Same as MaskRegionOutside, but only the words between the one holding
	// bit `first` and the one holding bit `last` are visited.
	void MaskRegionOutsideInRange( int s, int e, int first, int last )
	{
		if ( tab == NULL || size <= 0 )
			return ;

		int i ;
		int start, to ;
		start = first / UNIT_SIZE ;
		to = last / UNIT_SIZE ;
		// mask out [0, s-1].
		int ind, offset ;
		if ( s > 0 )
		{
			ind = (s - 1) / UNIT_SIZE ;
			offset = ( s - 1 ) & UNIT_MASK ;
			for ( i = start ; i <= ind - 1 ; ++i )
				tab[i] = 0 ;
			if ( offset + 1 >= 64 )
				tab[i] = 0 ;
			else
				tab[i] = ( tab[i] >> (UINT64)( offset + 1 ) ) << (UINT64)( offset + 1 ) ;
		}

		// mask out [e+1, size-1]
		if ( e < size - 1 )
		{
			ind = ( e + 1 ) / UNIT_SIZE ;
			offset = ( e + 1 ) & UNIT_MASK ;
			for ( i = ind + 1 ; i <= to ; ++i )
				tab[i] = 0 ;

			if ( UNIT_SIZE - offset >= 64 )
				tab[ind] = 0 ;
			else
				tab[ind] = ( tab[ind] << (UINT64)( UNIT_SIZE - offset ) ) >> (UINT64)( UNIT_SIZE - offset ) ;
		}
	}

	// Moves every bit to the next higher index; the bit leaving index size-1 is dropped.
	void ShiftOneLeft()
	{
		if ( asize <= 0 )
			return ;

		int i ;
		UINT64 carry = 0 ;
		UINT64 lastTabMask ;
		if ( ( size & UNIT_MASK ) == 0 )
			lastTabMask = (UINT64)(-1) ;
		else
			// The shift must be done in 64 bits: the last word can hold up to 63
			// valid bits, which an int-typed 1 cannot reach.
			lastTabMask = ( (UINT64)1 << ( size & UNIT_MASK ) ) - 1 ;

		for ( i = 0 ; i < asize - 1 ; ++i )
		{
			UINT64 tmp = ( tab[i] >> ( UNIT_SIZE - 1 ) ) & 1 ;
			tab[i] = ( tab[i] << 1 ) | carry ;
			carry = tmp ;
		}
		tab[i] = ( ( tab[i] << 1 ) | carry ) & lastTabMask ;
	}

	// Moves every bit to the next lower index; bit 0 is dropped.
	void ShiftOneRight()
	{
		if ( asize <= 0 )
			return ;

		int i ;
		UINT64 carry = ( tab[ asize - 1 ] & 1 ) ;
		tab[ asize - 1 ] >>= 1 ;
		for ( i = asize - 2 ; i >= 0 ; --i )
		{
			UINT64 tmp = ( tab[i] & 1 ) ;
			tab[i] = ( tab[i] >> 1 ) | ( carry << ( UNIT_SIZE - 1 ) ) ;
			carry = tmp ;
		}
	}

	bool IsAllZero() const
	{
		int i ;
		for ( i = 0 ; i < asize ; ++i )
			if ( tab[i] != 0 )
				return false ;
		return true ;
	}

	int Count() const // Count how many 1.
	{
		if ( size <= 0 )
			return 0 ;
		UINT64 k ;
		int i, ret = 0 ;
		for ( i = 0 ; i < asize - 1 ; ++i )
		{
			k = tab[i] ;
			while ( k )
			{
				if ( k & 1 )
					++ret ;
				k /= 2 ;
			}
		}
		// Only the bits below `size` are counted in the last word.
		k = tab[ asize - 1 ] ;
		for ( i = 0 ; i <= (int)( ( size - 1 ) & UNIT_MASK ) ; ++i )
		{
			if ( k & 1 )
				++ret ;
			k /= 2 ;
		}
		return ret ;
	}

	// Appends the index of every set bit to indices, in increasing order.
	void GetOnesIndices( std::vector<int> &indices )
	{
		if ( size <= 0 )
			return ;
		UINT64 k ;
		int i ;
		int ind = 0 ;
		for ( i = 0 ; i < asize - 1 ; ++i )
		{
			k = tab[i] ;
			ind = i * UNIT_SIZE ;
			while ( k )
			{
				if ( k & 1 )
					indices.push_back( ind ) ;
				k /= 2 ;
				++ind ;
			}
		}
		k = tab[ asize - 1 ] ;
		ind = i * UNIT_SIZE ;
		for ( i = 0 ; i <= (int)( ( size - 1 ) & UNIT_MASK ) ; ++i )
		{
			if ( k & 1ull )
				indices.push_back( ind ) ;
			k /= 2ull ;
			++ind ;
		}
	}

	bool IsEqual( const BitTable &in ) const// Test wether two bit tables equal.
	{
		int i ;
		if ( in.size != size )
			return false ;

		for ( i = 0 ; i < asize ; ++i )
			if ( tab[i] != in.tab[i] )
				return false ;
		return true ;
	}

	// Return the location of the first difference. -1 if the same.
	// When the tables differ in length, the words missing from the shorter
	// one count as zero.
	int GetFirstDifference( const BitTable &in ) const
	{
		int as = asize < in.asize ? asize : in.asize ;
		int i, j ;

		for ( i = 0 ; i < as ; ++i )
		{
			if ( tab[i] != in.tab[i] )
			{
				UINT64 k1 = tab[i] ;
				UINT64 k2 = in.tab[i] ;
				for ( j = 0 ; j < (int)UNIT_SIZE ; ++j )
				{
					if ( ( k1 & 1 ) != ( k2 & 1 ) )
					{
						return j + UNIT_SIZE * i ;
					}
					k1 >>= 1 ;
					k2 >>= 1 ;
				}
			}
		}

		for ( ; i < asize ; ++i )
		{
			if ( tab[i] != 0 )
			{
				UINT64 k = tab[i] ;
				for ( j = 0 ; j < (int)UNIT_SIZE ; ++j )
				{
					if ( k & 1 )
						return j + UNIT_SIZE * i ;
					k >>= 1 ;
				}
			}
		}

		for ( ; i < in.asize ; ++i )
		{
			if ( in.tab[i] != 0 )
			{
				UINT64 k = in.tab[i] ;
				for ( j = 0 ; j < (int)UNIT_SIZE ; ++j )
				{
					if ( k & 1 )
						return j + UNIT_SIZE * i ;
					k >>= 1 ;
				}
			}
		}
		return -1 ;
	}

	// Makes this table an independent copy of in, reallocating the storage.
	void Duplicate( const BitTable &in )
	{
		// Duplicating a table onto itself would free the array being copied.
		if ( this == &in )
			return ;

		if ( tab != NULL )
			delete [] tab ;
		int i ;
		size = in.size ;
		asize = in.asize ;
		tab = new UINT64[ asize ] ;
		for ( i = 0 ; i < asize ; ++i )
			tab[i] = in.tab[i] ;
	}

	// Copies the words of in into the existing storage without reallocating.
	// Only the words both tables have are copied.
	void Assign( const BitTable &in )
	{
		int i ;
		int n = asize < in.asize ? asize : in.asize ;
		for ( i = 0 ; i < n ; ++i )
			tab[i] = in.tab[i] ;
	}

	// Forgets the storage without freeing it.
	void Nullify()
	{
		tab = NULL ;
	}

	int GetSize() const
	{
		return size ;
	}

	void Print()
	{
		int i ;
		for ( i = 0 ; i < asize; ++i )
			printf( "%llu ", tab[i] ) ;
		printf( "\n" ) ;
	}
} ;


#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/CombineSubexons.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,1680 @@ +#include <stdio.h> +#include <string.h> + +#include <algorithm> +#include <vector> +#include <map> +#include <string> + +#include "alignments.hpp" +#include "blocks.hpp" +#include "stats.hpp" +#include "SubexonGraph.hpp" + +char usage[] = "combineSubexons [options]\n" + "Required options:\n" + "\t-s STRING: the path to the predicted subexon information. Can use multiple -s to specify multiple subexon prediction files\n" + "\t\tor\n" + "\t--ls STRING: the path to file of the list of the predicted subexon information.\n" ; + +struct _overhang +{ + int cnt ; // the number of samples support this subexon. + int validCnt ; // The number of samples that are used for compute probability. + int length ; + double classifier ; +} ; + +struct _intronicInfo +{ + int chrId ; + int start, end ; + int leftSubexonIdx, rightSubexonIdx ; + double irClassifier ; + int irCnt ; + int validIrCnt ; + struct _overhang leftOverhang, rightOverhang ; // leftOverhangClassifier is for the overhang subexon at the left side of this intron. +} ; + +struct _seInterval +{ + int chrId ; + int start, end ; + int type ; // 0-subexon, 1-intronicInfo + int idx ; +} ; + +struct _subexonSplit +{ + int chrId ; + int pos ; + int type ; //1-start of a subexon. 2-end of a subexon + int splitType ; //0-soft boundary, 1-start of an exon, 2-end of an exon. + int strand ; + + int weight ; +} ; + +struct _interval // exon or intron +{ + int chrId ; + int start, end ; + int strand ; + int sampleSupport ; +} ; + +struct _subexonSupplement // supplement the subexon structure defined in SubexonGraph. 
+{ + int *nextSupport ; + int *prevSupport ; +} ; + +char buffer[4096] ; + +bool CompSubexonSplit( struct _subexonSplit a, struct _subexonSplit b ) +{ + if ( a.chrId < b.chrId ) + return true ; + else if ( a.chrId > b.chrId ) + return false ; + else if ( a.pos != b.pos ) + return a.pos < b.pos ; + else if ( a.type != b.type ) + { + // the split site with no strand information should come first. + /*if ( a.splitType != b.splitType ) + { + if ( a.splitType == 0 ) + return true ; + else if ( b.splitType == 0 ) + return false ; + }*/ + return a.type < b.type ; + } + else if ( a.splitType != b.splitType ) + { + //return a.splitType < b.splitType ; + if ( a.splitType == 0 ) + return true ; + else if ( b.splitType == 0 ) + return false ; + + if ( a.type == 1 ) + return a.splitType > b.splitType ; + else + return a.splitType < b.splitType ; + } + + return false ; +} + +bool CompInterval( struct _interval a, struct _interval b ) +{ + if ( a.chrId < b.chrId ) + return true ; + else if ( a.chrId > b.chrId ) + return false ; + else if ( a.start != b.start ) + return a.start < b.start ; + else if ( a.end != b.end ) + return a.end < b.end ; + return false ; +} + +bool CompSeInterval( struct _seInterval a, struct _seInterval b ) +{ + if ( a.chrId < b.chrId ) + return true ; + else if ( a.chrId > b.chrId ) + return false ; + else if ( a.start < b.start ) + return true ; + else if ( a.start > b.start ) + return false ; + else if ( a.end < b.end ) + return true ; + else + return false ; +} + +// Keep this the same as in SubexonInfo.cpp. 
+double TransformCov( double c ) +{ + double ret ; + //return sqrt( c ) - 1 ; + + if ( c <= 2 + 1e-6 ) + ret = 1e-6 ; + else + ret = c - 2 ; + + return ret ; +} + +double GetUpdateMixtureGammaClassifier( double ratio, double cov, double piRatio, double kRatio[2], double thetaRatio[2], + double piCov, double kCov[2], double thetaCov[2], bool conservative ) +{ + double p1 = 0, p2 ; + + cov = TransformCov( cov ) ; + if ( cov < ( kCov[0] - 1 ) * thetaCov[0] ) + cov = ( kCov[0] - 1 ) * thetaCov[0] ; + + if ( ratio > 0 ) + p1 = MixtureGammaAssignment( ratio, piRatio, kRatio, thetaRatio ) ; + // Make sure cov > 1? + p2 = MixtureGammaAssignment( cov, piCov, kCov, thetaCov ) ; + double ret = 0 ; + + if ( conservative ) + { + if ( p1 >= p2 ) // we should use ratio. + ret = LogGammaDensity( ratio, kRatio[1], thetaRatio[1] ) + - LogGammaDensity( ratio, kRatio[0], thetaRatio[0] ) ; + else + ret = LogGammaDensity( cov, kCov[1], thetaCov[1] ) + - LogGammaDensity( cov, kCov[0], thetaCov[0] ) ; + } + else + { + if ( p1 >= p2 ) // we should use ratio. + ret = LogGammaDensity( ratio, kRatio[1], thetaRatio[1] ) + - LogGammaDensity( ratio, kRatio[0], thetaRatio[0] ) ; + else + ret = LogGammaDensity( cov, kCov[1], thetaCov[1] ) + - LogGammaDensity( cov, kCov[0], thetaCov[0] ) ; + } + return ret ; +} + +double GetPValueOfGeneEnd( double cov ) +{ + if ( cov <= 2.0 ) + return 1.0 ; + double tmp = 2.0 * ( sqrt( cov ) - log( cov ) ) ; + if ( tmp <= 0 ) + return 1.0 ; + return 2.0 * alnorm( tmp, true ) ; +} + +char StrandNumToSymbol( int strand ) +{ + if ( strand > 0 ) + return '+' ; + else if ( strand < 0 ) + return '-' ; + else + return '.' 
; +} + +int StrandSymbolToNum( char c ) +{ + if ( c == '+' ) + return 1 ; + else if ( c == '-' ) + return -1 ; + else + return 0 ; +} + +int *MergePositions( int *old, int ocnt, int *add, int acnt, int &newCnt ) //, int **support ) +{ + int i, j, k ; + int *ret ; + if ( acnt == 0 ) + { + newCnt = ocnt ; + return old ; + } + if ( ocnt == 0 ) + { + newCnt = acnt ; + ret = new int[acnt] ; + //*support = new int[acnt] ; + for ( i = 0 ; i < acnt ; ++i ) + { + //(*support)[i] = 1 ; + ret[i] = add[i] ; + } + return ret ; + } + newCnt = 0 ; + for ( i = 0, j = 0 ; i < ocnt && j < acnt ; ) + { + if ( old[i] < add[j] ) + { + ++i ; + ++newCnt ; + } + else if ( old[i] == add[j] ) + { + ++i ; ++j ; + ++newCnt ; + } + else + { + ++j ; + ++newCnt ; + } + } + newCnt = newCnt + ( ocnt - i ) + ( acnt - j ) ; + // no new elements. + if ( newCnt == ocnt ) + { + /*i = 0 ; + for ( j = 0 ; j < acnt ; ++j ) + { + for ( ; old[i] < add[j] ; ++i ) + ; + ++(*support)[i] ; + }*/ + return old ; + } + k = 0 ; + //delete []old ; + ret = new int[ newCnt ] ; + //int *bufferSupport = new int[newCnt] ; + for ( i = 0, j = 0 ; i < ocnt && j < acnt ; ) + { + if ( old[i] < add[j] ) + { + ret[k] = old[i] ; + //bufferSupport[k] = (*support)[i] ; + ++i ; + ++k ; + } + else if ( old[i] == add[j] ) + { + ret[k] = old[i] ; + //bufferSupport[k] = (*support)[i] + 1 ; + ++i ; ++j ; + ++k ; + } + else + { + ret[k] = add[j] ; + //bufferSupport[k] = 1 ; + ++j ; + ++k ; + } + } + for ( ; i < ocnt ; ++i, ++k ) + { + ret[k] = old[i] ; + //bufferSupport[k] = (*support)[i] ; + } + for ( ; j < acnt ; ++j, ++k ) + { + ret[k] = add[j] ; + //bufferSupport[k] = 1 ; + } + delete[] old ; + //delete[] *support ; + //*support = bufferSupport ; + return ret ; +} + +void CoalesceSubexonSplits( std::vector<struct _subexonSplit> &splits, int mid ) +{ + int i, j, k ; + int cnt = splits.size() ; + //std::sort( splits.begin(), splits.end(), CompSubexonSplit ) ; + + std::vector<struct _subexonSplit> sortedSplits ; + sortedSplits.resize( 
cnt ) ; + + k = 0 ; + for ( i = 0, j = mid ; i < mid && j < cnt ; ++k ) + { + if ( CompSubexonSplit( splits[i], splits[j] ) ) + { + sortedSplits[k] = splits[i] ; + ++i ; + } + else + { + sortedSplits[k] = splits[j] ; + ++j ; + } + } + for ( ; i < mid ; ++i, ++k ) + sortedSplits[k] = splits[i] ; + for ( ; j < cnt ; ++j, ++k ) + sortedSplits[k] = splits[j] ; + splits = sortedSplits ; + + k = 0 ; + for ( i = 1 ; i < cnt ; ++i ) + { + if ( splits[i].chrId == splits[k].chrId && splits[i].pos == splits[k].pos && splits[i].type == splits[k].type && splits[i].splitType == splits[k].splitType + && splits[i].strand == splits[k].strand ) + { + splits[k].weight += splits[i].weight ; + } + else + { + ++k ; + splits[k] = splits[i] ; + } + } + splits.resize( k + 1 ) ; +} + +void CoalesceDifferentStrandSubexonSplits( std::vector<struct _subexonSplit> &splits ) +{ + int i, j, k, l ; + int cnt = splits.size() ; + k = 0 ; + for ( i = 0 ; i < cnt ; ) + { + for ( j = i + 1 ; j < cnt ; ++j ) + { + if ( splits[i].chrId == splits[j].chrId && splits[i].pos == splits[j].pos && splits[i].type == splits[j].type && splits[i].splitType == splits[j].splitType ) + continue ; + break ; + } + + int maxWeight = -1 ; + int weightSum = 0 ; + int strand = splits[i].strand ; + for ( l = i ; l < j ; ++l ) + { + weightSum += splits[l].weight ; + if ( splits[l].strand != 0 && splits[l].weight > maxWeight ) + { + strand = splits[l].strand ; + maxWeight = splits[l].weight ; + } + } + //printf( "%d: %d %d %d\n", splits[i].pos, i, j, strand ) ; + splits[k] = splits[i] ; + splits[k].strand = strand ; + splits[k].weight = weightSum ; + ++k ; + + i = j ; + } + splits.resize( k ) ; +} + + +void CoalesceIntervals( std::vector<struct _interval> &intervals ) +{ + int i, k ; + std::sort( intervals.begin(), intervals.end(), CompInterval ) ; + int cnt = intervals.size() ; + k = 0 ; + for ( i = 1 ; i < cnt ; ++i ) + { + if ( intervals[i].chrId == intervals[k].chrId && intervals[i].start == intervals[k].start && 
intervals[i].end == intervals[k].end ) + intervals[k].sampleSupport += intervals[i].sampleSupport ; + else + { + ++k ; + intervals[k] = intervals[i] ; + } + } + intervals.resize( k + 1 ) ; +} + +void CleanIntervalIrOverhang( std::vector<struct _interval> &irOverhang ) +{ + int i, j, k ; + std::sort( irOverhang.begin(), irOverhang.end(), CompInterval ) ; + + int cnt = irOverhang.size() ; + for ( i = 0 ; i < cnt ; ++i ) + { + if ( irOverhang[i].start == -1 ) + continue ; + + + // locate the longest interval start at the same coordinate. + int tag = i ; + + for ( j = i + 1 ; j < cnt ; ++j ) + { + if ( irOverhang[j].chrId != irOverhang[i].chrId || irOverhang[j].start != irOverhang[i].start ) + break ; + if ( irOverhang[j].start == -1 ) + continue ; + tag = j ; + } + + for ( k = i ; k < tag ; ++k ) + { + irOverhang[k].start = -1 ; + } + + for ( k = tag + 1 ; k < cnt ; ++k ) + { + if ( irOverhang[k].chrId != irOverhang[tag].chrId || irOverhang[k].start > irOverhang[tag].end ) + break ; + if ( irOverhang[k].end <= irOverhang[tag].end ) + { + irOverhang[k].start = -1 ; + } + } + } + + k = 0 ; + for ( i = 0 ; i < cnt ; ++i ) + { + if ( irOverhang[i].start == -1 ) + continue ; + irOverhang[k] = irOverhang[i] ; + ++k ; + } + irOverhang.resize( k ) ; +} + +// Remove the connection that does not match the boundary +// of subexons. 
+void CleanUpSubexonConnections( std::vector<struct _subexon> &subexons ) +{ + int seCnt = subexons.size() ; + int i, j, k, m ; + for ( i = 0 ; i < seCnt ; ++i ) + { + if ( subexons[i].prevCnt > 0 ) + { + for ( k = i ; k >= 0 ; --k ) + if ( subexons[k].chrId != subexons[i].chrId || subexons[k].end <= subexons[i].prev[0] ) + break ; + if ( subexons[k].chrId != subexons[i].chrId ) + ++k ; + m = 0 ; + for ( j = 0 ; j < subexons[i].prevCnt ; ++j ) + { + for ( ; k <= i ; ++k ) + if ( subexons[k].end >= subexons[i].prev[j] ) + break ; + if ( subexons[k].end == subexons[i].prev[j] + && ( SubexonGraph::IsSameStrand( subexons[k].rightStrand, subexons[i].leftStrand ) || subexons[k].end + 1 == subexons[i].start ) ) + { + subexons[i].prev[m] = subexons[i].prev[j] ; + ++m ; + } + } + subexons[i].prevCnt = m ; + } + + m = 0 ; + k = i ; + for ( j = 0 ; j < subexons[i].nextCnt ; ++j ) + { + for ( ; k < seCnt ; ++k ) + if ( subexons[k].chrId != subexons[i].chrId || subexons[k].start >= subexons[i].next[j] ) + break ; + if ( subexons[k].start == subexons[i].next[j] + && ( SubexonGraph::IsSameStrand( subexons[i].rightStrand, subexons[k].leftStrand ) || subexons[i].end + 1 == subexons[k].start ) ) + { + subexons[i].next[m] = subexons[i].next[j] ; + ++m ; + } + } + subexons[i].nextCnt = m ; + } +} + +int main( int argc, char *argv[] ) +{ + int i, j, k ; + FILE *fp ; + std::vector<char *> files ; + + Blocks regions ; + Alignments alignments ; + + if ( argc == 1 ) + { + printf( "%s", usage ) ; + return 0 ; + } + + for ( i = 1 ; i < argc ; ++i ) + { + if ( !strcmp( argv[i], "-s" ) ) + { + files.push_back( argv[i + 1] ) ; + ++i ; + continue ; + } + else if ( !strcmp( argv[i], "--ls" ) ) + { + FILE *fpLs = fopen( argv[i + 1], "r" ) ; + char buffer[1024] ; + while ( fgets( buffer, sizeof( buffer ), fpLs ) != NULL ) + { + int len = strlen( buffer ) ; + if ( buffer[len - 1] == '\n' ) + { + buffer[len - 1] = '\0' ; + --len ; + + } + char *fileName = strdup( buffer ) ; + files.push_back( 
fileName ) ; + } + } + } + int fileCnt = files.size() ; + // Obtain the chromosome ids through bam file. + fp = fopen( files[0], "r" ) ; + if ( fgets( buffer, sizeof( buffer ), fp ) != NULL ) + { + int len = strlen( buffer ) ; + buffer[len - 1] = '\0' ; + alignments.Open( buffer + 1 ) ; + } + fclose( fp ) ; + + // Collect the split sites of subexons. + std::vector<struct _subexonSplit> subexonSplits ; + std::vector<struct _interval> intervalIrOverhang ; // intervals contains ir and overhang. + std::vector<struct _interval> introns ; + std::vector<struct _interval> exons ; + + + for ( k = 0 ; k < fileCnt ; ++k ) + { + fp = fopen( files[k], "r" ) ; + struct _subexon se ; + struct _subexonSplit sp ; + char chrName[50] ; + int origSize = subexonSplits.size() ; + while ( fgets( buffer, sizeof( buffer), fp ) != NULL ) + { + if ( buffer[0] == '#' ) + continue ; + + SubexonGraph::InputSubexon( buffer, alignments, se, false) ; + // Record all the intron rentention, overhang from the samples + if ( ( se.leftType == 2 && se.rightType == 1 ) + || ( se.leftType == 2 && se.rightType == 0 ) + || ( se.leftType == 0 && se.rightType == 1 ) ) + { + struct _interval si ; + si.chrId = se.chrId ; + si.start = se.start ; + si.end = se.end ; + + intervalIrOverhang.push_back( si ) ; + } + + // Ignore overhang subexons and ir subexons for now. + if ( ( se.leftType == 0 && se.rightType == 1 ) + || ( se.leftType == 2 && se.rightType == 0 ) + || ( se.leftType == 2 && se.rightType == 1 ) ) + continue ; + + if ( se.leftType == 0 && se.rightType == 0 && ( se.leftClassifier == -1 || se.leftClassifier == 1 ) ) // ignore noisy single-exon island + continue ; + if ( se.leftType == 0 && se.rightType == 0 && ( fileCnt >= 10 && se.leftClassifier > 0.99 ) ) + continue ; + + if ( se.leftType == 1 && se.rightType == 2 ) // a full exon, we allow mixtured strand here. 
+ { + struct _interval ni ; + ni.chrId = se.chrId ; + ni.start = se.start ; + ni.end = se.end ; + ni.strand = se.rightStrand ; + ni.sampleSupport = 1 ; + exons.push_back( ni ) ; + } + + + /*for ( i = 0 ; i < se.nextCnt ; ++i ) + { + struct _interval ni ; + ni.chrId = se.chrId ; + ni.start = se.end ; + ni.end = se.next[i] ; + ni.strand = se.rightStrand ; + ni.sampleSupport = 1 ; + if ( ni.start + 1 < ni.end ) + introns.push_back( ni ) ; + }*/ + + sp.chrId = se.chrId ; + sp.pos = se.start ; + sp.type = 1 ; + sp.splitType = se.leftType ; + sp.strand = se.leftStrand ; + sp.weight = 1 ; + subexonSplits.push_back( sp ) ; + + sp.chrId = se.chrId ; + sp.pos = se.end ; + sp.type = 2 ; + sp.splitType = se.rightType ; + sp.strand = se.rightStrand ; + sp.weight = 1 ; + subexonSplits.push_back( sp ) ; + + /*if ( se.prevCnt > 0 ) + delete[] se.prev ; + if ( se.nextCnt > 0 ) + delete[] se.next ;*/ + } + CoalesceIntervals( exons ) ; + CoalesceIntervals( introns ) ; + CoalesceSubexonSplits( subexonSplits, origSize ) ; + CleanIntervalIrOverhang( intervalIrOverhang ) ; + + fclose( fp ) ; + } + + CoalesceDifferentStrandSubexonSplits( subexonSplits ) ; + + // Obtain the split sites from the introns. 
+ int intronCnt = introns.size() ; + std::vector<struct _subexonSplit> intronSplits ; + for ( i = 0 ; i < intronCnt ; ++i ) + { + /*if ( introns[i].sampleSupport < 0.05 * fileCnt ) + { + continue ; + }*/ + struct _interval &it = introns[i] ; + struct _subexonSplit sp ; + sp.chrId = it.chrId ; + sp.pos = it.start ; + sp.type = 2 ; + sp.splitType = 2 ; + sp.strand = it.strand ; + intronSplits.push_back( sp ) ; + + sp.chrId = it.chrId ; + sp.pos = it.end ; + sp.type = 1 ; + sp.splitType = 1 ; + sp.strand = it.strand ; + intronSplits.push_back( sp ) ; + } + + // Pair up the split sites to get subexons + std::sort( intronSplits.begin(), intronSplits.end(), CompSubexonSplit ) ; + //std::sort( subexonSplits.begin(), subexonSplits.end(), CompSubexonSplit ) ; + + // Convert the hard boundary to soft boundary if the split sites is filtered from the introns + // Seems NO need to do this now. + int splitCnt = subexonSplits.size() ; + int intronSplitCnt = intronSplits.size() ; + k = 0 ; + //for ( i = 0 ; i < splitCnt ; ++i ) + while ( 0 ) + { + if ( subexonSplits[i].type != subexonSplits[i].splitType ) + continue ; + + while ( k < intronSplitCnt && ( intronSplits[k].chrId < subexonSplits[i].chrId + || ( intronSplits[k].chrId == subexonSplits[i].chrId && intronSplits[k].pos < subexonSplits[i].pos ) ) ) + ++k ; + j = k ; + while ( j < intronSplitCnt && intronSplits[j].chrId == subexonSplits[i].chrId + && intronSplits[j].pos == subexonSplits[i].pos && intronSplits[j].splitType != subexonSplits[i].splitType ) + ++j ; + + // the split site is filtered. + if ( j >= intronSplitCnt || intronSplits[j].chrId != subexonSplits[i].chrId || + intronSplits[j].pos > subexonSplits[i].pos ) + { + //printf( "%d %d. %d %d\n", subexonSplits[i].pos, intronSplits[j].pos, intronSplits[j].chrId , subexonSplits[i].chrId ) ; + subexonSplits[i].splitType = 0 ; + + // Convert the adjacent subexon split. 
+ for ( int l = i + 1 ; i < splitCnt && subexonSplits[l].chrId == subexonSplits[i].chrId + && subexonSplits[l].pos == subexonSplits[i].pos + 1 ; ++l ) + { + if ( subexonSplits[l].type != subexonSplits[i].type + && subexonSplits[l].splitType == subexonSplits[i].splitType ) + { + subexonSplits[l].splitType = 0 ; + } + } + + // And the other direction + for ( int l = i - 1 ; l >= 0 && subexonSplits[l].chrId == subexonSplits[i].chrId + && subexonSplits[l].pos == subexonSplits[i].pos - 1 ; --l ) + { + if ( subexonSplits[l].type != subexonSplits[i].type + && subexonSplits[l].splitType == subexonSplits[i].splitType ) + { + subexonSplits[l].splitType = 0 ; + } + } + } + } + intronSplits.clear() ; + std::vector<struct _subexonSplit>().swap( intronSplits ) ; + + // Force the soft boundary that collides with hard boundaries to be hard boundary. + for ( i = 0 ; i < splitCnt ; ++i ) + { + if ( subexonSplits[i].splitType != 0 ) + continue ; + int newSplitType = 0 ; + int newStrand = subexonSplits[i].strand ; + for ( j = i + 1 ; j < splitCnt ; ++j ) + { + if ( subexonSplits[i].type != subexonSplits[j].type || subexonSplits[i].pos != subexonSplits[j].pos || + subexonSplits[i].chrId != subexonSplits[j].chrId ) + break ; + if ( subexonSplits[j].splitType != 0 ) + { + newSplitType = subexonSplits[j].splitType ; + newStrand = subexonSplits[j].strand ; + break ; + } + } + + if ( newSplitType == 0 ) + { + for ( j = i - 1 ; j >= 0 ; --j ) + { + if ( subexonSplits[i].type != subexonSplits[j].type || subexonSplits[i].pos != subexonSplits[j].pos || + subexonSplits[i].chrId != subexonSplits[j].chrId ) + break ; + if ( subexonSplits[j].splitType != 0 ) + { + newSplitType = subexonSplits[j].splitType ; + newStrand = subexonSplits[j].strand ; + break ; + } + } + + } + /*if ( subexonSplits[i].pos == 154464157 ) + { + printf( "force conversion: %d %d %d. 
%d %d\n", subexonSplits[i].pos, subexonSplits[i].splitType, subexonSplits[i].weight, subexonSplits[i + 1].pos, subexonSplits[i + 1].splitType ) ; + }*/ + subexonSplits[i].splitType = newSplitType ; + subexonSplits[i].strand = newStrand ; + } + + /*for ( i = 0 ; i < splitCnt ; ++i ) + { + printf( "%d: type=%d splitType=%d weight=%d\n", subexonSplits[i].pos, subexonSplits[i].type, subexonSplits[i].splitType, subexonSplits[i].weight ) ; + }*/ + + // Build subexons from the collected split sites. + + std::vector<struct _subexon> subexons ; + int diffCnt = 0 ; // |start of subexon split| - |end of subexon split| + int seCnt = 0 ; + for ( i = 0 ; i < splitCnt - 1 ; ++i ) + { + struct _subexon se ; + /*if ( subexonSplits[i + 1].pos == 144177260 ) + { + printf( "%d %d %d: %d %d %d. %d\n", subexonSplits[i].pos, subexonSplits[i].type, subexonSplits[i].splitType, + subexonSplits[i + 1].pos, subexonSplits[i + 1].type, subexonSplits[i + 1].splitType, diffCnt ) ; + }*/ + + if ( subexonSplits[i].type == 1 ) + diffCnt += subexonSplits[i].weight ; + else + diffCnt -= subexonSplits[i].weight ; + + if ( subexonSplits[i + 1].chrId != subexonSplits[i].chrId ) + { + diffCnt = 0 ; + continue ; + } + + if ( diffCnt == 0 ) // the interval between subexon + continue ; + + se.chrId = subexonSplits[i].chrId ; + se.start = subexonSplits[i].pos ; + se.leftType = subexonSplits[i].splitType ; + se.leftStrand = subexonSplits[i].strand ; + if ( subexonSplits[i].type == 2 ) + { + se.leftStrand = 0 ; + ++se.start ; + } + + se.end = subexonSplits[i + 1].pos ; + se.rightType = subexonSplits[i + 1].splitType ; + se.rightStrand = subexonSplits[i + 1].strand ; + if ( subexonSplits[i + 1].type == 1 ) + { + se.rightStrand = 0 ; + --se.end ; + } + + /*if ( se.end == 24613649 ) + { + //printf( "%d %d %d: %d %d %d. 
%d\n", subexonSplits[i].pos, subexonSplits[i].type, subexonSplits[i].splitType, + // subexonSplits[i + 1].pos, subexonSplits[i + 1].type, subexonSplits[i + 1].splitType, diffCnt ) ; + printf( "%d %d %d. %d %d %d\n", se.start, se.leftType, se.leftStrand, se.end, se.rightType, se.rightStrand ) ; + }*/ + + if ( se.start > se.end ) //Note: this handles the case of repeated subexon split. + { + // handle the case in sample 0: [...[..] + // in sample 1: [..]...] + if ( seCnt > 0 && se.end == subexons[seCnt - 1].end && subexons[seCnt - 1].rightType < se.rightType ) + { + subexons[seCnt - 1].rightType = se.rightType ; + subexons[seCnt - 1].rightStrand = se.rightStrand ; + } + continue ; + } + se.leftClassifier = se.rightClassifier = 0 ; + se.lcCnt = se.rcCnt = 0 ; + + /*if ( 1 ) //se.chrId == 25 ) + { + printf( "%d: %d-%d: %d %d\n", se.chrId, se.start, se.end, se.leftType, se.rightType ) ; + }*/ + + + se.next = se.prev = NULL ; + se.nextCnt = se.prevCnt = 0 ; + subexons.push_back( se ) ; + ++seCnt ; + } + subexonSplits.clear() ; + std::vector<struct _subexonSplit>().swap( subexonSplits ) ; + + // Adjust the split type. + seCnt = subexons.size() ; + for ( i = 1 ; i < seCnt ; ++i ) + { + if ( subexons[i - 1].end + 1 == subexons[i].start ) + { + if ( subexons[i - 1].rightType == 0 ) + subexons[i - 1].rightType = subexons[i].leftType ; + if ( subexons[i].leftType == 0 ) + subexons[i].leftType = subexons[i - 1].rightType ; + } + } + + // Merge the adjacent soft boundaries + std::vector<struct _subexon> rawSubexons = subexons ; + int exonCnt = exons.size() ; + subexons.clear() ; + + k = 0 ; // hold index for exon. 
+ for ( i = 0 ; i < seCnt ; ) + { + /*if ( rawSubexons[k].rightType == 0 && rawSubexons[i].leftType == 0 + && rawSubexons[k].end + 1 == rawSubexons[i].start ) + { + rawSubexons[k].end = rawSubexons[i].end ; + rawSubexons[k].rightType = rawSubexons[i].rightType ; + rawSubexons[k].rightStrand = rawSubexons[i].rightStrand ; + } + else + { + subexons.push_back( rawSubexons[k] ) ; + k = i ; + }*/ + + while ( k < exonCnt && ( exons[k].chrId < rawSubexons[i].chrId + || ( exons[k].chrId == rawSubexons[i].chrId && exons[k].start < rawSubexons[i].start ) ) ) + ++k ; + + for ( j = i + 1 ; j < seCnt ; ++j ) + { + if ( rawSubexons[j - 1].chrId != rawSubexons[j].chrId || rawSubexons[j - 1].rightType != 0 || rawSubexons[j].leftType != 0 + || ( fileCnt > 1 && rawSubexons[j - 1].end + 1 != rawSubexons[j].start ) + || ( fileCnt == 1 && rawSubexons[j - 1].end + 50 < rawSubexons[j].start ) ) + break ; + } + // rawsubexons[i...j-1] will be merged. + + /*if ( rawSubexons[i].start == 119625875 ) + { + printf( "merge j-1: %d %d %d %d\n", rawSubexons[j - 1].end, rawSubexons[j - 1].rightType, + rawSubexons[j].start, rawSubexons[j].leftType ) ; + }*/ + bool merge = true ; + if ( rawSubexons[i].leftType == 1 && rawSubexons[j - 1].rightType == 2 && j - i > 1 + && rawSubexons[j - 1].end - rawSubexons[i].start >= 1000 ) + { + merge = false ; + int sampleSupport = 0 ; + for ( int l = k ; l < exonCnt ; ++l ) + { + if ( exons[l].chrId != rawSubexons[i].chrId || exons[l].start > rawSubexons[i].start ) + break ; + if ( exons[l].end == rawSubexons[j - 1].end ) + { + merge = true ; + sampleSupport = exons[l].sampleSupport ; + break ; + } + } + + if ( merge == true && rawSubexons[j - 1].end - rawSubexons[i].start >= 1000 ) + { + if ( sampleSupport <= 0.2 * fileCnt ) + { + merge = false ; + } + } + + if ( merge == false ) + { + if ( j - i >= 3 ) + { + rawSubexons[i].end = rawSubexons[ (i + j - 1) / 2 ].start ; + rawSubexons[j - 1].start = rawSubexons[ (i + j - 1) / 2].end ; + } + + if ( 
rawSubexons[i].end + 1 == rawSubexons[j - 1].start ) + { + --rawSubexons[i].end ; + ++rawSubexons[j - 1].start ; + } + subexons.push_back( rawSubexons[i] ) ; + subexons.push_back( rawSubexons[j - 1] ) ; + } + } + + if ( merge ) + { + rawSubexons[i].end = rawSubexons[j - 1].end ; + rawSubexons[i].rightType = rawSubexons[j - 1].rightType ; + rawSubexons[i].rightStrand = rawSubexons[j - 1].rightStrand ; + + if ( rawSubexons[i].leftType == 0 && rawSubexons[i].rightType != 0 ) + { + rawSubexons[i].start = rawSubexons[ ( i + j - 1 ) / 2 ].start ; + } + else if ( rawSubexons[i].rightType == 0 && rawSubexons[i].leftType != 0 ) + { + rawSubexons[i].end = rawSubexons[ ( i + j - 1 ) / 2 ].end ; + } + + subexons.push_back( rawSubexons[i] ) ; + } + + i = j ; + } + exons.clear() ; + std::vector<struct _interval>().swap( exons ) ; + + // Remove overhang, ir subexons intron created after putting multiple sample to gether. + // eg: s0: [......) + // s1: [...]--------[....] + // s2: [...]..)-----[....] + // Though the overhang from s2 is filtered in readin, there will a new overhang created combining s0,s1. + // But be careful about how to compute the classifier for the overhang part contributed from s0. + // Furthermore, note that the case of single-exon island showed up in intron retention region after combining is not possible when get here. + // eg: s0:[...]-----[...] + // s1: (.) + // s2:[.............] + // After merge adjacent soft boundaries, the single-exon island will disappear. + rawSubexons = subexons ; + seCnt = subexons.size() ; + subexons.clear() ; + for ( i = 0 ; i < seCnt ; ++i ) + { + if ( ( rawSubexons[i].leftType == 2 && rawSubexons[i].rightType == 1 ) // ir + || ( rawSubexons[i].leftType == 2 && rawSubexons[i].rightType == 0 ) // overhang + || ( rawSubexons[i].leftType == 0 && rawSubexons[i].rightType == 1 ) ) + continue ; + subexons.push_back( rawSubexons[i] ) ; + } + + // Remove the single-exon island if it overlaps with intron retentioned or overhang. 
+ rawSubexons = subexons ; + seCnt = subexons.size() ; + subexons.clear() ; + k = 0 ; + std::sort( intervalIrOverhang.begin(), intervalIrOverhang.end(), CompInterval ) ; + int irOverhangCnt = intervalIrOverhang.size() ; + + for ( i = 0 ; i < seCnt ; ++i ) + { + if ( rawSubexons[i].leftType != 0 || rawSubexons[i].rightType != 0 ) + { + subexons.push_back( rawSubexons[i] ) ; + continue ; + } + + while ( k < irOverhangCnt ) + { + // Locate the interval that before the island + if ( intervalIrOverhang[k].chrId < rawSubexons[i].chrId + || ( intervalIrOverhang[k].chrId == rawSubexons[i].chrId && intervalIrOverhang[k].end < rawSubexons[i].start ) ) + { + ++k ; + continue ; + } + break ; + } + bool overlap = false ; + for ( j = k ; j < irOverhangCnt ; ++j ) + { + if ( intervalIrOverhang[j].chrId > rawSubexons[i].chrId || intervalIrOverhang[j].start > rawSubexons[i].end ) + break ; + if ( ( intervalIrOverhang[j].start <= rawSubexons[i].start && intervalIrOverhang[j].end >= rawSubexons[i].start ) + || ( intervalIrOverhang[j].start <= rawSubexons[i].end && intervalIrOverhang[j].end >= rawSubexons[i].end ) ) + { + overlap = true ; + break ; + } + } + + if ( !overlap ) + subexons.push_back( rawSubexons[i] ) ; + } + rawSubexons.clear() ; + std::vector<struct _subexon>().swap( rawSubexons ) ; + + intervalIrOverhang.clear() ; + std::vector<struct _interval>().swap( intervalIrOverhang ) ; + + // Create the dummy intervals. 
+ seCnt = subexons.size() ; + std::vector<struct _intronicInfo> intronicInfos ; + std::vector<struct _seInterval> seIntervals ; + std::vector<struct _subexonSupplement> subexonInfo ; + + //subexonInfo.resize( seCnt ) ; + for ( i = 0 ; i < seCnt ; ++i ) + { + struct _seInterval ni ; // new interval + ni.start = subexons[i].start ; + ni.end = subexons[i].end ; + ni.type = 0 ; + ni.idx = i ; + ni.chrId = subexons[i].chrId ; + seIntervals.push_back( ni ) ; + + /*if ( subexons[i].end == 42671717 ) + { + printf( "%d: %d-%d: %d\n", subexons[i].chrId, subexons[i].start, subexons[i].end, subexons[i].rightType ) ; + }*/ + //subexonInfo[i].prevSupport = subexonInfo[i].nextSupport = NULL ; + + /*int nexti ; + for ( nexti = i + 1 ; nexti < seCnt ; ++nexti ) + if ( subexons[ nexti ].leftType == 0 && subexons[nexti].rightType == 0 )*/ + + if ( i < seCnt - 1 && subexons[i].chrId == subexons[i + 1].chrId && + subexons[i].end + 1 < subexons[i + 1].start && + subexons[i].rightType + subexons[i + 1].leftType != 0 ) + { + // Only consider the intervals like ]..[,]...(, )...[ + // The case like ]...] is actaully things like ][...] in subexon perspective, + // so they won't pass the if-statement + struct _intronicInfo nii ; // new intronic info + ni.start = subexons[i].end + 1 ; + ni.end = subexons[i + 1].start - 1 ; + ni.type = 1 ; + ni.idx = intronicInfos.size() ; + seIntervals.push_back( ni ) ; + + nii.chrId = subexons[i].chrId ; + nii.start = ni.start ; + nii.end = ni.end ; + nii.leftSubexonIdx = i ; + nii.rightSubexonIdx = i + 1 ; + nii.irClassifier = 0 ; + nii.irCnt = 0 ; + nii.validIrCnt = 0 ; + nii.leftOverhang.cnt = 0 ; + nii.leftOverhang.validCnt = 0 ; + nii.leftOverhang.length = 0 ; + nii.leftOverhang.classifier = 0 ; + nii.rightOverhang.cnt = 0 ; + nii.rightOverhang.validCnt = 0 ; + nii.rightOverhang.length = 0 ; + nii.rightOverhang.classifier = 0 ; + intronicInfos.push_back( nii ) ; + /*if ( nii.end == 23667 ) + { + printf( "%d %d. 
%d (%d %d %d)\n", nii.start, nii.end, subexons[i].rightType, subexons[i+1].start, subexons[i + 1].end, subexons[i + 1].leftType ) ; + }*/ + } + } + + // Go through all the files to get some statistics number + double avgIrPiRatio = 0 ; + double avgIrPiCov = 0 ; + double irPiRatio, irKRatio[2], irThetaRatio[2] ; // Some statistical results + double irPiCov, irKCov[2], irThetaCov[2] ; + + double avgOverhangPiRatio = 0 ; + double avgOverhangPiCov = 0 ; + double overhangPiRatio, overhangKRatio[2], overhangThetaRatio[2] ; // Some statistical results + double overhangPiCov, overhangKCov[2], overhangThetaCov[2] ; + + for ( k = 0 ; k < fileCnt ; ++k ) + { + fp = fopen( files[k], "r" ) ; + + while ( fgets( buffer, sizeof( buffer), fp ) != NULL ) + { + if ( buffer[0] == '#' ) + { + char buffer2[100] ; + sscanf( buffer, "%s", buffer2 ) ; + if ( !strcmp( buffer2, "#fitted_ir_parameter_ratio:" ) ) + { + // TODO: ignore certain samples if the coverage seems wrong. + sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", + buffer2, buffer2, &irPiRatio, buffer2, &irKRatio[0], buffer2, &irThetaRatio[0], + buffer2, &irKRatio[1], buffer2, &irThetaRatio[1] ) ; + avgIrPiRatio += irPiRatio ; + } + else if ( !strcmp( buffer2, "#fitted_ir_parameter_cov:" ) ) + { + } + else if ( !strcmp( buffer2, "#fitted_overhang_parameter_ratio:" ) ) + { + sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", + buffer2, buffer2, &overhangPiRatio, buffer2, &overhangKRatio[0], buffer2, &overhangThetaRatio[0], + buffer2, &overhangKRatio[1], buffer2, &overhangThetaRatio[1] ) ; + avgOverhangPiRatio += overhangPiRatio ; + } + } + else + break ; + } + fclose( fp ) ; + } + avgIrPiRatio /= fileCnt ; + avgOverhangPiRatio /= fileCnt ; + + // Go through all the files to put statistical results into each subexon. 
+ std::vector< struct _subexon > sampleSubexons ; + int subexonCnt = subexons.size() ; + for ( k = 0 ; k < fileCnt ; ++k ) + { + //if ( k == 220 ) + // exit( 1 ) ; + fp = fopen( files[k], "r" ) ; + struct _subexon se ; + struct _subexonSplit sp ; + char chrName[50] ; + + sampleSubexons.clear() ; + + int tag = 0 ; + while ( fgets( buffer, sizeof( buffer), fp ) != NULL ) + { + if ( buffer[0] == '#' ) + { + char buffer2[200] ; + sscanf( buffer, "%s", buffer2 ) ; + if ( !strcmp( buffer2, "#fitted_ir_parameter_ratio:" ) ) + { + sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", + buffer2, buffer2, &irPiRatio, buffer2, &irKRatio[0], buffer2, &irThetaRatio[0], + buffer2, &irKRatio[1], buffer2, &irThetaRatio[1] ) ; + } + else if ( !strcmp( buffer2, "#fitted_ir_parameter_cov:" ) ) + { + sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", + buffer2, buffer2, &irPiCov, buffer2, &irKCov[0], buffer2, &irThetaCov[0], + buffer2, &irKCov[1], buffer2, &irThetaCov[1] ) ; + } + else if ( !strcmp( buffer2, "#fitted_overhang_parameter_ratio:" ) ) + { + sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", + buffer2, buffer2, &overhangPiRatio, buffer2, &overhangKRatio[0], buffer2, &overhangThetaRatio[0], + buffer2, &overhangKRatio[1], buffer2, &overhangThetaRatio[1] ) ; + } + else if ( !strcmp( buffer2, "#fitted_overhang_parameter_cov:" ) ) + { + sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", + buffer2, buffer2, &overhangPiCov, buffer2, &overhangKCov[0], buffer2, &overhangThetaCov[0], + buffer2, &overhangKCov[1], buffer2, &overhangThetaCov[1] ) ; + } + continue ; + } + else + break ; + + //SubexonGraph::InputSubexon( buffer, alignments, se, true ) ; + //sampleSubexons.push_back( se ) ; + } + + //int sampleSubexonCnt = sampleSubexons.size() ; + int intervalCnt = seIntervals.size() ; + //for ( i = 0 ; i < sampleSubexonCnt ; ++i ) + int iterCnt = 0 ; + while ( 1 ) + { + if ( iterCnt > 0 && fgets( buffer, sizeof( buffer), fp ) == NULL) + break ; + ++iterCnt ; + + 
struct _subexon se ; + SubexonGraph::InputSubexon( buffer, alignments, se, true ) ; + + while ( tag < intervalCnt ) + { + if ( seIntervals[tag].chrId < se.chrId || + ( seIntervals[tag].chrId == se.chrId && seIntervals[tag].end < se.start ) ) + { + ++tag ; + continue ; + } + else + break ; + } + + for ( j = tag ; j < intervalCnt ; ++j ) + { + if ( seIntervals[j].start > se.end || seIntervals[j].chrId > se.chrId ) // terminate if no overlap. + break ; + int idx ; + + if ( seIntervals[j].type == 0 ) + { + idx = seIntervals[j].idx ; + if ( subexons[idx].leftType == 1 && se.leftType == 1 && subexons[idx].start == se.start ) + { + double tmp = se.leftClassifier ; + if ( se.leftClassifier == 0 ) + tmp = 1e-7 ; + subexons[idx].leftClassifier -= 2.0 * log( tmp ) ; + ++subexons[idx].lcCnt ; + subexons[idx].prev = MergePositions( subexons[idx].prev, subexons[idx].prevCnt, + se.prev, se.prevCnt, subexons[idx].prevCnt ) ; + + if ( se.rightType == 0 ) // a gene end here + { + for ( int l = idx ; l < subexonCnt ; ++l ) + { + if ( l > idx && ( subexons[l].end > subexons[l - 1].start + 1 + || subexons[l].chrId != subexons[l - 1].chrId ) ) + break ; + if ( subexons[l].rightType == 2 ) + { + double adjustAvgDepth = se.avgDepth ; + if ( se.end - se.start + 1 >= 100 ) + adjustAvgDepth += se.avgDepth * 100.0 / ( se.end - se.start + 1 ) ; + else + adjustAvgDepth *= 2 ; + double p = GetPValueOfGeneEnd( adjustAvgDepth ) ; + //if ( se.end - se.start + 1 >= 500 && p > 0.001 ) + // p = 0.001 ; + + subexons[l].rightClassifier -= 2.0 * log( p ) ; + ++subexons[l].rcCnt ; + break ; + } + } + } + } + //if ( se.chrId == 25 ) + // printf( "(%d %d %d. %d) (%d %d %d. 
%d)\n", se.chrId, se.start, se.end, se.rightType, subexons[idx].chrId, subexons[idx].start, subexons[idx].end, subexons[idx].rightType ) ; + if ( subexons[idx].rightType == 2 && se.rightType == 2 && subexons[idx].end == se.end ) + { + double tmp = se.rightClassifier ; + if ( se.rightClassifier == 0 ) + tmp = 1e-7 ; + subexons[idx].rightClassifier -= 2.0 * log( tmp ) ; + ++subexons[idx].rcCnt ; + + subexons[idx].next = MergePositions( subexons[idx].next, subexons[idx].nextCnt, + se.next, se.nextCnt, subexons[idx].nextCnt ) ; + + if ( se.leftType == 0 ) + { + for ( int l = idx ; l >= 0 ; --l ) + { + if ( l < idx && ( subexons[l].end < subexons[l + 1].start - 1 + || subexons[l].chrId != subexons[l + 1].chrId ) ) + break ; + if ( subexons[l].leftType == 1 ) + { + double adjustAvgDepth = se.avgDepth ; + if ( se.end - se.start + 1 >= 100 ) + adjustAvgDepth += se.avgDepth * 100.0 / ( se.end - se.start + 1 ) ; + else + adjustAvgDepth *= 2 ; + double p = GetPValueOfGeneEnd( adjustAvgDepth ) ; + //if ( se.end - se.start + 1 >= 500 && p >= 0.001 ) + // p = 0.001 ; + subexons[l].leftClassifier -= 2.0 * log( p ) ; + ++subexons[l].lcCnt ; + break ; + } + } + } + } + + if ( subexons[idx].leftType == 0 && subexons[idx].rightType == 0 + && se.leftType == 0 && se.rightType == 0 ) // the single-exon island. 
+ { + double tmp = se.leftClassifier ; + if ( se.leftClassifier == 0 ) + tmp = 1e-7 ; + subexons[idx].leftClassifier -= 2.0 * log( tmp ) ; + subexons[idx].rightClassifier = subexons[idx].leftClassifier ; + ++subexons[idx].lcCnt ; + ++subexons[idx].rcCnt ; + } + } + else if ( seIntervals[j].type == 1 ) + { + idx = seIntervals[j].idx ; + // Overlap on the left part of intron + if ( se.start <= intronicInfos[idx].start && se.end < intronicInfos[idx].end + && subexons[ intronicInfos[idx].leftSubexonIdx ].rightType != 0 ) + { + int len = se.end - intronicInfos[idx].start + 1 ; + intronicInfos[idx].leftOverhang.length += len ; + ++intronicInfos[idx].leftOverhang.cnt ; + + // Note that the sample subexon must have a soft boundary at right hand side, + // otherwise, this part is not an intron and won't show up in intronic Info. + if ( se.leftType == 2 ) + { + if ( se.leftRatio > 0 && se.avgDepth > 1 ) + { + ++intronicInfos[idx].leftOverhang.validCnt ; + + double update = GetUpdateMixtureGammaClassifier( se.leftRatio, se.avgDepth, + overhangPiRatio, overhangKRatio, overhangThetaRatio, + overhangPiCov, overhangKCov, overhangThetaCov, false ) ; + intronicInfos[idx].leftOverhang.classifier += update ; + } + } + else if ( se.leftType == 1 ) + { + ++intronicInfos[idx].leftOverhang.validCnt ; + double update = GetUpdateMixtureGammaClassifier( 1.0, se.avgDepth, + overhangPiRatio, overhangKRatio, overhangThetaRatio, + overhangPiCov, overhangKCov, overhangThetaCov, true ) ; + intronicInfos[idx].leftOverhang.classifier += update ; + + int seIdx = intronicInfos[idx].leftSubexonIdx ; + subexons[seIdx].rightClassifier -= 2.0 * log( GetPValueOfGeneEnd( se.avgDepth ) ) ; + ++subexons[ seIdx ].rcCnt ; + } + // ignore the contribution of single-exon island here? 
+ } + // Overlap on the right part of intron + else if ( se.start > intronicInfos[idx].start && se.end >= intronicInfos[idx].end + && subexons[ intronicInfos[idx].rightSubexonIdx ].leftType != 0 ) + { + int len = intronicInfos[idx].end - se.start + 1 ; + intronicInfos[idx].rightOverhang.length += len ; + ++intronicInfos[idx].rightOverhang.cnt ; + + // Note that the sample subexon must have a soft boundary at left hand side, + // otherwise, this won't show up in intronic Info + if ( se.rightType == 1 ) + { + if ( se.rightRatio > 0 && se.avgDepth > 1 ) + { + ++intronicInfos[idx].rightOverhang.validCnt ; + + double update = GetUpdateMixtureGammaClassifier( se.rightRatio, se.avgDepth, + overhangPiRatio, overhangKRatio, overhangThetaRatio, + overhangPiCov, overhangKCov, overhangThetaCov, false ) ; + intronicInfos[idx].rightOverhang.classifier += update ; + } + } + else if ( se.rightType == 2 ) + { + ++intronicInfos[idx].rightOverhang.validCnt ; + + double update = GetUpdateMixtureGammaClassifier( 1, se.avgDepth, + overhangPiRatio, overhangKRatio, overhangThetaRatio, + overhangPiCov, overhangKCov, overhangThetaCov, true ) ; + intronicInfos[idx].rightOverhang.classifier += update ; + + int seIdx = intronicInfos[idx].rightSubexonIdx ; + /*if ( subexons[ seIdx ].start == 6873648 ) + { + printf( "%lf %lf: %lf %lf %lf\n", subexons[seIdx].leftClassifier, GetPValueOfGeneEnd( se.avgDepth ), se.avgDepth, sqrt( se.avgDepth ), log( se.avgDepth ) ) ; + }*/ + subexons[seIdx].leftClassifier -= 2.0 * log( GetPValueOfGeneEnd( se.avgDepth ) ) ; + ++subexons[ seIdx ].lcCnt ; + } + } + // Intron is fully contained in this sample subexon, then it is a ir candidate + else if ( se.start <= intronicInfos[idx].start && se.end >= intronicInfos[idx].end ) + { + if ( se.leftType == 2 && se.rightType == 1 ) + { + double ratio = regions.PickLeftAndRightRatio( se.leftRatio, se.rightRatio ) ; + ++intronicInfos[idx].irCnt ; + if ( ratio > 0 && se.avgDepth > 1 ) + { + double update = 
GetUpdateMixtureGammaClassifier( ratio, se.avgDepth, + irPiRatio, irKRatio, irThetaRatio, + irPiCov, irKCov, irThetaCov, true ) ; + //if ( intronicInfos[idx].start == 37617368 ) + // printf( "hi %lf %d %d: %d %d\n", update, se.start, se.end, intronicInfos[idx].start, intronicInfos[idx].end ) ; + intronicInfos[idx].irClassifier += update ; + ++intronicInfos[idx].validIrCnt ; + } + } + else if ( se.leftType == 1 || se.rightType == 2 ) + { + //intronicInfos[idx].irClassifier += LogGammaDensity( 4.0, irKRatio[1], irThetaRatio[1] ) + // - LogGammaDensity( 4.0, irKRatio[0], irThetaRatio[0] ) ; + /*if ( se.start == 37617368 ) + { + printf( "%lf: %lf %lf\n", se.avgDepth, MixtureGammaAssignment( ( irKCov[0] - 1 ) * irThetaCov[0], irPiRatio, irKCov, irThetaCov ), + MixtureGammaAssignment( TransformCov( 4.0 ), irPiRatio, irKCov, irThetaCov ) ) ; + }*/ + if ( se.avgDepth > 1 ) + { + // let the depth be the threshold to determine. + double update = GetUpdateMixtureGammaClassifier( 4.0, se.avgDepth, + irPiRatio, irKRatio, irThetaRatio, + irPiCov, irKCov, irThetaCov, true ) ; + //if ( intronicInfos[idx].start == 36266630 ) + // printf( "hi %lf %d %d: %d %d\n", update, se.start, se.end, intronicInfos[idx].start, intronicInfos[idx].end ) ; + intronicInfos[idx].irClassifier += update ; + ++intronicInfos[idx].irCnt ; + ++intronicInfos[idx].validIrCnt ; + } + } + else + { + // the intron is contained in a overhang subexon from the sample or single-exon island + } + } + // sample subexon is contained in the intron. + else + { + // Do nothing. 
+ } + } + } + + //if ( se.nextCnt > 0 ) + delete[] se.next ; + //if ( se.prevCnt > 0 ) + delete[] se.prev ; + } + fclose( fp ) ; + + /*for ( i = 0 ; i < sampleSubexonCnt ; ++i ) + { + if ( sampleSubexons[i].nextCnt > 0 ) + delete[] sampleSubexons[i].next ; + if ( sampleSubexons[i].prevCnt > 0 ) + delete[] sampleSubexons[i].prev ; + }*/ + } + + CleanUpSubexonConnections( subexons ) ; + + // Convert the temporary statistics number into formal statistics result. + for ( i = 0 ; i < subexonCnt ; ++i ) + { + struct _subexon &se = subexons[i] ; + if ( se.leftType == 0 && se.rightType == 0 ) // single-exon txpt. + { + se.leftClassifier = se.rightClassifier = 1 - chicdf( se.rightClassifier, 2 * se.rcCnt ) ; + } + else + { + if ( se.leftType == 1 ) + { + se.leftClassifier = 1 - chicdf( se.leftClassifier, 2 * se.lcCnt ) ; + } + else + se.leftClassifier = -1 ; + + if ( se.rightType == 2 ) + se.rightClassifier = 1 - chicdf( se.rightClassifier, 2 * se.rcCnt ) ; + else + se.rightClassifier = -1 ; + } + } + + int iiCnt = intronicInfos.size() ; //intronicInfo count + for ( i = 0 ; i < iiCnt ; ++i ) + { + struct _intronicInfo &ii = intronicInfos[i] ; + if ( ii.validIrCnt > 0 ) + { + for ( j = 0 ; j < fileCnt - ii.validIrCnt ; ++j ) + { + ii.irClassifier -= log( 10.0 ) ; + } + /*if ( ii.validIrCnt < fileCnt * 0.15 ) + ii.irClassifier -= log( 1000.0 ) ; + else if ( ii.validIrCnt < fileCnt * 0.5 ) + ii.irClassifier -= log( 100.0 ) ;*/ + ii.irClassifier = (double)1.0 / ( 1.0 + exp( ii.irClassifier + log( 1 - avgIrPiRatio ) - log( avgIrPiRatio ) ) ) ; + } + else + ii.irClassifier = -1 ; + + if ( ii.leftOverhang.validCnt > 0 ) + ii.leftOverhang.classifier = (double)1.0 / ( 1.0 + exp( ii.leftOverhang.classifier + + log( 1 - avgOverhangPiRatio ) - log( avgOverhangPiRatio ) ) ) ; + else + ii.leftOverhang.classifier = -1 ; + + if ( ii.rightOverhang.validCnt > 0 ) + ii.rightOverhang.classifier = (double)1.0 / ( 1.0 + exp( ii.rightOverhang.classifier + + log( 1 - avgOverhangPiRatio ) - log( 
avgOverhangPiRatio ) ) ) ; + else + ii.rightOverhang.classifier = -1 ; + } + + // Change the classifier for the hard boundaries if its adjacent intron has intron retention classifier + // which collide with overhang subexon. + int intervalCnt = seIntervals.size() ; + for ( i = 0 ; i < intervalCnt ; ++i ) + { + if ( seIntervals[i].type == 1 && intronicInfos[ seIntervals[i].idx ].irCnt > 0 ) + { + int idx = seIntervals[i].idx ; + if ( intronicInfos[idx].leftOverhang.cnt > 0 ) + { + int k = seIntervals[i - 1].idx ; + // Should aim for more conservative? + if ( subexons[k].rightClassifier > intronicInfos[idx].leftOverhang.classifier ) + subexons[k].rightClassifier = intronicInfos[idx].leftOverhang.classifier ; + } + + if ( intronicInfos[idx].rightOverhang.cnt > 0 ) + { + int k = seIntervals[i + 1].idx ; + if ( subexons[k].leftClassifier > intronicInfos[idx].rightOverhang.classifier ) + subexons[k].leftClassifier = intronicInfos[idx].rightOverhang.classifier ; + } + } + } + + // Output the result. + for ( i = 0 ; i < intervalCnt ; ++i ) + { + if ( seIntervals[i].type == 0 ) + { + struct _subexon &se = subexons[ seIntervals[i].idx ] ; + + char ls, rs ; + + ls = StrandNumToSymbol( se.leftStrand ) ; + rs = StrandNumToSymbol( se.rightStrand ) ; + + printf( "%s %d %d %d %d %c %c -1 -1 -1 %lf %lf ", alignments.GetChromName( se.chrId ), se.start, se.end, + se.leftType, se.rightType, ls, rs, se.leftClassifier, se.rightClassifier ) ; + if ( i > 0 && seIntervals[i - 1].chrId == seIntervals[i].chrId + && seIntervals[i - 1].end + 1 == seIntervals[i].start + && !( seIntervals[i - 1].type == 0 && + subexons[ seIntervals[i - 1].idx ].rightType != se.leftType ) + && !( seIntervals[i - 1].type == 1 && intronicInfos[ seIntervals[i - 1].idx ].irCnt == 0 + && intronicInfos[ seIntervals[i - 1].idx ].rightOverhang.cnt == 0 ) + && ( se.prevCnt == 0 || se.start - 1 != se.prev[ se.prevCnt - 1 ] ) ) // The connection showed up in the subexon file. 
+ { + printf( "%d ", se.prevCnt + 1 ) ; + for ( j = 0 ; j < se.prevCnt ; ++j ) + printf( "%d ", se.prev[j] ) ; + printf( "%d ", se.start - 1 ) ; + } + else + { + printf( "%d ", se.prevCnt ) ; + for ( j = 0 ; j < se.prevCnt ; ++j ) + printf( "%d ", se.prev[j] ) ; + } + + if ( i < intervalCnt - 1 && seIntervals[i].chrId == seIntervals[i + 1].chrId + && seIntervals[i].end == seIntervals[i + 1].start - 1 + && !( seIntervals[i + 1].type == 0 && + subexons[ seIntervals[i + 1].idx ].leftType != se.rightType ) + && !( seIntervals[i + 1].type == 1 && intronicInfos[ seIntervals[i + 1].idx ].irCnt == 0 + && intronicInfos[ seIntervals[i + 1].idx ].leftOverhang.cnt == 0 ) + && ( se.nextCnt == 0 || se.end + 1 != se.next[0] ) ) + { + printf( "%d %d ", se.nextCnt + 1, se.end + 1 ) ; + } + else + printf( "%d ", se.nextCnt ) ; + for ( j = 0 ; j < se.nextCnt ; ++j ) + printf( "%d ", se.next[j] ) ; + printf( "\n" ) ; + } + else if ( seIntervals[i].type == 1 ) + { + struct _intronicInfo &ii = intronicInfos[ seIntervals[i].idx ] ; + if ( ii.irCnt > 0 ) + { + printf( "%s %d %d 2 1 . . -1 -1 -1 %lf %lf 1 %d 1 %d\n", + alignments.GetChromName( ii.chrId ), ii.start, ii.end, + ii.irClassifier, ii.irClassifier, + seIntervals[i - 1].end, seIntervals[i + 1].start ) ; + } + else + { + // left overhang. + if ( ii.leftOverhang.cnt > 0 ) + { + printf( "%s %d %d 2 0 . . -1 -1 -1 %lf %lf 1 %d 0\n", + alignments.GetChromName( ii.chrId ), ii.start, + ii.start + ( ii.leftOverhang.length / ii.leftOverhang.cnt ) - 1, + ii.leftOverhang.classifier, ii.leftOverhang.classifier, + ii.start - 1 ) ; + } + + // right overhang. + if ( ii.rightOverhang.cnt > 0 ) + { + printf( "%s %d %d 0 1 . . -1 -1 -1 %lf %lf 0 1 %d\n", + alignments.GetChromName( ii.chrId ), + ii.end - ( ii.rightOverhang.length / ii.rightOverhang.cnt ) + 1, ii.end, + ii.rightOverhang.classifier, ii.rightOverhang.classifier, + ii.end + 1 ) ; + } + + } + } + } + + return 0 ; +} +
// --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/Constraints.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,438 @@
#include "Constraints.hpp"

// Convert the segments of one alignment into a bit vector over the subexons.
//
// segments/segCnt: the aligned segments; each is an interval [a, b].
// subexons/seCnt:  the subexons of the current region.
// seStart:         index of the first subexon to examine.
// ct:              output. ct.vector is (re)initialized with seCnt bits and a bit is set for
//                  every subexon covered; ct.first and ct.last receive the indices of the
//                  first and last covered subexon.
//
// return whether this constraint is compatible with the subexons.
// On a false return ct.vector has been initialized but is not released here; the caller in
// BuildConstraints releases it.
bool Constraints::ConvertAlignmentToBitTable( struct _pair *segments, int segCnt,
	struct _subexon *subexons, int seCnt, int seStart, struct _constraint &ct )
{
	int i, j, k ;
	k = seStart ;
	ct.vector.Init( seCnt ) ;
	// Each segment of an alignment can cover several subexons.
	// But the first and last segment can partially cover a subexon.
	for ( i = 0 ; i < segCnt ; ++i )
	{
		int leftIdx, rightIdx ; // the range of subexons covered by this segment.
		leftIdx = -1 ;
		rightIdx = -1 ;

		// k is not reset between segments: the scan resumes where the previous segment stopped.
		for ( ; k < seCnt ; ++k )
		{
			//if ( segments[0].b == 110282529 && segCnt == 2 )
			//	printf( "(%d:%d %d):(%d:%d %d)\n", i, (int)segments[i].a, (int)segments[i].b, k, (int)subexons[k].start, (int)subexons[k].end ) ;
			if ( subexons[k].start > segments[i].b )
				break ;
			if ( segments[i].a > subexons[k].end )
				continue ;

			// With relaxedWidth == 0 the four cases below are:
			//  - the subexon lies entirely inside the segment;
			//  - first segment: the subexon may stick out on the left;
			//  - last segment: the subexon may stick out on the right;
			//  - single-segment alignment: the subexon may stick out on both sides.
			int relaxedWidth = 0 ;
			if ( ( subexons[k].start >= segments[i].a && subexons[k].end <= segments[i].b )
				|| ( i == 0 && subexons[k].start - relaxedWidth < segments[i].a && subexons[k].end <= segments[i].b )
				|| ( i == segCnt - 1 && subexons[k].start >= segments[i].a && subexons[k].end + relaxedWidth > segments[i].b )
				|| ( i == 0 && i == segCnt - 1 && subexons[k].start - relaxedWidth < segments[i].a && subexons[k].end + relaxedWidth > segments[i].b ) )
			{
				if ( leftIdx == -1 )
					leftIdx = k ;
				rightIdx = k ;
				ct.vector.Set( k ) ;
			}
			else
			{
				// The segment boundary falls inside a subexon where that is not allowed.
				return false ;
			}
		}

		// The segment does not overlap any subexon.
		if ( leftIdx == -1 )
			return false ;

		// The cover contradicts the boundary: unless the boundary type is 0, the outermost
		// covered subexons must reach the ends of the segment.
		if ( !( ( subexons[leftIdx].leftType == 0 || subexons[leftIdx].start <= segments[i].a )
			&& ( subexons[rightIdx].rightType == 0 || subexons[rightIdx].end >= segments[i].b ) ) )
			return false ;

		// The intron must exist in the subexon graph: the last subexon of the previous
		// segment (ct.last) must list leftIdx among its next[] entries.
		if ( i > 0 )
		{
			for ( j = 0 ; j < subexons[ ct.last ].nextCnt ; ++j )
				if ( subexons[ct.last].next[j] == leftIdx )
					break ;
			if ( j >= subexons[ ct.last ].nextCnt )
				return false ;
		}

		// The subexons must be consecutive
		for ( j = leftIdx + 1 ; j <= rightIdx ; ++j )
			if ( subexons[j].start > subexons[j - 1].end + 1 )
				return false ;

		if ( i == 0 )
			ct.first = leftIdx ;
		ct.last = rightIdx ;
	}
	return true ;
}

// Merge constraints that have identical subexon vectors, then remap and merge the mate
// pairs (and the pending mate read ids) so that they refer to the compacted indices.
void Constraints::CoalesceSameConstraints()
{
	int i, k ;
	int size = constraints.size() ;
	// Remember every constraint's index before sorting so that newIdx can be built.
	for ( i = 0 ; i < size ; ++i )
	{
		constraints[i].info = i ;
		//printf( "constraints %d: %d %d %d\n", i, constraints[i].vector.Test( 0 ), constraints[i].vector.Test(1), constraints[i].support ) ;
	}

	// newIdx[old index] = index after coalescing.
	std::vector<int> newIdx ;
	newIdx.resize( size, 0 ) ;

	// Update the constraints.
	if ( size > 0 )
	{
		std::sort( constraints.begin(), constraints.end(), CompSortConstraints ) ;

		// k is the last slot of the compacted prefix; i scans the sorted list.
		k = 0 ;
		newIdx[ constraints[0].info ] = 0 ;
		for ( i = 1 ; i < size ; ++i )
		{
			if ( constraints[k].vector.IsEqual( constraints[i].vector ) )
			{
				// Duplicate of slot k: accumulate its statistics and release its bit table.
				constraints[k].weight += constraints[i].weight ;
				constraints[k].support += constraints[i].support ;
				constraints[k].uniqSupport += constraints[i].uniqSupport ;
				constraints[k].maxReadLen = ( constraints[k].maxReadLen > constraints[i].maxReadLen ) ?
					constraints[k].maxReadLen : constraints[i].maxReadLen ;
				constraints[i].vector.Release() ;
			}
			else
			{
				++k ;
				if ( k != i )
					constraints[k] = constraints[i] ;
			}
			newIdx[ constraints[i].info ] = k ;
		}
		constraints.resize( k + 1 ) ;
	}

	// Update the mate pairs.
	size = matePairs.size() ;
	if ( size > 0 )
	{
		for ( i = 0 ; i < size ; ++i )
		{
			//printf( "%d %d: %d => %d | %d =>%d\n", i, newIdx.size(), matePairs[i].i, newIdx[ matePairs[i].i ],
			//	matePairs[i].j, newIdx[ matePairs[i].j ] ) ;
			matePairs[i].i = newIdx[ matePairs[i].i ] ;
			matePairs[i].j = newIdx[ matePairs[i].j ] ;
		}

		std::sort( matePairs.begin(), matePairs.end(), CompSortMatePairs ) ;

		// Same compaction as above: equal (i, j) pairs are merged into slot k.
		k = 0 ;
		for ( i = 1 ; i < size ; ++i )
		{
			if ( matePairs[i].i == matePairs[k].i && matePairs[i].j == matePairs[k].j )
			{
				matePairs[k].support += matePairs[i].support ;
				matePairs[k].uniqSupport += matePairs[i].uniqSupport ;
			}
			else
			{
				++k ;
				matePairs[k] = matePairs[i] ;
			}
		}
		//printf( "%s: %d\n", __func__, matePairs[1].i) ;
		matePairs.resize( k + 1 ) ;
	}

	// Update the data structure for future mate pairs.
	mateReadIds.UpdateIdx( newIdx ) ;
}

// Compute normAbund (weight divided by an effective length) for every constraint and derive
// normAbund for every mate pair from its two constraints. abundance is set to normAbund.
void Constraints::ComputeNormAbund( struct _subexon *subexons )
{
	int i, j ;
	int ctSize = constraints.size() ;
	for ( i = 0 ; i < ctSize ; ++i )
	{
		// spanned more than 2 subexons: subtract the lengths of the interior subexons
		// (all set bits except the first and the last) from the read length.
		int readLen = constraints[i].maxReadLen ;
		if ( constraints[i].first + 1 < constraints[i].last )
		{
			std::vector<int> subexonInd ;
			constraints[i].vector.GetOnesIndices( subexonInd ) ;
			int size = subexonInd.size() ;
			for ( j = 1 ; j < size - 1 ; ++j )
			{
				int a = subexonInd[j] ;
				readLen -= ( subexons[a].end - subexons[a].start + 1 ) ;
			}
		}

		int effectiveLength ;
		if ( constraints[i].first == constraints[i].last )
		{
			// Single subexon: number of start positions at which the read still fits.
			effectiveLength = ( subexons[ constraints[i].first ].end - readLen + 1 )- subexons[ constraints[i].first ].start + 1 ;
			if ( effectiveLength <= 0 ) // this happens in the 3',5'-end subexon, where we trimmed the length
				effectiveLength = ( subexons[ constraints[i].first ].end - subexons[ constraints[i].first ].start + 1 ) / 2 + 1 ;
		}
		else
		{
			int a = constraints[i].first ;
			int b = constraints[i].last ;
			int start, end ; // the range of the possible start sites of a read in subexons[a].
			start = subexons[a].end + 1 - ( readLen - 1 ) ;
			if ( start < subexons[a].start )
				start = subexons[a].start ;

			if ( subexons[b].end - subexons[b].start + 1 >= readLen - 1 || subexons[b].rightType == 0 )
				end = subexons[a].end ;
			else
			{
				end = subexons[a].end + 1 - ( readLen - ( subexons[b].end - subexons[b].start + 1 ) ) ;
			}

			if ( end < start ) // when we trimmed the subexon.
				end = subexons[a].start ;

			effectiveLength = end - start + 1 ;
		}
		//printf( "%d: effectiveLength=%d support=%d\n", i, effectiveLength, constraints[i].support ) ;
		constraints[i].normAbund = (double)constraints[i].weight / (double)effectiveLength ;

		if ( ( subexons[ constraints[i].first ].leftType == 0 && subexons[ constraints[i].first ].end - subexons[ constraints[i].first ].start + 1 >= 8 * pAlignments->readLen )
			|| ( subexons[ constraints[i].last ].rightType == 0 && subexons[ constraints[i].last ].end - subexons[ constraints[i].last ].start + 1 >= 8 * pAlignments->readLen ) ) // some random elongation of the sequence might make unnecessary long effective length.
		{
			constraints[i].normAbund *= 2 ;
		}
		constraints[i].abundance = constraints[i].normAbund ;
	}

	ctSize = matePairs.size() ;
	for ( i = 0 ; i < ctSize ; ++i )
	{
		double a = constraints[ matePairs[i].i ].normAbund ;
		double b = constraints[ matePairs[i].j ].normAbund ;

		// Default: the smaller of the two mates' abundances.
		matePairs[i].normAbund = a < b ? a : b ;

		// Exception: when the smaller value comes from a mate confined to a single subexon
		// whose outer boundary type is 0, the other mate's value is used instead.
		if ( matePairs[i].i != matePairs[i].j )
		{
			if ( subexons[ constraints[ matePairs[i].i ].first ].leftType == 0
				&& constraints[ matePairs[i].i ].first == constraints[ matePairs[i].i ].last
				&& a < b )
			{
				matePairs[i].normAbund = b ;
			}
			else if ( subexons[ constraints[ matePairs[i].j ].last ].rightType == 0
				&& constraints[ matePairs[i].j ].first == constraints[ matePairs[i].j ].last
				&& a > b )
			{
				matePairs[i].normAbund = a ;
			}
		}
		//matePairs[i].normAbund = sqrt( a * b ) ;

		matePairs[i].abundance = matePairs[i].normAbund ;
	}
}

// Build the constraints and mate-pair constraints for one region of subexons from the
// alignments reachable through pAlignments.
//
// subexons/seCnt: the subexons of the region; subexons[0].chrId selects the chromosome.
// start, end:     not read anywhere in this function.
// Always returns 0. Results are left in the members constraints and matePairs.
int Constraints::BuildConstraints( struct _subexon *subexons, int seCnt, int start, int end )
{
	int i ;
	int tag = 0 ; // index of the first subexon that can still overlap the current alignment.
	int coalesceThreshold = 16384 ; // minimum list size before an intermediate coalescing is tried.
	Alignments &alignments = *pAlignments ;
	// Release the memory from previous gene.
	int size = constraints.size() ;

	if ( size > 0 )
	{
		for ( i = 0 ; i < size ; ++i )
			constraints[i].vector.Release() ;
		std::vector<struct _constraint>().swap( constraints ) ;
	}
	std::vector<struct _matePairConstraint>().swap( matePairs ) ;
	mateReadIds.Clear() ;

	// Start to build the constraints.
	// callNext is false on the first iteration unless the reader is at its beginning, so the
	// alignment the reader is currently positioned on is examined before advancing.
	bool callNext = false ; // the last used alignment
	if ( alignments.IsAtBegin() )
		callNext = true ;
	while ( !alignments.IsAtEnd() )
	{
		if ( callNext )
		{
			if ( !alignments.Next() )
				break ;
		}
		else
			callNext = true ;

		if ( alignments.GetChromId() < subexons[0].chrId )
			continue ;
		else if ( alignments.GetChromId() > subexons[0].chrId )
			break ;
		// locate the first subexon in this region that overlaps with current alignment.
		for ( ; tag < seCnt && subexons[tag].end < alignments.segments[0].a ; ++tag )
			;

		// The alignment starts after the last subexon: stop. The current alignment is left
		// unconsumed in the reader.
		if ( tag >= seCnt )
			break ;
		// The alignment ends before the next subexon: it overlaps nothing, skip it.
		if ( alignments.segments[ alignments.segCnt - 1 ].b < subexons[tag].start )
			continue ;

		int uniqSupport = 0 ;
		if ( usePrimaryAsUnique )
			uniqSupport = alignments.IsPrimary() ? 1 : 0 ;
		else
			uniqSupport = alignments.IsUnique() ? 1 : 0 ;

		struct _constraint ct ;
		// NOTE(review): ConvertAlignmentToBitTable calls ct.vector.Init( seCnt ) again; whether
		// the second Init frees this allocation depends on BitTable::Init - confirm.
		ct.vector.Init( seCnt ) ;
		//printf( "%s %d: %lld-%lld | %d-%d\n", __func__, alignments.segCnt, alignments.segments[0].a, alignments.segments[0].b, subexons[tag].start, subexons[tag].end ) ;
		ct.weight = 1.0 / alignments.GetNumberOfHits() ;
		if ( alignments.IsGCRich() )
			ct.weight *= 10 ;
		ct.normAbund = 0 ;
		ct.support = 1 ;
		ct.uniqSupport = uniqSupport ;
		ct.maxReadLen = alignments.GetRefCoverLength() ;

		if ( alignments.IsPrimary() && ConvertAlignmentToBitTable( alignments.segments, alignments.segCnt,
				subexons, seCnt, tag, ct ) )
		{

			//printf( "%s ", alignments.GetReadId() ) ;
			//ct.vector.Print() ;


			// If the alignment has clipped end or tail. We only keep those clipped in the 3'/5'-end
			// A clip is rejected when the clipped subexon has an adjacent or linked neighbour on
			// the other side, or when the clipped segment is at most a third of the covered length.
			bool validClip = true ;
			if ( alignments.HasClipHead() )
			{
				if ( ( ct.first < seCnt - 1 && subexons[ct.first].end + 1 == subexons[ct.first + 1].start )
					|| subexons[ct.first].prevCnt > 0
					|| alignments.segments[0].b - alignments.segments[0].a + 1 <= alignments.GetRefCoverLength() / 3.0 )
					validClip = false ;
			}
			if ( alignments.HasClipTail() )
			{
				int tmp = alignments.segCnt - 1 ;
				if ( ( ct.last > 0 && subexons[ct.last].start - 1 == subexons[ct.last - 1].end )
					|| subexons[ct.last].nextCnt > 0
					|| alignments.segments[tmp].b - alignments.segments[tmp].a + 1 <= alignments.GetRefCoverLength() / 3.0 )
					validClip = false ;
			}

			if ( validClip )
			{
				constraints.push_back( ct ) ; // if we just coalesced but the list size does not decrease, this will force capacity increase.
				//if ( !strcmp( alignments.GetReadId(), "ERR188021.8489052" ) )
				//	ct.vector.Print() ;
				// Add the mate-pair information.
				int mateChrId ;
				int64_t matePos ;
				alignments.GetMatePosition( mateChrId, matePos ) ;
				if ( alignments.GetChromId() == mateChrId )
				{
					if ( matePos < alignments.segments[0].a )
					{
						// The mate lies upstream: look it up by this read's own start position,
						// which is the matePos the upstream mate was stored under.
						int mateIdx = mateReadIds.Query( alignments.GetReadId(), alignments.segments[0].a ) ;
						if ( mateIdx != -1 )
						{
							struct _matePairConstraint nm ;
							nm.i = mateIdx ;
							nm.j = constraints.size() - 1 ;
							nm.abundance = 0 ;
							nm.support = 1 ;
							nm.uniqSupport = uniqSupport ;
							nm.effectiveCount = 2 ;
							matePairs.push_back( nm ) ;
						}
					}
					else if ( matePos > alignments.segments[0].a )
					{
						// The mate lies downstream: remember this read until the mate shows up.
						// NOTE(review): matePos is int64_t but Insert takes int - confirm the range.
						mateReadIds.Insert( alignments.GetReadId(), alignments.segments[0].a, constraints.size() - 1, matePos ) ;
					}
					else // two mates have the same coordinate.
					{
						// Only the first mate records the pair, so it is counted once.
						if ( alignments.IsFirstMate() )
						{
							struct _matePairConstraint nm ;
							nm.i = constraints.size() - 1 ;
							nm.j = constraints.size() - 1 ;
							nm.abundance = 0 ;
							nm.support = 1 ;
							nm.uniqSupport = uniqSupport ;
							nm.effectiveCount = 2 ;
							matePairs.push_back( nm ) ;
						}
					}
				}
			}
			else
				ct.vector.Release() ;

			// Coalesce if necessary.
			// Only when the list is large and the vector is exactly full, i.e. the next
			// push_back would reallocate.
			size = constraints.size() ;
			if ( (int)size > coalesceThreshold && size == (int)constraints.capacity() )
			{

				//printf( "start coalescing. %d\n", constraints.capacity() ) ;
				CoalesceSameConstraints() ;

				// Not coalesce enough
				if ( constraints.size() >= constraints.capacity() / 2 )
				{
					coalesceThreshold *= 2 ;
				}
			}
		}
		else
		{
			//printf( "not compatible\n" ) ;
			ct.vector.Release() ;
		}
	}
	//printf( "start coalescing. %d %d\n", constraints.size(), matePairs.size() ) ;
	CoalesceSameConstraints() ;
	//printf( "after coalescing. %d %d\n", constraints.size(), matePairs.size() ) ;
	//for ( i = 0 ; i < matePairs.size() ; ++i )
	//	printf( "matePair: %d %d %d\n", matePairs[i].i, matePairs[i].j, matePairs[i].support ) ;
	// single-end data set
	// When fragStdev is 0, every constraint is paired with itself (effectiveCount 1).
	//if ( matePairs.size() == 0 )
	if ( alignments.fragStdev == 0 )
	{
		int size = constraints.size() ;
		matePairs.clear() ;

		for ( i = 0 ; i < size ; ++i )
		{
			struct _matePairConstraint nm ;
			nm.i = i ;
			nm.j = i ;
			nm.abundance = 0 ;
			nm.support = constraints[i].support ;
			nm.uniqSupport = constraints[i].uniqSupport ;
			nm.effectiveCount = 1 ;

			matePairs.push_back( nm ) ;
		}
	}

	ComputeNormAbund( subexons ) ;

	/*for ( i = 0 ; i < constraints.size() ; ++i )
	{
		printf( "constraints %d: %lf %d %d %d ", i, constraints[i].normAbund, constraints[i].first, constraints[i].last, constraints[i].support ) ;
		constraints[i].vector.Print() ;
	}

	for ( i = 0 ; i < matePairs.size() ; ++i )
	{
		printf( "mates %d: %lf %d %d %d %d\n", i, matePairs[i].normAbund, matePairs[i].i, matePairs[i].j, matePairs[i].support, matePairs[i].uniqSupport ) ;
	}*/

	return 0 ;
}
// --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/Constraints.hpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,445 @@
#ifndef _MOURISL_CLASSES_CONSTRAINTS_HEADER
#define _MOURISL_CLASSES_CONSTRAINTS_HEADER

#include <vector>
#include <map>
#include <algorithm>
#include <string>
#include <string.h>

#include "BitTable.hpp"
#include "alignments.hpp"
#include "SubexonGraph.hpp"

// One constraint: the set of subexons covered by an alignment, plus the statistics
// accumulated over all alignments that produced the same set.
struct _constraint
{
	BitTable vector ; // subexon vector
	double weight ; // sum of the per-alignment weights.
	double normAbund ; // weight normalized by an effective length.
	double abundance ;
	int support ; // number of alignments merged into this constraint.
	int uniqSupport ; // how many of those counted as unique (or primary, depending on the setting).
	int maxReadLen ; // the longest read length support this constraint.

	int info ; // other usages.
	int first, last ; // indicate the first and last index of the subexons.
} ;

// A pair of constraints contributed by the two mates of a fragment.
struct _matePairConstraint
{
	int i, j ; // indices into the constraints list.

	int support ;
	int uniqSupport ;

	double abundance ;
	double normAbund ;
	int effectiveCount ; // set to 2 for real mate pairs and 1 for single-end self pairs.
	int type ;
} ;

// One entry of the heap kept by MateReadIds.
struct _readIdHeap
{
	char *readId ; // heap-allocated copy of the read id; NULL for the dummy entry.
	int pos ; // position passed to Insert together with the id.
	int matePos ; // position of the mate; the heap is ordered by this value.
	int idx ; // index of the read's constraint.
} ;

//----------------------------------------------------------------------------------
// We assume the access to the data structure is sorted by matePos.
// So we can "pre-cache" the ids with the same matePos.
+class MateReadIds +{ +private: + std::vector< struct _readIdHeap > heap ; + + void HeapifyUp( int tag ) + { + while ( tag > 1 ) + { + if ( heap[tag / 2].matePos < heap[tag].matePos ) + return ; + struct _readIdHeap tmp ; + tmp = heap[tag / 2] ; + heap[tag / 2] = heap[tag] ; + heap[tag] = tmp ; + + tag /= 2 ; + } + } + + void HeapifyDown( int tag ) + { + int size = heap.size() ; + while ( 2 * tag < size ) + { + int choose = 2 * tag ; + if ( 2 * tag + 1 < size && + heap[ 2 * tag + 1].matePos < heap[2 * tag ].matePos ) + { + choose = 2 * tag + 1 ; + } + + if ( heap[tag].matePos < heap[choose].matePos ) + return ; + + struct _readIdHeap tmp ; + tmp = heap[choose] ; + heap[choose] = heap[tag] ; + heap[tag] = tmp ; + + tag = choose ; + } + } + + struct _readIdHeap Pop() + { + struct _readIdHeap ret ; + int size = heap.size() ; + if ( size < 2 ) + { + ret.readId = NULL ; + return ret ; + } + + ret = heap[1] ; + + heap[1] = heap[ heap.size() - 1] ; + heap.pop_back() ; + HeapifyDown( 1 ) ; + + return ret ; + } + + int cachedMatePos ; + std::map<std::string, int> cachedIdx ; + bool hasMateReadIdSuffix ; // ignore the last ".{1,2}" or "/{1,2}" . +public: + MateReadIds() + { + // Push a dummy element so the vector becomes 1-based. 
+ struct _readIdHeap nh ; + nh.readId = NULL ; + nh.pos = nh.idx = nh.matePos = -1 ; + heap.push_back( nh ) ; + cachedMatePos = -1 ; + hasMateReadIdSuffix = false ; + } + ~MateReadIds() + { + int size = heap.size() ; + std::map<std::string, int>().swap( cachedIdx ) ; + for ( int i = 0 ; i < size ; ++i ) + if ( heap[i].readId != NULL ) + free( heap[i].readId ) ; + } + + void Clear() + { + std::map<std::string, int>().swap( cachedIdx ) ; + + int size = heap.size() ; + for ( int i = 0 ; i < size ; ++i ) + if ( heap[i].readId != NULL ) + free( heap[i].readId ) ; + std::vector<struct _readIdHeap>().swap( heap ) ; + + struct _readIdHeap nh ; + nh.readId = NULL ; + nh.pos = nh.idx = nh.matePos = -1 ; + heap.push_back( nh ) ; + cachedMatePos = -1 ; + } + + void Insert( char *id, int pos, int idx, int matePos ) + { + struct _readIdHeap nh ; + nh.readId = strdup( id ) ; + nh.pos = pos ; + nh.idx = idx ; + nh.matePos = matePos ; + + heap.push_back( nh ) ; + HeapifyUp( heap.size() - 1 ) ; + } + + // If the id does not exist, return -1. + int Query( char *id, int matePos ) + { + int size ; + size = heap.size() ; + if ( matePos > cachedMatePos ) + { + std::map<std::string, int>().swap( cachedIdx ) ; + + while ( size >= 2 && heap[1].matePos < matePos ) + { + struct _readIdHeap r = Pop() ; + if ( r.readId ) + { + free( r.readId ) ; + } + --size ; + } + + while ( size >= 2 && heap[1].matePos == matePos ) + { + struct _readIdHeap r = Pop() ; + cachedIdx[ std::string( r.readId ) ] = r.idx ; + if ( r.readId ) + free( r.readId ) ; + --size ; + } + cachedMatePos = matePos ; + } + std::string s( id ) ; + if ( hasMateReadIdSuffix ) + { + int len = s.length() ; + if ( len >= 2 && ( s[len - 1] == '1' || s[len - 1] == '2' ) + && ( s[len - 2] == '.' 
|| s[len - 2] == '/' ) ) + { + s[len - 1] = '2' - s[len - 1] + '1' ; + } + } + + if ( cachedIdx.find( s ) != cachedIdx.end() ) + { + return cachedIdx[s] ; + } + return -1 ; + } + + void UpdateIdx( std::vector<int> &newIdx ) + { + int size = heap.size() ; + int i ; + for ( i = 1 ; i < size ; ++i ) + { + heap[i].idx = newIdx[ heap[i].idx ] ; + } + + for ( std::map<std::string, int>::iterator it = cachedIdx.begin() ; it != cachedIdx.end() ; ++it ) + it->second = newIdx[ it->second ] ; + } + + void SetHasMateReadIdSuffix( bool in ) + { + hasMateReadIdSuffix = true ; + } +} ; + + +//-------------------------------------------------------------------------- +class Constraints +{ +private: + int prevStart, prevEnd ; + bool usePrimaryAsUnique ; + MateReadIds mateReadIds ; + + Alignments *pAlignments ; + + //@return: whether this alignment is compatible with the subexons or not. + bool ConvertAlignmentToBitTable( struct _pair *segments, int segCnt, struct _subexon *subexons, int seCnt, int seStart, struct _constraint &ct ) ; + + // Sort to increasing order. Since the first subexon occupies the least important digit. + static bool CompSortConstraints( const struct _constraint &a, const struct _constraint &b ) + { + //int k + if ( a.first < b.first ) + return true ; + else if ( a.first > b.first ) + return false ; + + int diffPos = a.vector.GetFirstDifference( b.vector ) ; + if ( diffPos == -1 ) // case of equal. 
+ return false ; + + if ( a.vector.Test( diffPos )) + return false ; + else + return true ; + } + + static bool CompSortMatePairs( const struct _matePairConstraint &a, const struct _matePairConstraint &b ) + { + if ( a.i < b.i ) + return true ; + else if ( a.i > b.i ) + return false ; + else + { + if ( a.j < b.j ) + return true ; + else + return false ; + } + } + + void CoalesceSameConstraints() ; + void ComputeNormAbund( struct _subexon *subexons ) ; +public: + std::vector<struct _constraint> constraints ; + std::vector<struct _matePairConstraint> matePairs ; + + Constraints() + { + usePrimaryAsUnique = false ; + } + + Constraints( Alignments *a ): pAlignments( a ) + { + } + + ~Constraints() + { + int i ; + int size = constraints.size() ; + for ( i = 0 ; i < size ; ++i ) + constraints[i].vector.Release() ; + constraints.clear() ; + std::vector<struct _constraint>().swap( constraints ) ; + matePairs.clear() ; + std::vector<struct _matePairConstraint>().swap( matePairs ) ; + } + + void Clear() + { + //TODO: do I need to release the memory from BitTable? 
+ constraints.clear() ; + } + + void SetAlignments( Alignments *a ) + { + pAlignments = a ; + } + + void SetUsePrimaryAsUnique( bool in ) + { + usePrimaryAsUnique = in ; + } + + void Assign( Constraints &c ) + { + int i ; + int size = constraints.size() ; + if ( size > 0 ) + { + for ( i = 0 ; i < size ; ++i ) + constraints[i].vector.Release() ; + constraints.clear() ; + std::vector<struct _constraint>().swap( constraints ) ; + } + matePairs.clear() ; + std::vector<struct _matePairConstraint>().swap( matePairs ) ; + + //constraints.resize( c.constraints.size() ) ; + constraints = c.constraints ; + size = c.constraints.size() ; + for ( i = 0 ; i < size ; ++i ) + { + /*struct _constraint nc ; + nc.weight = c.constraints[i].weight ; + nc.normAbund = c.constraints[i].normAbund ; + nc.abundance = c.constraints[i].abundance ; + nc.support = c.constraints[i].support ; + nc.uniqSupport = c.constraints[i].uniqSupport ; + nc.maxReadLen = c.constraints[i].maxReadLen ; + nc.info = c.constraints[i].info ; + nc.first = c.constraints[i].first ; + nc.last = c.constraints[i].last ; + nc.vector.Duplicate( c.constraints[i].vector ) ; + constraints[i] = ( nc ) ; */ + constraints[i].vector.Nullify() ; // so that it won't affect the BitTable in "c" + constraints[i].vector.Duplicate( c.constraints[i].vector ) ; + } + matePairs = c.matePairs ; + pAlignments = c.pAlignments ; + } + + void DownsampleConstraintsFrom( Constraints &c, int stride = 10 ) + { + int i ; + int size = constraints.size(), k ; + + if ( size > 0 ) + { + for ( i = 0 ; i < size ; ++i ) + constraints[i].vector.Release() ; + constraints.clear() ; + std::vector<struct _constraint>().swap( constraints ) ; + } + matePairs.clear() ; + std::vector<struct _matePairConstraint>().swap( matePairs ) ; + + //constraints.resize( c.constraints.size() ) ; + //constraints = c.constraints ; + k = 0 ; + size = c.constraints.size() ; + for ( i = 0 ; i < size ; i += stride, ++k ) + { + constraints.push_back( c.constraints[i] ) ; + 
constraints[k].vector.Nullify() ; // so that it won't affect the BitTable in "c" + constraints[k].vector.Duplicate( c.constraints[i].vector ) ; + + /*std::vector<int> seIdx ; + constraints[k].vector.GetOnesIndices( seIdx ) ; + int j, l = seIdx.size() ; + for ( j = 2 ; j < l ; ++j ) + { + constraints[k].vector.Unset( seIdx[j] ) ; + } + constraints[k].last = seIdx[1] ;*/ + } + // mate pairs is not used. if we down-sampling + pAlignments = c.pAlignments ; + } + + void TruncateConstraintsCoverFrom( Constraints &c, int seCnt, int maxConstraintSize ) + { + int i ; + int size = constraints.size() ; + + if ( size > 0 ) + { + for ( i = 0 ; i < size ; ++i ) + constraints[i].vector.Release() ; + constraints.clear() ; + std::vector<struct _constraint>().swap( constraints ) ; + } + matePairs.clear() ; + std::vector<struct _matePairConstraint>().swap( matePairs ) ; + + //constraints.resize( c.constraints.size() ) ; + //constraints = c.constraints ; + size = c.constraints.size() ; + for ( i = 0 ; i < size ; ++i ) + { + constraints.push_back( c.constraints[i] ) ; + constraints[i].vector.Nullify() ; // so that it won't affect the BitTable in "c" + constraints[i].vector.Init( seCnt ) ; + std::vector<int> seIdx ; + c.constraints[i].vector.GetOnesIndices( seIdx ) ; + int j, l = seIdx.size() ; + for ( j = 0 ; j < maxConstraintSize && j < l ; ++j ) + { + constraints[i].vector.Set( seIdx[j] ) ; + } + constraints[i].last = seIdx[j - 1] ; + } + // mate pairs is not used. if we down-sampling + pAlignments = c.pAlignments ; + } + + void SetHasMateReadIdSuffix( bool in ) + { + mateReadIds.SetHasMateReadIdSuffix( in ) ; + } + + int BuildConstraints( struct _subexon *subexons, int seCnt, int start, int end ) ; + +} ; + +#endif
#!/bin/perl
# FilterSplice.pl (PsiCLASS-1.0.2)
#
# Keeps only the splice junctions of a raw splice file that are also listed in
# a trusted splice file, and rewrites their strand to the trusted strand.
#
# Both inputs are whitespace-separated; columns 1-3 identify a junction,
# column 4 is its support and column 5 its strand.
# Output goes to stdout:
#   - junction not in the trusted file: dropped ;
#   - trusted junction with non-positive support: re-emitted with support 1
#     and the trusted strand ;
#   - otherwise the line is printed with its strand replaced by the trusted one.

use strict ;
use warnings ;

# Both files are required; the original check only rejected an empty argument list.
die "usage: a.pl a.raw_splice trusted.splice > filtered.splice\n" if ( @ARGV < 2 ) ;

my ( $rawFile, $trustedFile ) = @ARGV ;
my %trustedSplices ;

# Pass 1: remember the strand of every trusted junction.
open( my $fpTrusted, "<", $trustedFile ) or die "Could not open $trustedFile: $!\n" ;
while ( <$fpTrusted> )
{
	chomp ;
	my @cols = split /\s+/, $_ ;
	# A line without a strand column cannot mark a junction as trusted.
	next if ( @cols < 5 ) ;
	my $key = $cols[0]." ".$cols[1]." ".$cols[2] ;
	$trustedSplices{ $key } = $cols[4] ;
}
close $fpTrusted ;

# Pass 2: filter the raw junctions against the trusted set.
open( my $fpRaw, "<", $rawFile ) or die "Could not open $rawFile: $!\n" ;
while ( <$fpRaw> )
{
	chomp ;
	my @cols = split /\s+/, $_ ;
	next if ( @cols < 3 ) ;
	my $key = $cols[0]." ".$cols[1]." ".$cols[2] ;
	next if ( !defined $trustedSplices{ $key } ) ;

	my $trustedStrand = $trustedSplices{ $key } ;
	# A missing support column counts as 0, which is what the numeric
	# comparison on undef amounted to before (minus the warning).
	my $support = defined $cols[3] ? $cols[3] : 0 ;
	if ( $support <= 0 )
	{
		print $cols[0], " ", $cols[1], " ", $cols[2], " 1 ", $trustedStrand, " 1 0 0 0\n" ;
	}
	elsif ( defined $cols[4] && $cols[4] eq $trustedStrand )
	{
		print $_, "\n" ;
	}
	else
	{
		$cols[4] = $trustedStrand ;
		print join( " ", @cols ), "\n" ;
	}
}
close $fpRaw ;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/FindJunction.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,1108 @@ +/* + Find junctions from the SAM file generated by Tophat2. + The SAM file should be sorted. + The junction is the start and the end coordinates of the spliced region in this program. The output of the junction is a bit different. + + Usage: ./a.out input [option] >output. + ./a.out -h for help. +*/ +#include <stdio.h> +#include <string.h> +#include <stdlib.h> + +#include "sam.h" + +#define LINE_SIZE 4097 +#define QUEUE_SIZE 10001 +#define HASH_MAX 1000003 + +struct _readTree +{ + char id[256] ; + int leftAnchor, rightAnchor ; + int pos ; + //bool secondary ; + bool valid ; + int editDistance ; + int NH, cnt ; // if cnt < NH, then it has real secondary match for this splice junction + struct _readTree *left, *right ; + + int flag ;// The flag from sam head. +} ; + +// The structure of a junction +struct _junction +{ + int start, end ; + int readCnt ; // The # of reads containing this junction + int secReadCnt ; // The # of reads whose secondary alignment has this junction. + char strand ; // On '+' or '-' strand + int leftAnchor, rightAnchor ; // The longest left and right anchor + int oppositeAnchor ; // The longest anchor of the shorter side. + int uniqEditDistance, secEditDistance ; + struct _readTree head ; +} ; + +char line[LINE_SIZE] ; +char col[11][LINE_SIZE] ; // The option fields is not needed. +char strand ; // Extract XS field +char noncanonStrandInfo ; +//bool secondary ; +int NH ; +int editDistance ; +int mateStart ; +int filterYS ; +int samFlag ; + +struct _junction junctionQueue[QUEUE_SIZE] ; // Expected only a few junctions in it for each read. This queue is sorted. 
+ +int qHead, qTail ; + +bool flagPrintJunction ; +bool flagPrintAll ; +bool flagStrict ; +int junctionCnt ; +bool anchorBoth ; +bool validRead ; + +int flank ; +struct _cigarSeg +{ + int len ; + char type ; +} ; + +struct _readTree *contradictedReads ; + +char nucToNum[26] = { 0, 4, 1, 4, 4, 4, 2, + 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 3, + 4, 4, 4, 4, 4, 4 } ; + +void PrintHelp() +{ + printf( + "Prints reads from the SAM/BAM file that containing junctions.\n" + "Usage: ./a.out input [option]>output\n" + "Options:\n" + "\t-j xx [-B]: Output the junctions using 1-based coordinates. The format is \"reference id\" start end \"# of read\" strand.(They are sorted)\n and the xx is an integer means the maximum unqualified anchor length for a splice junction(default=8). If -B, the splice junction must be supported by a read whose both anchors are longer than xx.\n" + "\t-a: Output all the junctions, and use non-positive support number to indicate unqualified junctions.\n" + "\t-y: If the bits from YS field of bam matches the argument, we filter the alignment (default: 4).\n" + ) ; +} + +void GetJunctionInfo( struct _junction &junc, struct _readTree *p ) +{ + if ( p == NULL ) + return ; + + if ( p->valid ) + { + //if ( junc.start == 22381343 + 1 && junc.end == 22904987 - 1 ) + // printf( "%s %d %d %d\n", p->id, p->leftAnchor, p->rightAnchor, p->flag ) ; + + + + if ( p->cnt < p->NH ) + { + junc.secEditDistance += p->editDistance ; + ++junc.secReadCnt ; + } + else + { + junc.uniqEditDistance += p->editDistance ; + ++junc.readCnt ; + } + int l = p->leftAnchor, r = p->rightAnchor ; + + if ( !anchorBoth ) + { + if ( l > junc.leftAnchor ) + junc.leftAnchor = l ; + if ( r > junc.rightAnchor ) + junc.rightAnchor = r ; + + if ( l <= r && l > junc.oppositeAnchor ) + junc.oppositeAnchor = l ; + else if ( r < l && r > junc.oppositeAnchor ) + junc.oppositeAnchor = r ; + } + else + { + if ( l > flank && r > flank ) + { + junc.leftAnchor = l ; + junc.rightAnchor = r ; + } + + if ( l <= 
r && l > junc.oppositeAnchor ) + junc.oppositeAnchor = l ; + else if ( r < l && r > junc.oppositeAnchor ) + junc.oppositeAnchor = r ; + } + } + GetJunctionInfo( junc, p->left ) ; + GetJunctionInfo( junc, p->right ) ; +} + +void PrintJunctionReads( struct _junction &junc, struct _readTree *p ) +{ + if ( p == NULL ) + return ; + + if ( p->valid ) + printf( "%s\n", p->id ) ; + PrintJunctionReads( junc, p->left ) ; + PrintJunctionReads( junc, p->right ) ; +} + +void PrintJunction( char *chrome, struct _junction &junc ) +{ + int sum ; + junc.leftAnchor = 0 ; + junc.rightAnchor = 0 ; + junc.oppositeAnchor = 0 ; + junc.readCnt = 0 ; + junc.secReadCnt = 0 ; + junc.uniqEditDistance = 0 ; + junc.secEditDistance = 0 ; + GetJunctionInfo( junc, &junc.head ) ; + + sum = junc.readCnt + junc.secReadCnt ; + + if ( junc.leftAnchor <= flank || junc.rightAnchor <= flank || ( junc.readCnt + junc.secReadCnt <= 0 ) ) + { + if ( flagPrintAll ) + { + sum = -sum ; + } + else + return ; + } + + /*if ( junc.end - junc.start + 1 >= 200000 ) + { + if ( junc.secReadCnt > 0 ) + junc.secReadCnt = 1 ; + }*/ + + /*if ( junc.oppositeAnchor <= ( ( flank / 2 < 1 ) ? 
flank / 2 : 1 ) && ( junc.readCnt + junc.secReadCnt ) <= 10 ) + { + if ( junc.readCnt > 0 ) + junc.readCnt = 1 ; + if ( junc.secReadCnt > 0 ) + junc.secReadCnt = 0 ; + }*/ + + printf( "%s %d %d %d %c %d %d %d %d\n", chrome, junc.start - 1, junc.end + 1, sum, junc.strand, + junc.readCnt, junc.secReadCnt, junc.uniqEditDistance, junc.secEditDistance ) ; + //PrintJunctionReads( junc, &junc.head ) ; +} + +void ClearReadTree( struct _readTree *p ) +{ + if ( p == NULL ) + return ; + ClearReadTree( p->left ) ; + ClearReadTree( p->right ) ; + free( p ) ; +} + +// Insert to the read tree +bool InsertReadTree( struct _readTree *p, char *id, int l, int r ) +{ + int tmp = strcmp( p->id, id ) ; + if ( tmp == 0 ) + { + p->cnt += 1 ; + return true ; + } + else if ( tmp < 0 ) + { + if ( p->left ) + return InsertReadTree( p->left, id, l, r ) ; + else + { + p->left = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; + strcpy( p->left->id, id ) ; + p->left->leftAnchor = l ; + p->left->rightAnchor = r ; + //p->left->secondary = secondary ; + p->left->editDistance = editDistance ; + p->left->left = p->left->right = NULL ; + p->left->valid = validRead ; + p->left->cnt = 1 ; + p->left->NH = NH ; + p->left->flag = samFlag ; + return false ; + } + } + else + { + if ( p->right ) + return InsertReadTree( p->right, id, l, r ) ; + else + { + p->right = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; + strcpy( p->right->id, id ) ; + p->right->leftAnchor = l ; + p->right->rightAnchor = r ; + //p->right->secondary = secondary ; + p->right->editDistance = editDistance ; + p->right->left = p->right->right = NULL ; + p->right->valid = validRead ; + p->right->cnt = 1 ; + p->right->NH = NH ; + p->right->flag = samFlag ; + return false ; + } + } +} + + +bool SearchContradictedReads( struct _readTree *p, char *id, int pos ) +{ + if ( p == NULL ) + return false ; + int tmp = strcmp( p->id, id ) ; + if ( tmp == 0 && p->pos == pos ) + return true ; + else if ( tmp <= 0 ) + return 
SearchContradictedReads( p->left, id, pos ) ; + else + return SearchContradictedReads( p->right, id, pos ) ; +} + +bool InsertContradictedReads( struct _readTree *p, char *id, int pos ) +{ + if ( p == NULL ) + { + contradictedReads = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; + strcpy( contradictedReads->id, id ) ; + contradictedReads->pos = pos ; + contradictedReads->left = contradictedReads->right = NULL ; + return false ; + } + int tmp = strcmp( p->id, id ) ; + if ( tmp == 0 && p->pos == pos ) + return true ; + else if ( tmp <= 0 ) + { + if ( p->left ) + return InsertContradictedReads( p->left, id, pos ) ; + else + { + p->left = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; + strcpy( p->left->id, id ) ; + p->left->pos = pos ; + p->left->left = p->left->right = NULL ; + return false ; + } + } + else + { + if ( p->right ) + return InsertContradictedReads( p->right, id, pos ) ; + else + { + p->right = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ; + strcpy( p->right->id, id ) ; + p->right->pos = pos ; + p->right->left = p->right->right = NULL ; + return false ; + } + } +} + +struct _readTree *GetReadTreeNode( struct _readTree *p, char *id ) +{ + if ( p == NULL ) + return NULL ; + int tmp = strcmp( p->id, id ) ; + if ( tmp == 0 ) + return p ; + else if ( tmp < 0 ) + return GetReadTreeNode( p->left, id ) ; + else + return GetReadTreeNode( p->right, id ) ; +} + +// Insert the new junction into the queue, +// and make sure the queue is sorted. +// Assume each junction range is only on one strand. 
+// l, r is the left and right anchor from a read +void InsertQueue( int start, int end, int l, int r ) +{ + int i, j ; + i = qTail ; + + + while ( i != qHead ) + { + j = i - 1 ; + if ( j < 0 ) + j = QUEUE_SIZE - 1 ; + + if ( junctionQueue[j].start < start + || ( junctionQueue[j].start == start && junctionQueue[j].end < end ) ) + break ; + + junctionQueue[i] = junctionQueue[j] ; + --i ; + + if ( i < 0 ) + i = QUEUE_SIZE - 1 ; + } + + junctionQueue[i].start = start ; + junctionQueue[i].end = end ; + /*if ( !secondary ) + { + junctionQueue[i].readCnt = 1 ; + junctionQueue[i].secReadCnt = 0 ; + } + else + { + junctionQueue[i].readCnt = 0 ; + junctionQueue[i].secReadCnt = 1 ; + }*/ + junctionQueue[i].strand = strand ; + junctionQueue[i].leftAnchor = l ; + junctionQueue[i].rightAnchor = r ; + + strcpy( junctionQueue[i].head.id, col[0] ) ; + junctionQueue[i].head.valid = validRead ; + junctionQueue[i].head.leftAnchor = l ; + junctionQueue[i].head.rightAnchor = r ; + junctionQueue[i].head.left = junctionQueue[i].head.right = NULL ; + //junctionQueue[i].head.secondary = secondary ; + junctionQueue[i].head.NH = NH ; + junctionQueue[i].head.cnt = 1 ; + junctionQueue[i].head.editDistance = editDistance ; + + ++qTail ; + if ( qTail >= QUEUE_SIZE ) + qTail = 0 ; +} + +// Search the queue, and remove the heads if its end(start) is smaller than prune.(Because it is impossible to have that junction again) +// Add the junction to the queue, if it is not in it. +// Return true, if it finds the junction. Otherwise, return false. +bool SearchQueue( int start, int end, int prune, int l, int r ) +{ + int i ; + + // Test whether this read might be a false alignment. 
+ i = qHead ; + while ( i != qTail ) + { + struct _readTree *rt ; + if ( ( junctionQueue[i].start == start && junctionQueue[i].end < end ) || + ( junctionQueue[i].start > start && junctionQueue[i].end == end ) ) + { + // This alignment is false ; + rt = GetReadTreeNode( &junctionQueue[i].head, col[0] ) ; + // the commented out logic because it is handled by contradicted reads + if ( rt != NULL ) //&& ( rt->flag & 0x40 ) != ( samFlag & 0x40 ) ) + { + if ( rt->leftAnchor <= flank || rt->rightAnchor <= flank )//|| rt->secondary ) + { + if ( l > flank && r > flank )//&& !secondary ) + rt->valid = false ; + else + validRead = false ; + } + else + { + // Ignore this read + //return true ; + validRead = false ; + } + } + } + else if ( ( junctionQueue[i].start == start && junctionQueue[i].end > end ) || + ( junctionQueue[i].start < start && junctionQueue[i].end == end ) ) + { + // This other alignment is false ; + rt = GetReadTreeNode( &junctionQueue[i].head, col[0] ) ; + //if ( rt != NULL ) + if ( rt != NULL ) //&& ( rt->flag & 0x40 ) != ( samFlag & 0x40 ) ) + { + if ( l <= flank || r <= flank )//|| secondary ) + { + if ( rt->leftAnchor > flank && rt->rightAnchor > flank )//&& !rt->secondary ) + validRead = false ; + else + rt->valid = false ; + } + else + rt->valid = false ; + } + } + ++i ; + if ( i >= QUEUE_SIZE ) + i = 0 ; + } + + //if ( start == 8077108 && end == 8078439 ) + // exit( 1 ) ; + i = qHead ; + + while ( i != qTail ) + { + if ( junctionQueue[i].start == start && + junctionQueue[i].end == end ) + { + if (junctionQueue[i].strand == '?' 
&& junctionQueue[i].strand != strand ) + { + junctionQueue[i].strand = strand ; + } + InsertReadTree( &junctionQueue[i].head, col[0], l, r ) ; + return true ; + } + + if ( junctionQueue[i].end < prune && i == qHead ) + { + // pop + PrintJunction( col[2], junctionQueue[i] ) ; + ClearReadTree( junctionQueue[i].head.left ) ; + ClearReadTree( junctionQueue[i].head.right ) ; + ++qHead ; + if ( qHead >= QUEUE_SIZE ) + qHead = 0 ; + } + ++i ; + if ( i >= QUEUE_SIZE ) + i = 0 ; + } + + InsertQueue( start, end, l, r ) ; + return false ; +} + +// Compute the junctions based on the CIGAR +bool CompareJunctions( int startLocation, char *cigar ) +{ + int currentLocation = startLocation ; // Current location on the reference genome + int i, j ; + int num ; + int newJuncCnt = 0 ; // The # of junctions in the read, and the # of new junctions among them. + + struct _cigarSeg cigarSeg[1000] ; // A segment of the cigar. + int ccnt = 0 ; // cigarSeg cnt + + j = 0 ; + num = 0 ; + validRead = true ; + + for ( i = 0 ; cigar[i] ; ++i ) + { + if ( cigar[i] >= '0' && cigar[i] <= '9' ) + { + num = num * 10 + cigar[i] - '0' ; + } + else + { + cigarSeg[ccnt].len = num ; + cigarSeg[ccnt].type = cigar[i] ; + ++ccnt ; + num = 0 ; + } + } + + // Filter low complex alignment. + // Only applies this to alignments does not have a strand information. + if ( strand == '?' 
) + { + if ( noncanonStrandInfo != -1 ) + { + if ( ( noncanonStrandInfo & filterYS ) != 0 ) + validRead = false ; + } + else + { + int softStart = -1 ; + int softEnd = 0 ; + if ( cigarSeg[0].type == 'S' ) + softStart = cigarSeg[0].len ; + if ( cigarSeg[ ccnt - 1 ].type == 'S' ) + softEnd = cigarSeg[ ccnt - 1 ].len ; + int readLen = strlen( col[9] ) ; + int count[5] = { 0, 0, 0, 0, 0 } ; + + int pos = 0 ; + for ( i = 0 ; i < ccnt ; ++i ) + { + switch ( cigarSeg[i].type ) + { + case 'S': + pos += cigarSeg[i].len ; + case 'M': + case 'I': + { + for ( j = 0 ; j < cigarSeg[i].len ; ++j ) + ++count[ nucToNum[ col[9][pos + j] - 'A' ] ] ; + pos += j ; + } break ; + case 'N': + { + int max = 0 ; + int sum = 0 ; + for ( j = 0 ; j < 5 ; ++j ) + { + if ( count[j] > max ) + max = count[j] ; + sum += count[j] ; + } + if ( max > 0.8 * sum ) + validRead = false ; + count[0] = count[1] = count[2] = count[3] = count[4] = 0 ; + } break ; + case 'H': + case 'P': + case 'D': + default: break ; + } + } + + int max = 0 ; + int sum = 0 ; + for ( j = 0 ; j < 5 ; ++j ) + { + if ( count[j] > max ) + max = count[j] ; + sum += count[j] ; + } + if ( max > 0.8 * sum ) + validRead = false ; + /* count[0] = count[1] = count[2] = count[3] = count[4] = 0 ; + + + for ( i = softStart + 1 ; i < readLen - softEnd ; ++i ) + { + switch ( col[9][i] ) + { + case 'A': ++count[0] ; break ; + case 'C': ++count[1] ; break ; + case 'G': ++count[2] ; break ; + case 'T': ++count[3] ; break ; + default: ++count[4] ; + } + } + int max = 0 ; + for ( j = 0 ; j < 5 ; ++j ) + if ( count[j] > max ) + max = count[j] ; + if ( max > 0.6 * ( readLen - softEnd - softStart - 1 ) ) + validRead = false ;*/ + } + } + + // Test whether contradict with mate pair + if ( col[6][0] == '=' ) + { + currentLocation = startLocation ; + for ( i = 0 ; i < ccnt ; ++i ) + { + if ( cigarSeg[i].type == 'I' || cigarSeg[i].type == 'S' || cigarSeg[i].type == 'H' + || cigarSeg[i].type == 'P' ) + continue ; + else if ( cigarSeg[i].type == 'N' ) + { 
+ if ( mateStart >= currentLocation && mateStart <= currentLocation + cigarSeg[i].len - 1 ) + { + /*if ( cigarSeg[i].len == 91486 ) + { + printf( "%s %d %d\n", currentLocation, mateStart ) ; + exit( 1 ) ; + }*/ + // ignore this read + //return false ; + InsertContradictedReads( contradictedReads, col[0], mateStart ) ; + validRead = false ; + break ; + } + } + currentLocation += cigarSeg[i].len ; + } + + i = qHead ; + // Search if it falls in the splices junction created by its mate. + while ( i != qTail ) + { + if ( mateStart < junctionQueue[i].start && junctionQueue[i].start <= startLocation + && startLocation <= junctionQueue[i].end + && SearchContradictedReads( contradictedReads, col[0], startLocation ) ) + { + validRead = false ; + break ; + } + ++i ; + if ( i >= QUEUE_SIZE ) + i = 0 ; + } + } + + currentLocation = startLocation ; + for ( i = 0 ; i < ccnt ; ++i ) + { + if ( cigarSeg[i].type == 'I' || cigarSeg[i].type == 'S' || cigarSeg[i].type == 'H' + || cigarSeg[i].type == 'P' ) + continue ; + else if ( cigarSeg[i].type == 'N' ) + { + int left, right ; + int tmp ; + tmp = i ; + while ( i > 0 && ( cigarSeg[i - 1].type == 'I' || cigarSeg[i - 1].type == 'D' ) ) + --i ; + if ( i > 0 && cigarSeg[i - 1].type == 'M' ) + { + left = cigarSeg[i - 1].len ; + if ( i >= 2 && cigarSeg[i - 2].type == 'N' && left <= flank ) + { + left = flank + 1 ; + } + } + else + left = 0 ; + + i = tmp ; + while ( i < ccnt && ( cigarSeg[i + 1].type == 'I' || cigarSeg[i + 1].type == 'D' ) ) + ++i ; + if ( i < ccnt && cigarSeg[i + 1].type == 'M' ) + { + right = cigarSeg[i + 1].len ; + if ( i + 2 < ccnt && cigarSeg[i + 2].type == 'N' && right <= flank ) + { + right = flank + 1 ; + } + } + else + right = 0 ; + i = tmp ; + if ( !SearchQueue( currentLocation, currentLocation + cigarSeg[i].len - 1, startLocation, left, right ) ) + { + ++newJuncCnt ; + } + } + currentLocation += cigarSeg[i].len ; + } + /*else if ( cigar[i] == 'I' ) + { + num = 0 ; + } + else if ( cigar[i] == 'N' ) + { + if ( 
!SearchQueue( currentLocation, currentLocation + num - 1, startLocation ) ) + { + ++newJuncCnt ; + //if ( flagPrintJunction ) + // printf( "%s %d %d\n", col[2], currentLocation - 2, currentLocation + len - 1 ) ; + } + currentLocation += num ; + num = 0 ; + } + else // Other operations, like M, D,..., + { + currentLocation += num ; + num = 0 ; + }*/ + + junctionCnt += newJuncCnt ; + + if ( newJuncCnt ) + return true ; + else + return false ; +} + +void cigar2string( bam1_core_t *c, uint32_t *in_cigar, char *out_cigar ) +{ + int k, op, l ; + char opcode ; + + *out_cigar = '\0' ; + for ( k = 0 ; k < c->n_cigar ; ++k ) + { + op = in_cigar[k] & BAM_CIGAR_MASK ; + l = in_cigar[k] >> BAM_CIGAR_SHIFT ; + switch (op) + { + case BAM_CMATCH: opcode = 'M' ; break ; + case BAM_CINS: opcode = 'I' ; break ; + case BAM_CDEL: opcode = 'D' ; break ; + case BAM_CREF_SKIP: opcode = 'N' ; break ; + case BAM_CSOFT_CLIP: opcode = 'S' ; break ; + case BAM_CHARD_CLIP: opcode = 'H' ; break ; + case BAM_CPAD: opcode = 'P' ; break ; + } + sprintf( out_cigar + strlen( out_cigar ), "%d%c", l, opcode ) ; + } +} + + + +int main( int argc, char *argv[] ) +{ + FILE *fp ; + samfile_t *fpsam ; + bam1_t *b = NULL ; + bool useSam = true ; + + int i, len ; + int startLocation ; + bool flagRemove = false ; + char prevChrome[103] ; + + anchorBoth = false ; + + flagPrintJunction = false ; + flagPrintAll = false ; + flagStrict = false ; + junctionCnt = 0 ; + bool hasMateReadIdSuffix = false ; + + strcpy( prevChrome, "" ) ; + flagRemove = true ; + flagPrintJunction = true ; + flank = 8 ; + filterYS = 4 ; + + contradictedReads = NULL ; + + // processing the argument list + for ( i = 1 ; i < argc ; ++i ) + { + if ( !strcmp( argv[i], "-h" ) ) + { + PrintHelp() ; + return 0 ; + } + else if ( !strcmp( argv[i], "-r" ) ) + { + flagRemove = true ; + } + else if ( !strcmp( argv[i], "-j" ) ) + { + strcpy( prevChrome, "" ) ; + flagRemove = true ; + flagPrintJunction = true ; + flank = atoi( argv[i + 1] ) ; + if ( i + 2 
< argc && !strcmp( argv[i+2], "-B" ) ) + { + anchorBoth = true ; + ++i ; + } + ++i ; + } + else if ( !strcmp( argv[i], "-a" ) ) + { + flagPrintAll = true ; + } + else if ( !strcmp( argv[i], "--strict" ) ) + { + flagStrict = true ; + } + else if ( !strcmp( argv[i], "-y" ) ) + { + filterYS = atoi( argv[i + 1] ) ; + ++i ; + } + else if ( !strcmp( argv[i], "--hasMateIdSuffix" ) ) + { + hasMateReadIdSuffix = true ; + ++i ; + } + else if ( i > 1 ) + { + printf( "Unknown option %s\n", argv[i] ) ; + exit( 1 ) ; + } + } + if ( argc == 1 ) + { + PrintHelp() ; + return 0 ; + } + + len = strlen( argv[1] ) ; + if ( argv[1][len-3] == 'b' || argv[1][len-3] == 'B' ) + { + if ( !( fpsam = samopen( argv[1], "rb", 0 ) ) ) + return 0 ; + + if ( !fpsam->header ) + { + //samclose( fpsam ) ; + //fpsam = samopen( argv[1], "r", 0 ) ; + //if ( !fpsam->header ) + //{ + useSam = false ; + fp = fopen( argv[1], "r" ) ; + //} + } + } + else + { + useSam = false ; + fp = NULL ; + if ( !strcmp( argv[1], "-" ) ) + fp = stdin ; + else + fp = fopen( argv[1], "r" ) ; + if ( fp == NULL ) + { + printf( "Could not open file %s\n", argv[1] ) ; + return 0 ; + } + } + + while ( 1 ) + { + int flag = 0 ; + if ( useSam ) + { + if ( b ) + bam_destroy1( b ) ; + b = bam_init1() ; + if ( samread( fpsam, b ) <= 0 ) + break ; + if ( b->core.tid >= 0 ) + strcpy( col[2], fpsam->header->target_name[b->core.tid] ) ; + else + strcpy( col[2], "-1" ) ; + cigar2string( &(b->core), bam1_cigar( b ), col[5] ) ; + strcpy( col[0], bam1_qname( b ) ) ; + flag = b->core.flag ; + if ( bam_aux_get( b, "NH" ) ) + { + /*if ( bam_aux2i( bam_aux_get(b, "NH" ) ) >= 2 ) + { + secondary = true ; + } + else + secondary = false ;*/ + + NH = bam_aux2i( bam_aux_get( b, "NH" ) ) ; + } + else + { + //secondary = false ; + NH = 1 ; + } + + if ( bam_aux_get( b, "NM" ) ) + { + editDistance = bam_aux2i( bam_aux_get( b, "NM" ) ) ; + } + else if ( bam_aux_get( b, "nM" ) ) + { + editDistance = bam_aux2i( bam_aux_get( b, "nM" ) ) ; + } + else + 
editDistance = 0 ; + + mateStart = b->core.mpos + 1 ; + if ( b->core.mtid == b->core.tid ) + col[6][0] = '=' ; + else + col[6][0] = '*' ; + + if ( b->core.l_qseq < 20 ) + continue ; + + for ( i = 0 ; i < b->core.l_qseq ; ++i ) + { + int bit = bam1_seqi( bam1_seq( b ), i ) ; + switch ( bit ) + { + case 1: col[9][i] = 'A' ; break ; + case 2: col[9][i] = 'C' ; break ; + case 4: col[9][i] = 'G' ; break ; + case 8: col[9][i] = 'T' ; break ; + case 15: col[9][i] = 'N' ; break ; + default: col[9][i] = 'A' ; break ; + } + } + col[9][i] = '\0' ; + + /*if ( flag & 0x100 ) + secondary = true ; + else + secondary = false ;*/ + } + else + { + char *p ; + if ( !fgets( line, LINE_SIZE, fp ) ) + break ; + if ( line[0] == '\0' || line[0] == '@' ) + continue ; + sscanf( line, "%s%s%s%s%s%s%s%s%s%s%s", col, col + 1, col + 2, col + 3, col + 4, + col + 5, col + 6, col + 7, col + 8, col + 9, col + 10 ) ; + + flag = atoi( col[1] ) ; + if ( p = strstr( line, "NH" ) ) + { + int k = 0 ; + p += 5 ; + while ( *p != ' ' && *p != '\t' && *p != '\0') + { + k = k * 10 + *p - '0' ; + ++p ; + } + /*if ( k >= 2 ) + { + secondary = true ; + } + else + secondary = false ;*/ + NH = k ; + } + else + { + //secondary = false ; + NH = 1 ; + } + if ( ( p = strstr( line, "NM" ) ) || ( p = strstr( line, "nM" ) ) ) + { + int k = 0 ; + p += 5 ; + while ( *p != ' ' && *p != '\t' && *p != '\0') + { + k = k * 10 + *p - '0' ; + ++p ; + } + editDistance = k ; + } + else + editDistance = 0 ; + + mateStart = atoi( col[7] ) ; + /*if ( flag & 0x100 ) + secondary = true ; + else + secondary = false ;*/ + + } + samFlag = flag ; + for ( i = 0 ; col[5][i] ; ++i ) + if ( col[5][i] == 'N' ) + break ; + + if ( !col[5][i] ) + continue ; + + // remove .1, .2 or /1, /2 suffix + if ( hasMateReadIdSuffix ) + { + char *s = col[0] ; + int len = strlen( s ) ; + if ( len >= 2 && ( s[len - 1] == '1' || s[len - 1] == '2' ) + && ( s[len - 2] == '.' 
|| s[len - 2] == '/' ) ) + { + s[len - 2] = '\0' ; + } + } + + if ( useSam ) + { + if ( bam_aux_get( b, "XS" ) ) + { + strand = bam_aux2A( bam_aux_get( b, "XS" ) ) ; + if ( bam_aux_get( b, "YS" ) ) + { + noncanonStrandInfo = bam_aux2i( bam_aux_get( b, "YS" ) ) ; + } + else + { + noncanonStrandInfo = -1 ; + } + } + else + { + strand = '?' ; + noncanonStrandInfo = -1 ; + } + startLocation = b->core.pos + 1 ; + } + else + { + if ( strstr( line, "XS:A:-" ) ) // on negative strand + strand = '-' ; + else if ( strstr( line, "XS:A:+" ) ) + strand = '+' ; + else + { + strand = '?' ; + char *p = strstr( line, "YS:i:" ) ; + if ( p != NULL ) + { + p += 5 ; + noncanonStrandInfo = atoi( p ) ; + } + else + noncanonStrandInfo = -1 ; + } + startLocation = atoi( col[3] ) ; + } + + // Found the junctions from the read. + if ( strcmp( prevChrome, col[2] ) ) + { + if ( flagPrintJunction ) + { + // Print the remaining elements in the queue + i = qHead ; + while ( i != qTail ) + { + PrintJunction( prevChrome, junctionQueue[i] ) ; + ++i ; + if ( i >= QUEUE_SIZE ) + i = 0 ; + } + } + + // new chromosome + ClearReadTree( contradictedReads ) ; + contradictedReads = NULL ; + qHead = qTail = 0 ; + strcpy( prevChrome, col[2] ) ; + } + + if ( flagRemove ) + { + if ( CompareJunctions( startLocation, col[5] ) ) + { + // Test whether this read has new junctions + //++junctionCnt ; + if ( !flagPrintJunction ) + printf( "%s", line ) ; + } + } + else + { + ++junctionCnt ; + printf( "%s", line ) ; + } + //printf( "hi2 %s\n", col[0] ) ; + } + + if ( flagPrintJunction ) + { + // Print the remaining elements in the queue + i = qHead ; + while ( i != qTail ) + { + PrintJunction( prevChrome, junctionQueue[i] ) ; + ++i ; + if ( i >= QUEUE_SIZE ) + i = 0 ; + } + } + + //fprintf( stderr, "The number of junctions: %d\n", junctionCnt ) ; + return 0 ; +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/GetTrustedSplice.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,429 @@ +#include <stdio.h> +#include <math.h> +#include <vector> +#include <algorithm> + +#include "alignments.hpp" + +#define MAX(x, y) (((x)<(y))?(y):(x)) +#define MIN(x, y) (((x)<(y))?(x):(y)) + +char usage[] = "Usage: ./trust-splice splice_file_list one_bam_file [OPTIONS]\n" + "Options:\n" + "\t-a FLOAT: average number of supported reads from the samples (default: 0.5)\n" ; + +struct _intron +{ + int chrId ; + int start, end ; + double support ; + int uniqSupport ; + int secSupport ; + char strand ; + int editDist ; + + int sampleSupport ; +} ; + +struct _site +{ + int chrId ; + int pos ; + double support ; + + char strand ; + int associatedIntronCnt ; +} ; + +bool CompIntrons( struct _intron a, struct _intron b ) +{ + if ( a.chrId != b.chrId ) + return a.chrId < b.chrId ; + else if ( a.start != b.start ) + return a.start < b.start ; + else if ( a.end != b.end ) + return a.end < b.end ; + return false ; +} + +bool CompSites( struct _site a, struct _site b ) +{ + if ( a.chrId != b.chrId ) + return a.chrId < b.chrId ; + else if ( a.pos != b.pos ) + return a.pos < b.pos ; + return false ; +} + +void CoalesceIntrons( std::vector<struct _intron> &introns ) +{ + std::sort( introns.begin(), introns.end(), CompIntrons ) ; + int size = introns.size() ; + int i, k ; + k = 0 ; + for ( i = 1 ; i < size ; ++i ) + { + if ( introns[i].chrId == introns[k].chrId && introns[i].start == introns[k].start + && introns[i].end == introns[k].end ) + { + introns[k].support += introns[i].support ; + introns[k].uniqSupport += introns[i].uniqSupport ; + introns[k].secSupport += introns[i].secSupport ; + introns[k].sampleSupport += introns[i].sampleSupport ; + + if ( introns[k].strand == '?' 
) + introns[k].strand = introns[i].strand ; + } + else + { + ++k ; + introns[k] = introns[i] ; + } + } + introns.resize( k + 1 ) ; +} + +void CoalesceSites( std::vector<struct _site> &sites ) +{ + std::sort( sites.begin(), sites.end(), CompSites ) ; + int size = sites.size() ; + int i, k ; + k = 0 ; + for ( i = 1 ; i < size ; ++i ) + { + if ( sites[i].chrId == sites[k].chrId && sites[i].pos == sites[k].pos ) + { + sites[k].support += sites[i].support ; + sites[k].associatedIntronCnt += sites[i].associatedIntronCnt ; + } + else + { + ++k ; + sites[k] = sites[i] ; + } + } + sites.resize( k + 1 ) ; +} + +int main( int argc, char *argv[] ) +{ + int i, j, k ; + FILE *fpList ; + FILE *fp ; + char spliceFile[2048] ; + char chrName[1024], strand[3] ; + int start, end, uniqSupport, secSupport, uniqEditDistance, secEditDistance ; + double support ; + Alignments alignments ; + std::vector<struct _intron> introns ; + std::vector<struct _site> sites ; + double averageSupportThreshold = 0.5 ; + + if ( argc <= 1 ) + { + printf( "%s", usage ) ; + exit( 1 ) ; + } + + for ( i = 3 ; i < argc ; ++i ) + { + if ( !strcmp( argv[ i ], "-a" ) ) + { + averageSupportThreshold = atof( argv[i + 1] ) ; + ++i ; + } + else + { + printf( "Unknown option: %s", argv[i] ) ; + exit( 1 ) ; + } + } + + alignments.Open( argv[2] ) ; + + // Get the number of samples. + fpList = fopen( argv[1], "r" ) ; + int sampleCnt = 0 ; + while ( fscanf( fpList, "%s", spliceFile ) != EOF ) + { + ++sampleCnt ; + } + fclose( fpList ) ; + + // Get all the introns. 
+ fpList = fopen( argv[1], "r" ) ; + while ( fscanf( fpList, "%s", spliceFile ) != EOF ) + { + fp = fopen( spliceFile, "r" ) ; + while ( fscanf( fp, "%s %d %d %lf %s %d %d %d %d", chrName, &start, &end, &support, + strand, &uniqSupport, &secSupport, &uniqEditDistance, &secEditDistance ) != EOF ) + { + + if ( support <= 0 ) + support = 0.1 ; + else if ( support == 1 && sampleCnt > 5 ) + support = 0.75 ; + + struct _intron ni ; + ni.chrId = alignments.GetChromIdFromName( chrName ) ; + ni.start = start ; + ni.end = end ; + ni.support = support ; + ni.strand = strand[0] ; + ni.uniqSupport = uniqSupport ; + ni.secSupport = secSupport ; + ni.sampleSupport = 1 ; + ni.editDist = uniqEditDistance + secEditDistance ; + introns.push_back( ni ) ; + } + fclose( fp ) ; + + CoalesceIntrons( introns ) ; + } + fclose( fpList ) ; + + // Obtain the split sites. + int intronCnt = introns.size() ; + for ( i = 0 ; i < intronCnt ; ++i ) + { + struct _site ns ; + ns.chrId = introns[i].chrId ; + ns.associatedIntronCnt = 1 ; + ns.support = introns[i].support ; + ns.strand = introns[i].strand ; + + ns.pos = introns[i].start ; + sites.push_back( ns ) ; + ns.pos = introns[i].end ; + sites.push_back( ns ) ; + } + CoalesceSites( sites ) ; + + // Get the chromsomes with too many split sites. + int siteCnt = sites.size() ; + std::vector<bool> badChrom ; + + badChrom.resize( alignments.GetChromCount() ) ; + int size = alignments.GetChromCount() ; + for ( i = 0 ; i < size ; ++i ) + badChrom[i] = false ; + + for ( i = 0 ; i < siteCnt ; ) + { + for ( j = i + 1 ; sites[j].chrId == sites[i].chrId ; ++j ) + ; + //printf( "%s %d %d:\n", alignments.GetChromName( sites[i].chrId ), alignments.GetChromLength( sites[i].chrId ), j - i ) ; + if ( ( j - i ) * 20 > alignments.GetChromLength( sites[i].chrId ) ) + badChrom[ sites[i].chrId ] = true ; + i = j ; + } + + // Output the valid introns. 
+ k = 0 ; + double unit = sampleCnt / 50 ; + if ( unit < 1 ) + unit = 1 ; + + int longIntronSize ; + std::vector<int> intronSizes ; + for (i = 0 ; i < intronCnt ; ++i) + intronSizes.push_back( introns[i].end - introns[i].start + 1 ) ; + std::sort( intronSizes.begin(), intronSizes.end() ) ; + longIntronSize = intronSizes[ int(intronCnt * 0.99) ] ; + if ( longIntronSize > 100000 ) + longIntronSize = 100000 ; + for ( i = 0 ; i < intronCnt ; ++i ) + { + if ( introns[i].support / sampleCnt < averageSupportThreshold ) + continue ; + + if ( badChrom[ introns[i].chrId ] ) + { + if ( introns[i].sampleSupport <= sampleCnt / 2 ) + continue ; + } + + if ( introns[i].uniqSupport <= 2 && ( introns[i].support / sampleCnt < 1 || introns[i].sampleSupport < MIN( 2, sampleCnt ) ) ) + continue ; + + //Locate the two split sites. + while ( sites[k].chrId < introns[i].chrId || ( sites[k].chrId == introns[i].chrId && sites[k].pos < introns[i].start ) ) + { + ++k ; + } + int a, b ; + a = k ; + for ( b = a + 1 ; b < siteCnt ; ++b ) + { + if ( sites[b].chrId == introns[i].chrId && sites[b].pos == introns[i].end ) + break ; + } + double siteSupport = MAX( sites[a].support, sites[b].support ) ; + //if ( introns[i].start == 100233371 && introns[i].end == 100236850 ) + // printf( "test: %lf %lf\n", introns[i].support, siteSupport) ; + if ( introns[i].support < siteSupport / 10.0 ) + { + double needSample = MIN( ( -log( introns[i].support / siteSupport ) / log( 10.0 ) + 1 ) * unit, sampleCnt ) ; + if ( introns[i].sampleSupport < needSample ) + continue ; + } + + if ( sampleCnt >= 100 ) //&& introns[i].end - introns[i].start + 1 >= 50000 ) + { + if ( introns[i].sampleSupport <= sampleCnt * 0.01 ) + continue ; + } + /*if ( sampleCnt >= 50 ) + { + // just some randomly intron. 
+ if ( introns[i].sampleSupport == 1 + && ( sites[a].associatedIntronCnt == 1 || sites[b].associatedIntronCnt == 1 ) ) + continue ; + }*/ + + /*if ( introns[i].end - introns[i].start + 1 < 50 ) + { + int needSample = MIN( ( 5 - ( introns[i].end - introns[i].start + 1 ) / 10 ) * unit, sampleCnt ) ; + int flag = 0 ; + + if ( introns[i].sampleSupport < needSample ) + flag = 1 ; + if ( flag == 1 ) + continue ; + }*/ + if ( introns[i].strand == '?' && sampleCnt == 1 && + ( introns[i].support < 5 || introns[i].uniqSupport == 0 || introns[i].support < 2 * introns[i].editDist ) ) + continue ; + + // Since the strand is uncertain, the alinger may make different decision sample from sample. + // To keep this intron, one of its splice sites must be more supported than adjacent splice sites. + if ( introns[i].strand == '?' && introns[i].sampleSupport <= 0.5 * sampleCnt ) + { + int s, e ; + int l ; + int cnt = 0 ; + for ( l = 0 ; l < 2 ; ++l ) + { + int ind = ( l == 0 ) ? a : b ; + double max = sites[ind].support ; + int maxTag = ind ; + for ( s = ind - 1 ; s >= 0 && sites[s].chrId == sites[ind].chrId ; --s ) + { + if ( sites[s].pos + 7 < sites[s + 1].pos ) + break ; + if ( sites[s].support >= max ) + { + max = sites[s].support ; + maxTag = s ; + } + } + + for ( e = ind + 1 ; e < siteCnt && sites[e].chrId == sites[ind].chrId ; ++e ) + { + if ( sites[e].pos - 7 > sites[e - 1].pos ) + break ; + if ( sites[e].support >= max ) + { + max = sites[e].support ; + maxTag = e ; + } + } + + if ( maxTag == ind ) + ++cnt ; + } + if ( cnt == 0 ) + continue ; + } + + // Test whether this a intron coming from a wrong strand + /*if ( b - a + 1 >= 10 && introns[i].strand != '?' && introns[i].sampleSupport <= 0.5 * sampleCnt ) + { + int plusStrand = 0 ; + int minusStrand = 0 ; + int l ; + int s, e ; + + if ( introns[i].strand == '+' ) + plusStrand = 2 ; + else + minusStrand = 2 ; + + for ( l = 0 ; l < 2 ; ++l ) + { + int ind = ( l == 0 ) ? 
a : b ; + for ( s = ind - 1 ; s >= 0 && sites[s].chrId == sites[ind].chrId ; --s ) + { + if ( sites[s].pos + 10000 < sites[s + 1].pos ) + break ; + + if ( sites[s].strand == '+' ) + ++plusStrand ; + else if ( sites[s].strand == '-' ) + ++minusStrand ; + } + + for ( e = ind + 1 ; e < siteCnt && sites[e].chrId == sites[ind].chrId ; ++e ) + { + if ( sites[e].pos - 10000 > sites[e - 1].pos ) + break ; + + if ( sites[e].strand == '+' ) + ++plusStrand ; + else if ( sites[e].strand == '-' ) + ++minusStrand ; + } + } + + if ( introns[i].start == 161517978 ) + printf( "capture: %d %d %d %d\n", a, b, minusStrand, plusStrand) ; + + if ( introns[i].strand == '+' && minusStrand >= 20 && plusStrand == 2 ) + continue ; + else if ( introns[i].strand == '-' && plusStrand >= 20 && minusStrand == 2 ) + continue ; + + }*/ + + // Filter a intron if one of its splice site associated with too many introns. + /*if ( sites[a].associatedIntronCnt >= 10 ) + { + int needSample = MIN( sites[a].associatedIntronCnt / 100 * sampleCnt, sampleCnt ) ; + if ( introns[i].support < sites[a].support / 100 && + introns[i].sampleSupport < needSample ) + { + continue ; + } + } + if ( sites[b].associatedIntronCnt >= 10 ) + { + int needSample = MIN( sites[b].associatedIntronCnt / 100 * sampleCnt, sampleCnt ) ; + if ( introns[i].support < sites[b].support / 100 && + introns[i].sampleSupport < needSample ) + { + continue ; + } + }*/ + + + // Test for long intron + if ( introns[i].end - introns[i].start + 1 >= longIntronSize ) + { + int needSample = MIN( ( ( introns[i].end - introns[i].start + 1 ) / longIntronSize + 1 ) * unit, sampleCnt ) ; + int flag = 0 ; + if ( (double)introns[i].uniqSupport / ( introns[i].uniqSupport + introns[i].secSupport ) < 0.1 + || introns[i].uniqSupport / sampleCnt < 1 + || introns[i].sampleSupport < needSample ) + flag = 1 ; + if ( flag == 1 && introns[i].end - introns[i].start + 1 >= 3 * longIntronSize ) + continue ; + if ( flag == 1 && ( sites[a].associatedIntronCnt > 1 || 
sites[b].associatedIntronCnt > 1 || introns[i].sampleSupport <= 1 ) ) // an intron may connect two genes + continue ; + } + + + printf( "%s %d %d 10 %c 10 0 0 0\n", alignments.GetChromName( introns[i].chrId ), introns[i].start, introns[i].end, introns[i].strand ) ; + } + + return 0 ; +}
#!/usr/bin/env perl
# PsiCLASS-1.0.2/GetTrustedSplice.pl
#
# Reads the splice (intron) files named in a list file, accumulates the support of each
# intron over all files and prints the introns that pass the support filters below.

use strict ;
use warnings ;
use List::Util qw[min max];

die "usage: a.pl path_to_list_of_splice_file > trusted.splice\n" if ( @ARGV == 0 ) ;

# All intron hashes are keyed by "chrom start end strand".
my %spliceSupport ;        # summed (adjusted) support column
my %spliceSampleSupport ;  # number of records seen for the intron
my %spliceUniqSupport ;    # summed column 6
my %spliceSecSupport ;     # summed column 7
# The two site hashes are keyed by "chrom position".
my %uniqSpliceSites ;      # partner position of the site, or -1 once two different partners were seen
my %spliceSiteSupport ;    # summed support of the introns starting or ending at the site

# Read the list once. Blank lines do not name a sample, so they are neither counted nor opened.
my @spliceFiles ;
open( my $fpList, '<', $ARGV[0] ) or die "Could not open $ARGV[0]: $!\n" ;
while ( my $file = <$fpList> )
{
	chomp $file ;
	next if ( $file =~ /^\s*$/ ) ;
	push @spliceFiles, $file ;
}
close $fpList ;
my $sampleCnt = scalar( @spliceFiles ) ;

foreach my $file ( @spliceFiles )
{
	open( my $fpSplice, '<', $file ) or die "Could not open $file: $!\n" ;
	while ( my $line = <$fpSplice> )
	{
		chomp $line ;
		my @cols = split /\s+/, $line ;
		# Columns 1-7 (chrom, start, end, support, strand, uniq, sec) are used below.
		next if ( @cols < 7 ) ;

		my $key = $cols[0]." ".$cols[1]." ".$cols[2]." ".$cols[4] ;
		if ( $cols[3] <= 0 )
		{
			$cols[3] = 0.1 ;
		}
		elsif ( $cols[3] == 1 && $sampleCnt > 5 )
		{
			$cols[3] = 0.75 ;
		}

		if ( ! defined $spliceSupport{$key} )
		{
			$spliceSupport{ $key } = $cols[3] ;
			$spliceSampleSupport{ $key } = 1 ;
			$spliceUniqSupport{ $key } = $cols[5] ;
			$spliceSecSupport{ $key } = $cols[6] ;
		}
		else
		{
			$spliceSupport{ $key } += $cols[3] ;
			$spliceSampleSupport{ $key } += 1 ;
			$spliceUniqSupport{ $key } += $cols[5] ;
			$spliceSecSupport{ $key } += $cols[6] ;
		}

		# $i = 1 handles the start site, $i = 2 the end site; 3 - $i is the other one.
		for ( my $i = 1 ; $i <= 2 ; ++$i )
		{
			my $siteKey = $cols[0]." ".$cols[$i] ;
			my $partner = $cols[3 - $i] ;
			if ( defined $spliceSiteSupport{ $siteKey } )
			{
				$spliceSiteSupport{ $siteKey } += $cols[3] ;
			}
			else
			{
				$spliceSiteSupport{ $siteKey } = $cols[3] ;
			}

			if ( defined $uniqSpliceSites{ $siteKey } && $uniqSpliceSites{ $siteKey } != $partner )
			{
				$uniqSpliceSites{ $siteKey } = -1 ;
			}
			else
			{
				$uniqSpliceSites{ $siteKey } = $partner ;
			}
		}
	}
	close $fpSplice ;
}

foreach my $key (keys %spliceSupport)
{
	next if ( $spliceSupport{ $key } / $sampleCnt < 0.5 ) ;
	#next if ( $spliceUniqSupport{$key} / ( $spliceSecSupport{$key} + $spliceUniqSupport{$key} ) < 0.01 ) ;
	next if ( $spliceUniqSupport{$key} <= 2 && ( $spliceSupport{ $key } / $sampleCnt < 1 || $spliceSampleSupport{$key} < min( 2, $sampleCnt ) ) ) ;

	# @cols is now (chrom, start, end, strand).
	my @cols = split /\s+/, $key ;
	my $flag = 0 ;
	#if ( $cols[2] - $cols[1] + 1 >= 10000 )
	#{
	#	$flag = 1 if ( $spliceSupport{ $key } / $sampleCnt < 1 ) ;
	#}
	my $siteSupport = max( $spliceSiteSupport{ $cols[0]." ".$cols[1] }, $spliceSiteSupport{ $cols[0]." ".$cols[2] } ) ;

	if ( $spliceSupport{ $key } < $siteSupport / 10.0 )
	{
		#print $spliceSupport{ $key } / $siteSupport, " ", -log( $spliceSupport{ $key } / $siteSupport ) / log( 10.0 ), "\n" ;
		#if ( $cols[1] == 73518141 && $cols[2] == 73518206 )
		#{
		#	print "test: ", $spliceSupport{$key}, " $siteSupport ", -log( $spliceSupport{ $key } / $siteSupport ), "\n";
		#}
		my $needSample = min( -log( $spliceSupport{ $key } / $siteSupport ) / log( 10.0 ) + 1, $sampleCnt ) ;
		next if ( $spliceSampleSupport{ $key } < $needSample ) ;
	}

	if ( $cols[2] - $cols[1] + 1 >= 100000 )
	{
		my $needSample = int( ( $cols[2] - $cols[1] + 1 ) / 100000 ) + 1 ;
		$needSample = $sampleCnt if ( $needSample > $sampleCnt ) ;
		# Perl dies on a division by zero, so the ratio is only tested when there is
		# at least one unique or secondary alignment.
		my $alignCnt = $spliceSecSupport{$key} + $spliceUniqSupport{$key} ;
		$flag = 1 if ( ( $alignCnt > 0 && $spliceUniqSupport{$key} / $alignCnt < 0.1 )
				|| ( $spliceUniqSupport{ $key } / $sampleCnt < 1 )
				|| $spliceSampleSupport{ $key } < $needSample ) ;
		next if ( $flag == 1 && $cols[2] - $cols[1] + 1 >= 300000 ) ;
	}
	if ( $flag == 1 && ( ( $uniqSpliceSites{ $cols[0]." ".$cols[1] } == -1 || $uniqSpliceSites{ $cols[0]." ".$cols[2] } == -1 )
				|| $spliceSampleSupport{ $key } <= 1 ) )
	{
		next ;
	}
	print $cols[0], " ", $cols[1], " ", $cols[2], " 10 ", $cols[3], " 10 0 0 0\n" ;
}
# PsiCLASS-1.0.2/Makefile
CXX = g++
CXXFLAGS= -Wall -O3 #-g #-std=c++11 #-Wall #-g
#CXXFLAGS= -Wall -g #-std=c++11 #-Wall #-g
LINKPATH= -I./samtools-0.1.19 -L./samtools-0.1.19
LINKFLAGS = -lbam -lz -lm -lpthread
DEBUG=
OBJECTS = stats.o subexon-graph.o
# Every program is linked with -lbam, so every link rule depends on this library.
LIBBAM = ./samtools-0.1.19/libbam.a

.PHONY: all clean

all: subexon-info combine-subexons classes vote-transcripts junc grader trust-splice add-genename addXS

# Built only when the library does not exist yet.
$(LIBBAM):
	$(MAKE) -C samtools-0.1.19

subexon-info: subexon-info.o $(OBJECTS) $(LIBBAM)
	$(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) $(OBJECTS) subexon-info.o $(LINKFLAGS)

combine-subexons: combine-subexons.o $(OBJECTS) $(LIBBAM)
	$(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) $(OBJECTS) combine-subexons.o $(LINKFLAGS)

classes: classes.o constraints.o transcript-decider.o $(OBJECTS) $(LIBBAM)
	$(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) $(OBJECTS) constraints.o transcript-decider.o classes.o $(LINKFLAGS)

# $(OBJECTS) is on the link line, so it has to be a prerequisite as well.
trust-splice: trust-splice.o $(OBJECTS) $(LIBBAM)
	$(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) $(OBJECTS) trust-splice.o $(LINKFLAGS)

vote-transcripts: vote-transcripts.o $(OBJECTS) $(LIBBAM)
	$(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) $(OBJECTS) vote-transcripts.o $(LINKFLAGS)

junc: junc.o $(LIBBAM)
	$(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) junc.o $(LINKFLAGS)

grader: grader.o $(LIBBAM)
	$(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) grader.o $(LINKFLAGS)

addXS: addXS.o $(LIBBAM)
	$(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) addXS.o $(LINKFLAGS)

add-genename: add-genename.o $(LIBBAM)
	$(CXX) -o $@ $(LINKPATH) $(CXXFLAGS) add-genename.o $(LINKFLAGS)

# Compile rules: libraries are only needed when linking, so $(LINKFLAGS) is not passed here.
subexon-info.o: SubexonInfo.cpp alignments.hpp blocks.hpp support.hpp defs.h stats.hpp
	$(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $<
combine-subexons.o: CombineSubexons.cpp alignments.hpp blocks.hpp support.hpp defs.h stats.hpp SubexonGraph.hpp
	$(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $<
stats.o: stats.cpp stats.hpp
	$(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $<
subexon-graph.o: SubexonGraph.cpp SubexonGraph.hpp
	$(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $<
constraints.o: Constraints.cpp Constraints.hpp SubexonGraph.hpp alignments.hpp BitTable.hpp
	$(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $<
transcript-decider.o: TranscriptDecider.cpp TranscriptDecider.hpp Constraints.hpp BitTable.hpp
	$(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $<
classes.o: classes.cpp SubexonGraph.hpp SubexonCorrelation.hpp BitTable.hpp Constraints.hpp alignments.hpp TranscriptDecider.hpp
	$(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $<
trust-splice.o: GetTrustedSplice.cpp alignments.hpp
	$(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $<
vote-transcripts.o: Vote.cpp TranscriptDecider.hpp
	$(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $<
junc.o: FindJunction.cpp
	$(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $<
grader.o: grader.cpp
	$(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $<
addXS.o: AddXS.cpp
	$(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $<
add-genename.o: AddGeneName.cpp
	$(CXX) -c -o $@ $(LINKPATH) $(CXXFLAGS) $<

clean:
	rm -f *.o *.gch subexon-info combine-subexons classes trust-splice vote-transcripts junc grader add-genename addXS
#!/usr/bin/env perl
# PsiCLASS-1.0.2/ManipulateIntronFile.pl
#
# Prints the introns of the first file whose "chrom start end" key also occurs in the
# second file, ordered by first appearance of the chromosome in the first file, then by
# start and end.

use strict ;
use warnings ;

# Both intron files are read below.
die "usage: a.pl intronA intronB [op]\n" if ( @ARGV < 2 ) ;

# "chrom start end" => columns of the record from intronA plus a trailing membership
# flag (bit 1: seen in intronA, bit 2: seen in intronB).
my %intronInfo ;
# chromosome name => order of first appearance in intronA.
my %chromRank ;

# Sort comparator for the keys of %intronInfo.
sub sortIntron
{
	my @cols1 = split /\s+/, $a ;
	my @cols2 = split /\s+/, $b ;

	if ( $cols1[0] ne $cols2[0] )
	{
		# The ranks are integers, so they are compared numerically.
		$chromRank{ $cols1[0] } <=> $chromRank{ $cols2[0] } ;
	}
	elsif ( $cols1[1] != $cols2[1] )
	{
		$cols1[1] <=> $cols2[1] ;
	}
	else
	{
		$cols1[2] <=> $cols2[2] ;
	}
}

open( my $fpA, '<', $ARGV[0] ) or die "Could not open $ARGV[0]: $!\n" ;
my $cnt = 0 ;
while ( <$fpA> )
{
	chomp ;
	my @cols = split /\s+/ ;
	next if ( @cols < 3 ) ; # the key needs chrom, start and end
	push @cols, 1 ;
	@{ $intronInfo{ $cols[0]." ".$cols[1]." ".$cols[2] } } = @cols ;
	if ( !defined $chromRank{ $cols[0]} )
	{
		$chromRank{ $cols[0] } = $cnt ;
		++$cnt ;
	}
}
close $fpA ;

open( my $fpB, '<', $ARGV[1] ) or die "Could not open $ARGV[1]: $!\n" ;
while ( <$fpB> )
{
	chomp ;
	my @cols = split /\s+/ ;
	next if ( @cols < 3 ) ;
	my $key = $cols[0]." ".$cols[1]." ".$cols[2] ;
	next if ( !defined $intronInfo{ $key } ) ;
	my @infoCols = @{ $intronInfo{ $key } } ;
	# Take the strand from intronB only when intronA does not give "+" or "-".
	$infoCols[4] = $cols[4] if ( $infoCols[4] ne "+" && $infoCols[4] ne "-" ) ;
	# The flag is the last element, whatever the number of columns.
	$infoCols[-1] |= 2 ;

	@{ $intronInfo{ $key } } = @infoCols ;
}
close $fpB ;

foreach my $key (sort sortIntron keys %intronInfo )
{
	my @infoCols = @{ $intronInfo{ $key } } ;
	#print join( " ", @infoCols ), "\n" ;

	# Keep only the introns seen in both files, and drop the flag before printing.
	next if ( $infoCols[-1] != 3 ) ;
	pop @infoCols ;
	print join( " ", @infoCols ), "\n" ;
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/README.md Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,118 @@ +PsiCLASS +======= + +Described in: + +Song, L., Sabunciyan, S., Yang, G. and Florea, L. [A multi-sample approach increases the accuracy of transcript assembly](https://www.nature.com/articles/s41467-019-12990-0). *Nat Commun* 10, 5000 (2019) + + Copyright (C) 2018- and GNU GPL by Li Song, Liliana Florea + +Includes portions copyright from: + + samtools - Copyright (C) 2008-, Genome Research Ltd, Heng Li + +Commands, scripts and supporting data for the paper can be found [here](https://github.com/splicebox/PsiCLASS_paper/). + +### What is PsiCLASS? + +PsiCLASS is a reference-based transcriptome assembler for single or multiple RNA-seq samples. Unlike conventional methods that analyze each sample separately and then merge the outcomes to create a unified set of meta-annotations, PsiCLASS takes a multi-sample approach, simultaneously analyzing all RNA-seq data sets in an experiment. PsiCLASS is both a transcript assembler and a meta-assembler, producing separate transcript sets for the individual samples and a unified set of meta-annotations. The algorithmic underpinnings of PsiCLASS include using a global subexon splice graph, statistical cross-sample feature (intron, subexon) selection methods, and an efficient dynamic programming algorithm to select a subset of transcripts from among those encoded in the graph, based on the read support in each sample. Lastly, the set of meta-annotations is selected from among the transcripts generated for individual samples by voting. While PsiCLASS is highly accurate and efficient for medium-to-large collections of RNA-seq data, its accuracy is equally high for small RNA-seq data sets (2-10 samples) and is competitive to reference methods for single samples. 
Additionally, its performance is robust with the aggregation method used, including the built-in voting and assembly-based approaches such as StringTie-merge and TACO. Therefore, it can be effectively used as a multi-sample and as a single-sample assembler, as well as in conventional assemble-and-merge protocols. + +### Install + +1. Clone the [GitHub repo](https://github.com/splicebox/psiclass), e.g. with `git clone https://github.com/splicebox/psiclass.git` +2. Run `make` in the repo directory + +You will find the executable files in the downloaded directory. If you want to run PsiCLASS without specifying the directory, you can either add the directory of PsiCLASS to the environment variable PATH or create a soft link ("ln -s") of the file "psiclass" to a directory in PATH. + +PsiCLASS depends on [pthreads](http://en.wikipedia.org/wiki/POSIX_Threads) and samtools depends on [zlib](http://en.wikipedia.org/wiki/Zlib). + + +### Usage + + Usage: ./psiclass [OPTIONS] + Required: + -b STRING: paths to the alignment BAM files; use comma to separate multiple BAM files + or + --lb STRING: path to the file listing the alignment BAM files + Optional: + -s STRING: path to the trusted splice file (default: not used) + -o STRING: prefix of output files (default: ./psiclass) + -p INT: number of threads (default: 1) + -c FLOAT: only use the subexons with classifier score <= the given number. (default: 0.05) + --sa FLOAT: the minimum average number of supporting reads for retained introns (default: 0.5) + --vd FLOAT: the minimum average coverage depth of a transcript to be reported in voting (default: 1.0) + --maxDpConstraintSize: the number of subexons a constraint can cover in DP. (default: 7. 
-1 for inf) + --primaryParalog: use primary alignment to retain paralog genes (default: use unique alignments) + --version: print version and exit + --stage INT: (default: 0) + 0-start from the beginning - building the splice site file for each sample + 1-start from building the subexon file for each sample + 2-start from combining the subexon files across samples + 3-start from assembling the transcripts for each sample + 4-start from voting the consensus transcripts across samples + +### Practical notes + +*Alignment compatibility.* PsiCLASS has been tuned to run on alignments generated with the tools [HISAT](https://ccb.jhu.edu/software/hisat/index.shtml) and [STAR](https://github.com/alexdobin/STAR). + +When running PsiCLASS with STAR alignments, run STAR with the option `--outSAMstrandField intronMotif`, which will include the XS field indicating the strand in the BAM alignments. Further, when including alignments with *non-canonical splice sites*, use the provided `addXS` executable to add the XS field: + + samtools view -h in.bam | ./addXS reference_genome.fa | samtools view -bS - > out.bam + +*Trusted introns from other sources.* By default, PsiCLASS determines a set of trusted introns from the input spliced alignments, to use in building the global subexon graph. Alternatively, the user can supply an external set of trusted introns, for instance extracted from the GENCODE gene annotations or judiciously selected from the input data using a tool like [JULiP](https://github.com/Guangyu-Yang/JULiP). This file must contain three columns: + + chr_name start_site end_site + +*Voting optimization.* The default parameters for voting have been calibrated and perform near-optimally for a wide variety of data, including with varying levels of coverage and different library construction protocols. 
However, if further optimization is desired, to determine a better cutoff value one can run the voting stage (see [Usage](#usage) above) with different parameter values, and assess the performance against a reference set of gene annotations, such as [GENCODE](https://www.gencodegenes.org). The program 'grader', included in the package, can be used for this purpose. Note that the per sample sets of transcripts will remain unchanged. + +*Add gene name.* For many applications, it would be desirable to associate the known (annotated) gene name with each transcript. PsiCLASS provides the program "add-genename" for such purpose. "add-genename" takes as input a GTF file containing a reference set of gene annotations and a file listing the raw GTF files, and generates a new GTF file for each input raw GTF file by appending the annotated gene names. If a gene is not found in the annotation, "add-genename" will use "novel_INT" to represent its gene name. The program can be run as: + + ./add-genename annotation.gtf gtflist + +### Input/Output + +The primary input to PsiCLASS is a set of BAM alignment files, one for each RNA-seq sample in the analysis. The program calculates a set of subexon files and a set of splice (intron) files, for the individual samples. (Optionally, one may specify a path to an external file of trusted introns as explained [above](#practical-notes).) The output consists of one GTF file of transcripts for each sample, and the GTF file of meta-annotations produced by voting, stored in the output directory: + + Sample-wise GTF files: (psiclass)_sample_{0,1,...,n-1}.gtf + Meta-assembly GTF file: (psiclass)_vote.gtf + +where indices 0,1,...,n-1 match the order of the input BAM files. 
+ +Subexon and splice (intron) files, and other auxiliary files, are in the subdirectories: + + Intron files: splice/* + Subexon graph files: subexon/* + Log file: (psiclass)_classes.log + +### Example + +The directory './example' in this distribution contains two BAM files, along with an example of a BAM list file. Run PsiCLASS with: + + ./psiclass -b example/s1.bam,example/s2.bam + +or + + ./psiclass --lb example/slist + +The run will generate the files 'psiclass_sample_0.gtf' for 's1.bam', 'psiclass_sample_1.gtf' for 's2.bam', and the file 'psiclass_vote.gtf' containing the meta-assemblies. + +### Terms of use + +This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received (LICENSE.txt) a copy of the GNU General +Public License along with this program; if not, you can obtain one from +http://www.gnu.org/licenses/gpl.txt or by writing to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +### Support + +Create a [GitHub issue](https://github.com/splicebox/PsiCLASS/issues).
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/SubexonCorrelation.hpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,270 @@ +#ifndef _MOURISL_CLASSES_SUBEXONCORRELATION_HEADER +#define _MOURISL_CLASSES_SUBEXONCORRELATION_HEADER + +#include "SubexonGraph.hpp" + +#include <stdio.h> +#include <math.h> + +class SubexonCorrelation +{ +private: + struct _subexon *lastSubexons ; + std::vector<FILE *> fileList ; + int offset ; + int prevSeCnt ; + int seCnt ; + + double **correlation ; + + int OverlapSize( int s0, int e0, int s1, int e1 ) + { + int s = -1, e = -1 ; + if ( e0 < s1 || s0 > e1 ) + return 0 ; + s = s0 > s1 ? s0 : s1 ; + e = e0 < e1 ? e0 : e1 ; + return e - s + 1 ; + } +public: + SubexonCorrelation() + { + offset = 0 ; + lastSubexons = NULL ; + correlation = NULL ; + prevSeCnt = 0 ; + } + + ~SubexonCorrelation() + { + int cnt = fileList.size() ; + int i ; + for ( i = 0 ; i < cnt ; ++i ) + fclose( fileList[i] ) ; + delete[] lastSubexons ; + } + + void Initialize( char *f ) + { + FILE *fpSl ; + fpSl = fopen( f, "r" ) ; + char buffer[1024] ; + while ( fgets( buffer, sizeof( buffer ), fpSl ) != NULL ) + { + int len = strlen( buffer ) ; + if ( buffer[len - 1] == '\n' ) + { + buffer[len - 1] = '\0' ; + --len ; + + } + FILE *fp = fopen( buffer, "r" ) ; + fileList.push_back( fp ) ; + } + + lastSubexons = new struct _subexon[ fileList.size() ] ; + int i, cnt ; + cnt = fileList.size() ; + for ( i = 0 ; i < cnt ; ++i ) + lastSubexons[i].chrId = -1 ; + + } + + // We assume the subexons only contains the subexons we are interested in. + // And we assume that each time we call this function, the subexons we are interested are + // sorted in order. + void ComputeCorrelation( struct _subexon *subexons, int cnt, Alignments &alignments ) + { + int sampleCnt = fileList.size() ; + if ( sampleCnt <= 1 ) + return ; + int i, j, k ; + seCnt = cnt ; + + // Obtain the depth matrix. 
+ double **depth ; + depth = new double* [sampleCnt] ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + depth[i] = new double[ cnt ] ; + memset( depth[i], 0, sizeof( double ) * cnt ) ; + } + + char buffer[2048] ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + struct _subexon se ; + bool useLastSubexon = false ; + if ( lastSubexons[i].chrId != -1 ) + { + se = lastSubexons[i] ; + useLastSubexon = true ; + } + else + se.chrId = -1 ; + + // Locate the subexon + while ( 1 ) + { + if ( se.chrId < subexons[0].chrId + || ( se.chrId == subexons[0].chrId && se.end < subexons[0].start ) ) + { + if ( fgets( buffer, sizeof( buffer ), fileList[i] ) == NULL ) + { + buffer[0] = '\0' ; + break ; + } + if ( buffer[0] == '#' ) + continue ; + + SubexonGraph::InputSubexon( buffer, alignments, se ) ; + --se.start ; --se.end ; + useLastSubexon = false ; + continue ; + } + break ; + } + if ( buffer[0] == '\0' ) + { + lastSubexons[i] = se ; + continue ; + } + + // Process the subexon. + int tag = 0 ; + while ( 1 ) + { + lastSubexons[i] = se ; + + if ( useLastSubexon == false ) + { + SubexonGraph::InputSubexon( buffer, alignments, se ) ; + --se.start ; --se.end ; + } + + if ( se.chrId > subexons[cnt - 1].chrId || se.start > subexons[cnt - 1].end ) + break ; + + while ( 1 ) + { + if ( tag > cnt || subexons[tag].end >= se.start ) + break ; + ++tag ; + } + + for ( j = tag ; j < cnt && subexons[j].start <= se.end ; ++j ) + { + int overlap = OverlapSize( se.start, se.end, subexons[j].start, subexons[j].end ) ; + depth[i][j] += overlap * se.avgDepth ; + } + + if ( fgets( buffer, sizeof( buffer ), fileList[i] ) == NULL ) + break ; + } + } + // Normalize the depth + for ( i = 0 ; i < sampleCnt ; ++i ) + { + for ( j = 0 ; j < cnt ; ++j ) + depth[i][j] /= ( subexons[j].end - subexons[j].start + 1 ) ; + } + + // Compute the correlation. 
+ double *avg = new double[cnt] ; + double *var = new double[cnt] ; + if ( correlation != NULL ) + { + for ( i = 0 ; i < prevSeCnt ; ++i ) + delete[] correlation[i] ; + delete[] correlation ; + } + + correlation = new double*[cnt] ; + for ( i = 0 ; i < cnt ; ++i ) + correlation[i] = new double[cnt] ; + + memset( avg, 0, sizeof( double ) * cnt ) ; + memset( var, 0, sizeof( double ) * cnt ) ; + for ( i = 0 ; i < cnt ; ++i ) + { + //avg[i] = 0 ; + //var[i] = 0 ; + memset( correlation[i], 0, sizeof( double ) * cnt ) ; + correlation[i][i] = 1 ; + } + + for ( i = 0 ; i < sampleCnt ; ++i ) + for ( j = 0 ; j < cnt ; ++j ) + { + avg[j] += depth[i][j] ; + var[j] += depth[i][j] * depth[i][j] ; + } + for ( i = 0 ; i < cnt ; ++i ) + { + avg[i] /= sampleCnt ; + var[i] = var[i] / sampleCnt - avg[i] * avg[i] ; + } + + for ( i = 0 ; i < sampleCnt ; ++i ) + for ( j = 0 ; j < cnt ; ++j ) + for ( k = j + 1 ; k < cnt ; ++k ) + { + //if ( subexons[j].geneId == subexons[k].geneId ) + correlation[j][k] += depth[i][j] * depth[i][k] ; + } + for ( j = 0 ; j < cnt ; ++j ) + for ( k = j + 1 ; k < cnt ; ++k ) + { + if ( var[j] > 1e-6 && var[k] > 1e-6 ) + { + correlation[j][k] = ( correlation[j][k] / sampleCnt - avg[j] * avg[k] ) / sqrt( var[j] * var[k] ) ; + } + else + correlation[j][k] = 0 ; + //printf( "%d %d %d\n", j, k, cnt ) ; + correlation[k][j] = correlation[j][k] ; + } + //printf( "%lf\n", correlation[0][1] ) ; + // Release the memory. 
+ for ( i = 0 ; i < sampleCnt ; ++i ) + delete[] depth[i] ; + delete[] depth ; + delete[] avg ; + delete[] var ; + + prevSeCnt = cnt ; + } + + double Query( int i, int j ) + { + if ( fileList.size() > 1 ) + return correlation[i][j] ; + else + return 0 ; + } + + void Assign( const SubexonCorrelation &c ) + { + int i ; + fileList = c.fileList ; + if ( fileList.size() <= 1 ) + return ; + + if ( correlation != NULL ) + { + for ( i = 0 ; i < seCnt ; ++i ) + delete[] correlation[i] ; + delete[] correlation ; + } + + seCnt = c.seCnt ; + correlation = new double*[seCnt] ; + for ( i = 0 ; i < seCnt ; ++i ) + { + correlation[i] = new double[seCnt] ; + memcpy( correlation[i], c.correlation[i], sizeof( double ) * seCnt ) ; + } + } +} ; + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/SubexonGraph.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,241 @@ +#include "SubexonGraph.hpp" + +void SubexonGraph::GetGeneBoundary( int tag, int &boundary, int timeStamp ) +{ + if ( visit[tag] == timeStamp ) + return ; + //printf( "%d %d\n", tag, timeStamp ) ; + visit[tag] = timeStamp ; + if ( subexons[tag].end > boundary ) + boundary = subexons[tag].end ; + //if ( subexons[tag].start == 2858011 ) + // printf( "%d: %d %d\n", tag, subexons[tag].nextCnt, subexons[tag].prevCnt) ; + int i ; + int cnt = subexons[tag].nextCnt ; + for ( i = 0 ; i < cnt ; ++i ) + { + //printf( "next of %d: %d %d\n", tag, i, subexons[tag].next[i] ) ; + GetGeneBoundary( subexons[tag].next[i], boundary, timeStamp ) ; + } +} + +int SubexonGraph::GetGeneIntervalIdx( int startIdx, int &endIdx, int timeStamp ) +{ + int i ; + int seCnt = subexons.size() ; + if ( startIdx >= seCnt ) + return -1 ; + int farthest = -1 ; + GetGeneBoundary( startIdx, farthest, timeStamp ) ; + + for ( i = startIdx + 1 ; i < seCnt ; ++i ) + { + if ( subexons[i].start > farthest || subexons[i].chrId != subexons[ startIdx ].chrId ) + break ; + + GetGeneBoundary( i, farthest, timeStamp ) ; + } + endIdx = i - 1 ; + + return endIdx ; +} + +int SubexonGraph::ComputeGeneIntervals() +{ + int i, cnt ; + int seCnt = subexons.size() ; + visit = new int[seCnt] ; + memset( visit, -1, sizeof( int ) * seCnt ) ; + int tag = 0 ; + cnt = 0 ; + while ( 1 ) + { + struct _geneInterval ngi ; + //printf( "%d %d %d\n", tag, subexons[tag].start + 1, subexons[tag].end + 1 ) ; + if ( GetGeneIntervalIdx( tag, ngi.endIdx, cnt ) == -1 ) + break ; + ++cnt ; + ngi.startIdx = tag ; + ngi.start = subexons[ ngi.startIdx ].start ; + ngi.end = subexons[ ngi.endIdx ].end ; + + tag = ngi.endIdx + 1 ; + // Adjust the extent + // Adjust the start + if ( subexons[ ngi.startIdx ].leftStrand != 0 + && subexons[ngi.startIdx].leftStrand != subexons[ngi.startIdx ].rightStrand ) + // We should make 
sure that rightstrand is non-zero whenever left-strand is non-zero for the startIdx. + { + for ( i = ngi.startIdx ; i >= 0 ; --i ) + { + if ( ( subexons[i].leftType == 1 && subexons[i].leftClassifier < classifierThreshold ) // an end within the subexon + || ( subexons[i].leftType == 0 ) // probably a overhang subexon. It should be a subset of the criterion following. + || ( i > 0 && subexons[i - 1].end + 1 < subexons[i].start ) ) // a gap. + break ; + } + ngi.start = subexons[i].start ; + } + + // Adjust the end. + // And here, we also need to decide wether we need to adjust "tag" or not, + // because the next interval might be overlap with current interval by the last subexon. + // We solve the overlap genes now, so we DON'T need to adjust tag. + if ( subexons[ ngi.endIdx ].rightStrand != 0 + && subexons[ngi.endIdx].leftStrand != subexons[ngi.endIdx ].rightStrand ) + { + for ( i = ngi.endIdx ; i < seCnt ; ++i ) + { + if ( ( subexons[i].rightType == 2 && subexons[i].rightClassifier < classifierThreshold ) // an end within the subexon + || ( subexons[i].rightType == 0 ) // probably a overhang subexon. + || ( i < seCnt - 1 && subexons[i].end + 1 < subexons[i + 1].start ) ) // a gap + break ; + } + ngi.end = subexons[i].end ; + + /*if ( subexons[ ngi.endIdx ].rightType == 2 ) + { + for ( i = ngi.endIdx ; i >= ngi.startIdx ; --i ) + { + if ( subexons[i].leftType == 1 ) + break ; + } + // The last region overlapps. 
+ if ( i >= ngi.startIdx && subexons[i].leftStrand != subexons[ ngi.endIdx ].rightStrand ) + --tag ; + }*/ + } + geneIntervals.push_back( ngi ) ; + } + delete[] visit ; + + return cnt ; +} + +int SubexonGraph::ExtractSubexons( int startIdx, int endIdx, struct _subexon *retList ) +{ + int i, j, k ; + int cnt = endIdx - startIdx + 1 ; + //printf( "%s: %d %d %d\n", __func__, startIdx, endIdx, cnt ) ; + for ( i = 0 ; i < cnt ; ++i ) + { + retList[i] = subexons[i + startIdx] ; + retList[i].geneId = -1 ; + retList[i].prev = new int[ retList[i].prevCnt ] ; + retList[i].next = new int[ retList[i].nextCnt ] ; + + for ( j = 0 ; j < retList[i].prevCnt ; ++j ) + retList[i].prev[j] = subexons[i + startIdx].prev[j] - startIdx ; + for ( j = 0 ; j < retList[i].nextCnt ; ++j ) + retList[i].next[j] = subexons[i + startIdx].next[j] - startIdx ; + + for ( j = 0, k = 0 ; j < retList[i].prevCnt ; ++j ) + if ( retList[i].prev[j] >= 0 && retList[i].prev[j] < cnt ) + { + retList[i].prev[k] = retList[i].prev[j] ; + ++k ; + } + retList[i].prevCnt = k ; + + for ( j = 0, k = 0 ; j < retList[i].nextCnt ; ++j ) + if ( retList[i].next[j] >= 0 && retList[i].next[j] < cnt ) + { + retList[i].next[k] = retList[i].next[j] ; + ++k ; + } + retList[i].nextCnt = k ; + } + UpdateGeneId( retList, cnt ) ; + return cnt ; +} + +void SubexonGraph::SetGeneId( int tag, int strand, struct _subexon *subexons, int seCnt, int id ) +{ + if ( subexons[tag].geneId != -1 && subexons[tag].geneId != -2 ) + { + if ( subexons[tag].geneId != id ) // a subexon may belong to more than one gene. + { + //printf( "Set -2, %d: %d %d %d %d\n", id, tag, subexons[tag].geneId, subexons[tag].start + 1, strand ) ; + subexons[tag].geneId = -2 ; + } + else + return ; + // There is no need to terminate at the ambiguous exon, the strand will prevent + // us from overwriting previous gene ids. 
+ //return ; + } + else if ( subexons[tag].geneId == -2 ) + return ; + //printf( "%d: %d %d %d %d\n", id, tag, subexons[tag].geneId, subexons[tag].start + 1, strand ) ; + int i ; + if ( subexons[tag].geneId != -2 ) + subexons[ tag ].geneId = id ; + int cnt = subexons[tag].nextCnt ; + // Set through the introns. + if ( IsSameStrand( strand, subexons[tag].rightStrand ) ) + { + for ( i = 0 ; i < cnt ; ++i ) + if ( subexons[ subexons[tag].next[i] ].start > subexons[tag].end + 1 ) + SetGeneId( subexons[tag].next[i], strand, subexons, seCnt, id ) ; + } + + cnt = subexons[tag].prevCnt ; + if ( IsSameStrand( strand, subexons[tag].leftStrand ) ) + { + for ( i = 0 ; i < cnt ; ++i ) + if ( subexons[ subexons[tag].prev[i] ].end < subexons[tag].start - 1 ) + SetGeneId( subexons[tag].prev[i], strand, subexons, seCnt, id ) ; + } + + // Set through the adjacent subexons. + if ( tag < seCnt - 1 && subexons[tag + 1].start == subexons[tag].end + 1 ) + { + SetGeneId( tag + 1, strand, subexons, seCnt, id ) ; + } + + if ( tag > 0 && subexons[tag].start - 1 == subexons[tag - 1].end ) + { + SetGeneId( tag - 1, strand, subexons, seCnt, id ) ; + } + +} + +void SubexonGraph::UpdateGeneId( struct _subexon *subexons, int seCnt ) +{ + int i ; + baseGeneId = usedGeneId ; + int lastMinusStrandGeneId = -1 ; + for ( int strand = -1 ; strand <= 1 ; strand +=2 ) + { + for ( i = 0 ; i < seCnt ; ++i ) + { + //printf( "%d (%d %d) %d.\n", i, subexons[i].start + 1, subexons[i].end + 1, subexons[i].geneId ) ; + if ( ( subexons[i].geneId == -1 && ( ( strand == 1 && subexons[i].rightStrand == 0 ) || subexons[i].rightStrand == strand ) ) + || ( strand == 1 && baseGeneId <= subexons[i].geneId && subexons[i].geneId <= lastMinusStrandGeneId && subexons[i].rightStrand == strand ) ) + { + SetGeneId( i, strand, subexons, seCnt, usedGeneId ) ; + if ( strand == -1 ) + lastMinusStrandGeneId = usedGeneId ; + ++usedGeneId ; + } + } + } + + for ( i = 0 ; i < seCnt ; ++i ) + if ( subexons[i].leftType == 0 && 
subexons[i].rightType == 0 ) + { + subexons[i].geneId = usedGeneId ; + ++usedGeneId ; + } + // Put base and usedGeneId in lcCnt, rcCnt field. + for ( i = 0 ; i < seCnt ; ++i ) + { + subexons[i].lcCnt = baseGeneId ; + subexons[i].rcCnt = usedGeneId ; + } + + /*for ( i = 0 ; i < seCnt ; ++i ) + { + printf( "geneId %d: %d-%d %d\n", i, subexons[i].start + 1, subexons[i].end + 1, subexons[i].geneId ) ; + } + printf("%d %d\n", baseGeneId, usedGeneId ) ;*/ +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/SubexonGraph.hpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,348 @@ +#ifndef _MOURISL_CLASSES_SUBEXONGRAPH_HEADER +#define _MOURISL_CLASSES_SUBEXONGRAPH_HEADER + +#include "alignments.hpp" +#include "blocks.hpp" + +struct _subexon +{ + int chrId ; + int geneId ; + int start, end ; + int leftType, rightType ; + double avgDepth ; + //double ratio, classifier ; + double leftRatio, rightRatio ; + double leftClassifier, rightClassifier ; + int lcCnt, rcCnt ; + int leftStrand, rightStrand ; + + int nextCnt, prevCnt ; + int *next, *prev ; + + bool canBeStart, canBeEnd ; +} ; + +struct _geneInterval +{ + int startIdx, endIdx ; + int start, end ; // The start and end of a gene interval might be adjusted, so it does not + // need to be match with the corresponding subexons +} ; + +class SubexonGraph +{ +private: + int *visit ; + double classifierThreshold ; + + int usedGeneId ; + int baseGeneId ; + + // The function to assign gene ids to subexons. + void SetGeneId( int tag, int strand, struct _subexon *subexons, int seCnt, int id ) ; + void GetGeneBoundary( int tag, int &boundary, int timeStamp ) ; + void UpdateGeneId( struct _subexon *subexons, int seCnt ) ; +public: + std::vector<struct _subexon> subexons ; + std::vector<struct _geneInterval> geneIntervals ; + + ~SubexonGraph() + { + int i ; + int size = subexons.size() ; + for ( i = 0 ; i < size ; ++i ) + { + if ( subexons[i].next ) + delete[] subexons[i].next ; + if ( subexons[i].prev ) + delete[] subexons[i].prev ; + } + } + + SubexonGraph( double classifierThreshold, Alignments &bam, FILE *fpSubexon ) + { + // Read in the subexons + rewind( fpSubexon ) ; + char buffer[2048] ; + int subexonCnt ; + int i, j, k ; + while ( fgets( buffer, sizeof( buffer ), fpSubexon ) != NULL ) + { + if ( buffer[0] == '#' ) + continue ; + + struct _subexon se ; + InputSubexon( buffer, bam, se, true ) ; + + // filter. 
+ if ( ( se.leftType == 0 && se.rightType == 0 ) + || ( se.leftType == 0 && se.rightType == 1 ) // overhang + || ( se.leftType == 2 && se.rightType == 0 ) // overhang + || ( se.leftType == 2 && se.rightType == 1 ) ) // ir + { + if ( ( se.leftType == 0 && se.rightType == 1 ) + || ( se.leftType == 2 && se.rightType == 0 ) ) // if the overhang is too small + { + if ( se.end - se.start + 1 <= 7 ) + { + if ( se.next ) + delete[] se.next ; + if ( se.prev ) + delete[] se.prev ; + continue ; + } + } + + if ( se.leftClassifier >= classifierThreshold || se.leftClassifier < 0 ) + { + if ( se.next ) + delete[] se.next ; + if ( se.prev ) + delete[] se.prev ; + continue ; + } + } + + // Adjust the coordinate. + subexons.push_back( se ) ; + } + + // Convert the coordinate to index + // Note that each coordinate can only associate with one subexon. + subexonCnt = subexons.size() ; + for ( i = 0 ; i < subexonCnt ; ++i ) + { + struct _subexon &se = subexons[i] ; + //printf( "hi1 %d: %d %d\n", i, se.prevCnt, se.prev[0] ) ; + int cnt = 0 ; + + // due to filter, we may not fully match the coordinate and the subexon + int bound = 0 ; + if ( se.prevCnt > 0 ) + bound = se.prev[0] ; + for ( j = i - 1, k = 0 ; k < se.prevCnt && j >= 0 && subexons[j].end >= bound ; --j ) + { + //printf( " %d %d: %d %d\n", j, k, se.prev[ se.prevCnt - 1 - k], subexons[j].end ) ; + if ( subexons[j].end == se.prev[se.prevCnt - 1 - k] ) // notice the order is reversed + { + se.prev[se.prevCnt - 1 - cnt] = j ; + ++k ; + ++cnt ; + } + else if ( subexons[j].end < se.prev[ se.prevCnt - 1 - k ] ) // the corresponding subexon gets filtered. 
+ { + ++k ; + ++j ; // counter the --j in the loop + } + } + //printf( "hi2 %d : %d\n", i, se.prevCnt ) ; + // shft the list + for ( j = 0, k = se.prevCnt - cnt ; j < cnt ; ++j, ++k ) + { + se.prev[j] = se.prev[k] ; + } + se.prevCnt = cnt ; + cnt = 0 ; + if ( se.nextCnt > 0 ) + bound = se.next[ se.nextCnt - 1] ; + for ( j = i + 1, k = 0 ; k < se.nextCnt && j < subexonCnt && subexons[j].start <= bound ; ++j ) + { + if ( subexons[j].start == se.next[k] ) + { + se.next[cnt] = j ; // cnt is always less than k, so we don't need to worry about overwrite. + ++k ; + ++cnt ; + } + else if ( subexons[j].start > se.next[k] ) + { + ++k ; + --j ; + } + } + se.nextCnt = cnt ; + } + + // Adjust the coordinate + int seCnt = subexons.size() ; + for ( i = 0 ; i < seCnt ; ++i ) + { + --subexons[i].start ; + --subexons[i].end ; + } + rewind( fpSubexon ) ; + + // Adjust the classifier for hard boundary, if there is a overhang attached to that region. + for ( i = 0 ; i < seCnt ; ++i ) + { + if ( subexons[i].leftType == 1 && subexons[i].leftClassifier < 1 ) + { + for ( j = i - 1 ; j >= 0 ; --j ) + if ( subexons[j].end < subexons[j + 1].start - 1 ) + break ; + if ( subexons[j + 1].leftType == 0 ) + subexons[i].leftClassifier = 1 ; + } + if ( subexons[i].rightType == 2 && subexons[i].rightClassifier < 1 ) + { + for ( j = i + 1 ; j < seCnt ; ++j ) + if ( subexons[j].start > subexons[j - 1].end + 1 ) + break ; + if ( subexons[j - 1].rightType == 0 ) + subexons[i].rightClassifier = 1 ; + } + } + + // For the region of mixture of plus and minus strand subexons, if there is + // no overhang attached to it, we need to let the hard boundary be a candidate terminal sites. 
+ for ( i = 0 ; i < seCnt ; ) + { + // [i,j) is a region + int support[2] = {0, 0} ; // the index, 0 is for minus strand, 1 is for plus strand + for ( j = i + 1 ; j < seCnt ; ++j ) + { + if ( subexons[j].start > subexons[j - 1].end + 1 ) + break ; + } + + for ( k = i ; k < j ; ++k ) + { + if ( subexons[k].leftStrand != 0 ) + ++support[ ( subexons[k].leftStrand + 1 ) / 2 ] ; + if ( subexons[k].rightStrand != 0 ) + ++support[ ( subexons[k].rightStrand + 1 ) / 2 ] ; + } + if ( support[0] == 0 || support[1] == 0 ) + { + i = j ; + continue ; + } + // a mixture region. + // We force a terminal site if we have only coming-in and no going-out introns. + int leftSupport[2] = {0, 0}, rightSupport[2] = {0, 0}; + int l ; + for ( k = i ; k < j ; ++k ) + { + int cnt = subexons[k].prevCnt ; + if ( subexons[k].leftStrand != 0 ) + for ( l = 0 ; l < cnt ; ++l ) + if ( subexons[k].prev[l] < i ) + { + ++leftSupport[ ( subexons[k].leftStrand + 1 ) / 2 ] ; + break ; + } + cnt = subexons[k].nextCnt ; + if ( subexons[k].rightStrand != 0 ) + for ( l = 0 ; l < cnt ; ++l ) + if ( subexons[k].next[l] >= j ) + { + ++rightSupport[ ( subexons[k].rightStrand + 1 ) / 2 ] ; + break ; + } + } + + if ( ( ( leftSupport[0] > 0 && rightSupport[0] == 0 ) || + ( leftSupport[1] > 0 && rightSupport[1] == 0 ) ) && + subexons[j - 1].rightType != 0 ) + { + subexons[j - 1].rightClassifier = 0 ; + } + + if ( ( ( leftSupport[0] == 0 && rightSupport[0] > 0 ) || + ( leftSupport[1] == 0 && rightSupport[1] > 0 ) ) && + subexons[j - 1].leftType != 0 ) + { + subexons[j - 1].leftClassifier = 0 ; + } + + i = j ; + } + + this->classifierThreshold = classifierThreshold ; + + usedGeneId = baseGeneId = 0 ; + } + + static bool IsSameStrand( int a, int b ) + { + if ( a == 0 || b == 0 ) + return true ; + if ( a != b ) + return false ; + return true ; + } + // Parse the input line + static int InputSubexon( char *in, Alignments &alignments, struct _subexon &se, bool needPrevNext = false ) + { + int i ; + char chrName[50] ; + 
char ls[3], rs[3] ; + sscanf( in, "%s %d %d %d %d %s %s %lf %lf %lf %lf %lf", chrName, &se.start, &se.end, &se.leftType, &se.rightType, ls, rs, + &se.avgDepth, &se.leftRatio, &se.rightRatio, + &se.leftClassifier, &se.rightClassifier ) ; + se.chrId = alignments.GetChromIdFromName( chrName ) ; + se.nextCnt = se.prevCnt = 0 ; + se.next = se.prev = NULL ; + se.lcCnt = se.rcCnt = 0 ; + + if ( ls[0] == '+' ) + se.leftStrand = 1 ; + else if ( ls[0] == '-' ) + se.leftStrand = -1 ; + else + se.leftStrand = 0 ; + + if ( rs[0] == '+' ) + se.rightStrand = 1 ; + else if ( rs[0] == '-' ) + se.rightStrand = -1 ; + else + se.rightStrand = 0 ; + + if ( needPrevNext ) + { + char *p = in ; + // Locate the offset for prevCnt + for ( i = 0 ; i <= 11 ; ++i ) + { + p = strchr( p, ' ' ) ; + ++p ; + } + + sscanf( p, "%d", &se.prevCnt ) ; + p = strchr( p, ' ' ) ; + ++p ; + se.prev = new int[ se.prevCnt ] ; + for ( i = 0 ; i < se.prevCnt ; ++i ) + { + sscanf( p, "%d", &se.prev[i] ) ; + p = strchr( p, ' ' ) ; + ++p ; + } + + sscanf( p, "%d", &se.nextCnt ) ; + p = strchr( p, ' ' ) ; + ++p ; + se.next = new int[ se.nextCnt ] ; + for ( i = 0 ; i < se.nextCnt ; ++i ) + { + sscanf( p, "%d", &se.next[i] ) ; + p = strchr( p, ' ' ) ; + ++p ; + } + + } + return 1 ; + } + + int GetGeneIntervalIdx( int startIdx, int &endIdx, int timeStamp ) ; + + //@return: the number of intervals found + int ComputeGeneIntervals() ; + + // Return a list of subexons in that interval and in retList the id of subexon + // should be adjusted to start from 0. + int ExtractSubexons( int startIdx, int endIdx, struct _subexon *retList ) ; +} ; + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/SubexonInfo.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,1214 @@ +// report the total depth for the segment +// Format: ./a.out alignments.bam splice_site +#include <stdio.h> +#include <string.h> +#include <stdlib.h> + +#define __STDC_FORMAT_MACROS +#include <inttypes.h> + +#include <algorithm> +#include <vector> +#include <math.h> + +#include "alignments.hpp" +#include "blocks.hpp" +#include "stats.hpp" + +#define ABS(x) ((x)<0?-(x):(x)) + +char usage[] = "./subexon-info alignment.bam intron.splice [options]\n" + "options:\n" + "\t--minDepth INT: the minimum coverage depth considered as part of a subexon (default: 2)\n" + "\t--noStats: do not compute the statistical scores (default: not used)\n" ; +char buffer[4096] ; + +int gMinDepth ; + +bool CompSplitSite( struct _splitSite a, struct _splitSite b ) +{ + if ( a.chrId < b.chrId ) + return true ; + else if ( a.chrId > b.chrId ) + return false ; + else if ( a.pos != b.pos ) + return a.pos < b.pos ; + else if ( a.type != b.type ) + return a.type < b.type ; // We want the start of exons comes first, since we are scanning the genome from left to right, + // so we can terminate the extension early and create a single-base subexon later. 
+ else + return a.oppositePos < b.oppositePos ; +} + +bool CompBlocksByAvgDepth( struct _block a, struct _block b ) +{ + double avgA = a.depthSum / (double)( a.end - a.start + 1 ) ; + double avgB = b.depthSum / (double)( b.end - b.start + 1 ) ; + return avgA < avgB ; +} + +bool CompBlocksByRatio( struct _block a, struct _block b ) +{ + return a.ratio < b.ratio ; +} + +int CompDouble( const void *p1, const void *p2 ) +{ + double a = *(double *)p1 ; + double b = *(double *)p2 ; + if ( a > b ) + return 1 ; + else if ( a < b ) + return -1 ; + else + return 0 ; +} + +// Clean up the split sites; +void FilterAndSortSplitSites( std::vector<struct _splitSite> &sites ) +{ + std::sort( sites.begin(), sites.end(), CompSplitSite ) ; + int i, j, k, l ; + int size = sites.size() ; + + for ( i = 0 ; i < size ; ) + { + for ( j = i + 1 ; j < size ; ++j ) + if ( sites[j].chrId != sites[i].chrId || sites[j].pos != sites[i].pos || sites[j].type != sites[i].type ) + break ; + int maxSupport = 0 ; + for ( k = i ; k < j ; ++k ) + if ( sites[k].support > maxSupport ) + maxSupport = sites[k].support ; + + int strandCnt[2] = {0, 0} ; + char strand = sites[i].strand ; + for ( k = i ; k < j ; ++k ) + { + if ( sites[k].strand == '-' ) + strandCnt[0] += sites[k].support ; + else if ( sites[k].strand == '+' ) + strandCnt[1] += sites[k].support ; + + } + if ( strandCnt[0] > strandCnt[1] ) + strand = '-' ; + else if ( strandCnt[1] > strandCnt[0] ) + strand = '+' ; + + bool allOneExceptMax = false ; + if ( maxSupport >= 20 ) + { + allOneExceptMax = true ; + for ( k = i ; k < j ; ++k ) + { + if ( sites[k].support == maxSupport ) + continue ; + if ( sites[k].support > 1 ) + { + allOneExceptMax = false ; + break ; + } + } + } + + for ( k = i ; k < j ; ++k ) + { + if ( ( sites[k].support < 0.01 * maxSupport && sites[k].support <= 3 ) + || sites[k].support < 0.001 * maxSupport || sites[k].strand != strand // The introns from the different strand are filtered. 
+ || ( sites[k].support < 0.02 * maxSupport && sites[k].mismatchSum >= 2 * sites[k].support ) + || ( allOneExceptMax && sites[k].support == 1 ) + || ( sites[k].support <= 2 && sites[k].mismatchSum >= 2 * sites[k].support ) + || ( maxSupport >= 2 && sites[k].support == 1 && ( ABS( sites[k].oppositePos - sites[k].pos - 1 ) >= 10000 || sites[k].mismatchSum != 0 ) ) ) + { + for ( l = i - 1 ; l >= 0 && sites[l].chrId == sites[i].chrId && sites[l].pos >= sites[k].oppositePos ; --l ) + { + if ( sites[l].pos == sites[k].oppositePos && sites[l].oppositePos == sites[k].pos ) + { + sites[l].support = -1 ; + sites[k].support = -1 ; + break ; + } + } + + for ( l = j ; l < size && sites[l].chrId == sites[i].chrId && sites[l].pos <= sites[k].oppositePos ; ++l ) + { + if ( sites[l].pos == sites[k].oppositePos && sites[l].oppositePos == sites[k].pos ) + { + sites[l].support = -1 ; + sites[k].support = -1 ; + break ; + } + } + } + /*else if ( sites[k].support <= 1 && sites[k].oppositePos - sites[k].pos + 1 >= 30000 ) + { + for ( l = j ; l < size && sites[l].chrId == sites[i].chrId && sites[l].pos <= sites[k].oppositePos ; ++l ) + { + if ( sites[l].pos == sites[k].oppositePos && sites[l].oppositePos == sites[k].pos ) + { + if ( l - j >= 20 ) + { + sites[l].support = -1 ; + sites[k].support = -1 ; + break ; + } + } + } + + }*/ + } + i = j ; + } + + k = 0 ; + for ( i = 0 ; i < size ; ++i ) + { + if ( sites[i].support > 0 ) + { + sites[k] = sites[i] ; + if ( sites[k].strand == '?' ) + sites[k].strand = '.' ; + ++k ; + } + } + sites.resize( k ) ; +} + +// Remove the same sites. 
+void KeepUniqSplitSites( std::vector< struct _splitSite> &sites ) +{ + int i, j ; + int size = sites.size() ; + int k = 0 ; + for ( i = 0 ; i < size ; ) + { + for ( j = i + 1 ; j < size ; ++j ) + if ( sites[j].chrId != sites[i].chrId || sites[j].pos != sites[i].pos || sites[j].type != sites[i].type ) + break ; + sites[k] = sites[i] ; + ++k ; + /*else + { + if ( sites[i].type != sites[k-1].type ) + { + printf( "%d\n", sites[i].pos ) ; + } + }*/ + i = j ; + } + sites.resize( k ) ; + + // For the sites that corresponds to the start of an exon, we remove the adjust to it and + /*for ( i = 1 ; i < size ; ++i ) + { + if ( sites[i].pos == sites[i - 1].pos && sites[i - 1].type == 2 ) + { + if ( sites[i].type == 1 ) + ++sites[i].pos ; + } + }*/ +} + +// Filter split sites that are extremely close to each other but on different strand. +void FilterNearSplitSites( std::vector< struct _splitSite> &sites ) +{ + int i, j ; + int size = sites.size() ; + int k = 0 ; + for ( i = 0 ; i < size - 1 ; ++i ) + { + if ( sites[i].support < 0 || sites[i].type != sites[i + 1].type || sites[i].chrId != sites[i + 1].chrId ) + continue ; + if ( sites[i + 1].pos - sites[i].pos <= 7 && + ( sites[i + 1].strand != sites[i].strand || + sites[i].strand == '?' 
) ) + { + int tag = i ; + if ( sites[i + 1].support < sites[i].support ) + tag = i + 1 ; + sites[tag].support = -1 ; + int direction ; + if ( sites[tag].oppositePos < sites[tag].pos ) + direction = -1 ; + else + direction = 1 ; + + for ( j = tag ; j >= 0 && j < size ; j += direction ) + if ( sites[j].pos == sites[tag].oppositePos && sites[j].oppositePos == sites[tag].pos ) + { + sites[j].support = -1 ; + break ; + } + } + } + + for ( i = 0 ; i < size ; ++i ) + { + if ( sites[i].support > 0 ) + { + sites[k] = sites[i] ; + ++k ; + } + } + sites.resize( k ) ; +} + +void FilterRepeatSplitSites( std::vector<struct _splitSite> &sites ) +{ + int i, j ; + int size = sites.size() ; + int k = 0 ; + for ( i = 0 ; i < size ; ) + { + for ( j = i + 1 ; j < size ; ++j ) + { + if ( sites[j].pos != sites[i].pos || sites[j].type != sites[i].type || sites[i].chrId != sites[j].chrId ) + break ; + } + int max = -1 ; + int maxtag = 0 ; + for ( k = i ; k < j ; ++k ) + { + if ( sites[k].uniqSupport > max ) + { + max = sites[k].uniqSupport ; + maxtag = k ; + } + else if ( sites[k].uniqSupport == max && sites[k].support > sites[maxtag].support ) + { + maxtag = k ; + } + } + + if ( max > -1 ) + { + if ( sites[maxtag].uniqSupport > sites[maxtag].support * 0.1 ) + { + for ( k = i ; k < j ; ++k ) + if ( sites[k].uniqSupport < 0.05 * sites[k].support ) + { + sites[k].support = -1 ; + + int direction ; + if ( sites[k].oppositePos < sites[k].pos ) + direction = -1 ; + else + direction = 1 ; + int l ; + for ( l = k ; l >= 0 && l < size ; l += direction ) + if ( sites[l].pos == sites[k].oppositePos && sites[l].oppositePos == sites[k].pos ) + { + sites[l].support = -1 ; + break ; + } + } + } + else + { + for ( k = i ; k < j ; ++k ) + { + if ( sites[k].support <= 10 ) + { + sites[k].support = -1 ; + + int direction ; + if ( sites[k].oppositePos < sites[k].pos ) + direction = -1 ; + else + direction = 1 ; + int l ; + for ( l = k ; l >= 0 && l < size ; l += direction ) + if ( sites[l].pos == 
sites[k].oppositePos && sites[l].oppositePos == sites[k].pos ) + { + sites[l].support = -1 ; + break ; + } + } + } + } + } + + i = j ; + } + + k = 0 ; + for ( i = 0 ; i < size ; ++i ) + { + if ( sites[i].support > 0 ) + { + sites[k] = sites[i] ; + ++k ; + } + } + sites.resize( k ) ; + +} + + +// When k=1, the gamma distribution becomes exponential distribution, and can be optimized analytically.. +// Maximize: sum_i z_i( log( 1/theta e^{-x_i / theta} ) +/*double ThetaOfExponentialDistribution( double *x, double *z, int n ) +{ + double sumZ = 0; + double sumZX = 0 ; + for ( i = 0 ; i < n ; ++i ) + { + sumZ += z[i] ; + sumZX += z[i] * x[i] ; + } +}*/ + +// for boundK, if it is positive, it represent the upper bound. If it is negative, -boundK will be the lower bound for k. +// if boundK==0, there is no extra bound. +// The same logic for boundProduct, which bounds k*theta +void GradientDescentGammaDistribution( double &k, double &theta, double initK, double initTheta, double lowerBoundK, double upperBoundK, + double lowerBoundMean, double upperBoundMean, double *x, double *z, int n ) +{ + int i ; + k = initK ; + theta = initTheta ; + double c = 0.5 ; + int iterCnt = 1 ; + + double sumZ = 0 ; + double sumZX = 0 ; + double sumZLogX = 0 ; + double Hessian[2][2] ; // 0 for k, 1 for theta + double inverseHessian[2][2] ; + int tmp ; + + double positiveKRecord = -5, positiveThetaRecord = -5 ; // record the value of k, theta when theta, k becomes non-positive. 
+ + for ( i = 0 ; i < n ; ++i ) + { + sumZ += z[i] ; + sumZX += z[i] * x[i] ; + sumZLogX += z[i] * log( x[i] ) ; + } + + while ( 1 ) + { + double gradK = 0 ; + double gradTheta = 0 ; + + double prevK = k ; + double prevTheta = theta ; + double digammaK = digammal( k ) ; + + gradK = sumZ * ( -log( theta ) - digammaK ) + sumZLogX ; + gradTheta = -sumZ * ( k / theta ) + sumZX / ( theta * theta ) ; + + Hessian[0][0] = -sumZ * trigamma( k, &tmp ) ; + Hessian[0][1] = -sumZ / theta ; // \partial l / ( \partial k \partial theta) + Hessian[1][0] = -sumZ / theta ; + Hessian[1][1] = sumZ * k / ( theta * theta ) - 2 * sumZX / ( theta * theta * theta ) ; + + double det = Hessian[0][0] * Hessian[1][1] - Hessian[0][1] * Hessian[1][0] ; + /*printf( "%s iter %d:\n", __func__, iterCnt ) ; + printf( "%lf %lf %lf %lf\n", sumZ, k, theta, sumZX ) ; + printf( "%lf %lf %lf\n", gradK, gradTheta, det ) ; + printf( "%lf %lf %lf %lf\n", Hessian[0][0], Hessian[0][1], Hessian[1][0], Hessian[1][1] ) ;*/ + if ( det <= 1e-4 && det >=-1e-4 ) + { + k = k + c / iterCnt * gradK ; + theta = theta + c / iterCnt * gradTheta ; + } + else + { + inverseHessian[0][0] = Hessian[1][1] / det ; + inverseHessian[0][1] = -Hessian[0][1] / det ; + inverseHessian[1][0] = -Hessian[1][0] / det ; + inverseHessian[1][1] = Hessian[0][0] / det ; + //printf( "%lf %lf %lf %lf: %lf\n=====\n", inverseHessian[0][0], inverseHessian[0][1], inverseHessian[1][0], inverseHessian[1][1], + // Hessian[1][0] * inverseHessian[0][1] + Hessian[1][1] * inverseHessian[1][1] ) ; + double step = 0.5 ; + k = k - step * ( inverseHessian[0][0] * gradK + inverseHessian[0][1] * gradTheta ) ; + theta = theta - step * ( inverseHessian[1][0] * gradK + inverseHessian[1][1] * gradTheta ) ; + + bool flag = false ; + if ( k <= 1e-6 ) + { + step = ( prevK - 1e-6 ) / ( inverseHessian[0][0] * gradK + inverseHessian[0][1] * gradTheta ) ; + + if ( ABS( theta - positiveThetaRecord ) < 1e-5 ) + flag = true ; + positiveThetaRecord = theta ; + } + if ( theta <= 
1e-6 ) + { + double tmp = ( prevTheta - 1e-6 ) / ( inverseHessian[1][0] * gradK + inverseHessian[1][1] * gradTheta ) ; + if ( tmp < step ) + step = tmp ; + + if ( ABS( k - positiveKRecord ) < 1e-5 ) + flag = true ; + positiveKRecord = k ; + } + + if ( step != 0.5 ) + { + k = prevK - step * ( inverseHessian[0][0] * gradK + inverseHessian[0][1] * gradTheta ) ; + theta = prevTheta - step * ( inverseHessian[1][0] * gradK + inverseHessian[1][1] * gradTheta ) ; + } + + /*if ( flag ) + { + k = prevK ; + theta = prevTheta ; + break ; + }*/ + } + + if ( upperBoundK > 0 && k > upperBoundK ) + k = upperBoundK ; + else if ( lowerBoundK > 0 && k < lowerBoundK ) + k = lowerBoundK ; + + if ( upperBoundMean > 0 && k * theta > upperBoundMean ) + { + theta = upperBoundMean / k ; + } + else if ( lowerBoundMean > 0 && k * theta < lowerBoundMean ) + { + theta = lowerBoundMean / k ; + } + + if ( k <= 1e-6 ) + { + k = 1e-6 ; + } + + if ( theta <= 1e-6 ) + { + theta = 1e-6 ; + } + + double diff = ABS( prevK - k ) + ABS( prevTheta - theta ) ; + if ( diff < 1e-5 ) + break ; + + ++iterCnt ; + //if ( det <= 1e-4 && det >=-1e-4 && k >= 5000 ) //&& diff < 1 ) + if ( k >= 10000 ) //&& diff < 1 ) + { + k = prevK ; + theta = prevTheta ; + break ; + } + if ( iterCnt == 1000 ) + break ; + } +} + +double MixtureGammaEM( double *x, int n, double &pi, double *k, double *theta, int tries, double meanBound[2], int iter = 1000 ) +{ + int i ; + double *z = new double[n] ; // the expectation that it assigned to model 0. 
+ double *oneMinusZ = new double[n] ; + int t = 0 ; + double history[5] = {-1, -1, -1, -1, -1} ; + double maxX = -1 ; + double sumX = 0 ; + if ( n <= 0 ) + return 0 ; + + for ( i = 0 ; i < n ; ++i ) + { + sumX += x[i] ; + if ( x[i] > maxX ) + maxX = x[i] ; + } + if ( maxX > meanBound[1] && meanBound[1] >= 0 ) + maxX = meanBound[1] ; + + /*if ( meanBound[1] == -1 ) + { + // The EM for coverage + maxX = 10.0 ; + }*/ + + while ( 1 ) + { + double npi, nk[2], ntheta[2] ; + double sum = 0 ; + for ( i = 0 ; i < n ; ++i ) + { + //double lf0 = -k[0] * log( theta[0] ) + ( k[0] - 1 ) * log( cov[i]) - cov[i] / theta[0] - lgamma( k[0] ); + //double lf1 = -k[1] * log( theta[1] ) + ( k[1] - 1 ) * log( cov[i]) - cov[i] / theta[1] - lgamma( k[1] ); + //z[i] = exp( lf0 + log( pi ) ) / ( exp( lf0 + log( pi ) ) + exp( lf1 + log( 1 - pi ) ) ) ; + if ( pi != 0 ) + z[i] = MixtureGammaAssignment( x[i], pi, k, theta ) ; + else + z[i] = 0 ; + /*if ( isnan( z[i] ) ) + { + printf( "nan: %lf %lf %lf %lf\n", x[i], pi, k, theta ) ; + }*/ + oneMinusZ[i] = 1 - z[i] ; + sum += z[i] ; + } + + // compute new pi. + npi = sum / n ; + + // Use gradient descent to compute new k and theta. + if ( 1 ) //pi > 0 ) + { + double bound ; + if ( meanBound[1] != -1 ) // the EM for ratio + { + bound = ( theta[1] * k[1] > 1 ) ? 1 : ( theta[1] * k[1] ) / ( 1 + tries ); + GradientDescentGammaDistribution( nk[0], ntheta[0], k[0], theta[0], k[1], -1, -1, bound, x, z, n ) ; // It seems setting an upper bound 1 for k[0] is not a good idea. + } + else + { + bound = ( theta[1] * k[1] > 1 ) ? 1 : ( theta[1] * k[1] ) / ( 1 + tries ) ; + GradientDescentGammaDistribution( nk[0], ntheta[0], k[0], theta[0], k[1], -1, meanBound[0], bound, x, z, n ) ; // It seems setting an upper bound 1 for k[0] is not a good idea. 
+ } + GradientDescentGammaDistribution( nk[1], ntheta[1], k[1], theta[1], -1, k[0], theta[0] * k[0], maxX, x, oneMinusZ, n ) ; + } + else + { + GradientDescentGammaDistribution( nk[1], ntheta[1], k[1], theta[1], 0, 0, 0, 0, x, oneMinusZ, n ) ; + } + + double diff ; + if ( isnan( npi ) || isnan( nk[0] ) || isnan( nk[1] ) || isnan( ntheta[0] ) || isnan( ntheta[1] ) ) + { + delete[] z ; + delete[] oneMinusZ ; + return -1 ; + } + diff = ABS( nk[0] - k[0] ) + ABS( nk[1] - k[1] ) + + ABS( ntheta[0] - theta[0] ) + ABS( ntheta[1] - theta[1] ) ; // pi is fully determined by these 4 parameters. + if ( diff < 1e-4 ) + break ; + diff = ABS( nk[0] - history[1] ) + ABS( nk[1] - history[2] ) + + ABS( ntheta[0] - history[3] ) + ABS( ntheta[1] - history[4] ) ; // pi is fully determined by these 4 parameters. + if ( diff < 1e-4 ) + break ; + + history[0] = pi ; + history[1] = k[0] ; + history[2] = k[1] ; + history[3] = theta[0] ; + history[4] = theta[1] ; + + pi = npi ; + k[0] = nk[0] ; + k[1] = nk[1] ; + theta[0] = ntheta[0] ; + theta[1] = ntheta[1] ; + + /*double logLikelihood = 0 ; + for ( i = 0 ; i < n ; ++i ) + logLikelihood += log( pi * exp( LogGammaDensity( x[i], k[0], theta[0]) ) + + (1 - pi ) * exp( LogGammaDensity( x[i], k[1], theta[1] ) ) ) ;*/ + + //printf( "%d: %lf %lf %lf %lf %lf\n", t, pi, k[0], theta[0], k[1], theta[1] ) ; + + ++t ; + if ( iter != -1 && t >= iter ) + break ; + } + delete[] z ; + delete[] oneMinusZ ; + return 0 ; +} + +bool IsParametersTheSame( double *k, double *theta ) +{ + if ( ABS( k[0] - k[1] ) < 1e-2 && ABS( theta[0] - theta[1] ) < 1e-2 ) + return true ; + return false ; +} + +int RatioAndCovEM( double *covRatio, double *cov, int n, double &piRatio, double kRatio[2], + double thetaRatio[2], double &piCov, double kCov[2], double thetaCov[2] ) +{ + int i ; + piRatio = 0.6 ; // mixture coefficient for model 0 and 1 + kRatio[0] = 0.9 ; + kRatio[1] = 0.45 ; + thetaRatio[0] = 0.05 ; + thetaRatio[1] = 1 ; + double meanBound[2] = {-1, 1} ; // [0] is for 
the lower bound of the noise model, [1] is for the upper bound of the true model + + /*double *filteredCovRatio = new double[n] ;// ignore the ratio that is greater than 5. + int m = 0 ; + for ( i = 0 ; i < n ; ++i ) + if ( covRatio[i] < 1.0 ) + { + filteredCovRatio[m] = covRatio[i] ; + ++m ; + }*/ + srand( 17 ) ; + int maxTries = 10 ; + int t = 0 ; + double *buffer = new double[n] ; + for ( i = 0 ; i < n ; ++i ) + buffer[i] = covRatio[i] ; + qsort( buffer, n, sizeof( double ), CompDouble ) ; + //covRatio = buffer ; + while ( 1 ) + { + //printf( "EM\n" ) ; + MixtureGammaEM( covRatio, n, piRatio, kRatio, thetaRatio, t, meanBound ) ; + //printf( "%lf %lf %lf %lf %lf\n", piRatio, kRatio[0], kRatio[1], thetaRatio[0], thetaRatio[1] ) ; + if ( piRatio > 0.999 || piRatio < 0.001 || IsParametersTheSame( kRatio, thetaRatio ) ) + { + ++t ; + if ( t > maxTries ) + break ; + piRatio = 0.6 ; + kRatio[0] += ( ( rand() * 0.5 - RAND_MAX ) / (double)RAND_MAX * 0.1 ) ; + if ( kRatio[0] <= 0 ) + kRatio[0] = 0.9 ; + kRatio[1] += ( ( rand() * 0.5 - RAND_MAX ) / (double)RAND_MAX * 0.1 ) ; + if ( kRatio[1] <= 0 ) + kRatio[1] = 0.45 ; + thetaRatio[0] += ( ( rand() * 0.5 - RAND_MAX ) / (double)RAND_MAX * 0.1 ) ; + if ( thetaRatio[0] <= 0 ) + thetaRatio[0] = 0.05 ; + thetaRatio[1] += ( ( rand() * 0.5 - RAND_MAX ) / (double)RAND_MAX * 0.1 ) ; + if ( thetaRatio[1] <= 0 ) + thetaRatio[1] = 1 ; + if ( kRatio[0] < kRatio[1] ) + { + if ( rand() & 1 ) + kRatio[0] = kRatio[1] ; + else + kRatio[1] = kRatio[0] ; + } + if ( kRatio[0] * thetaRatio[0] > kRatio[1] * thetaRatio[1] ) + { + thetaRatio[0] = kRatio[1] * thetaRatio[1] / kRatio[0] ; + } + //printf( "%lf %lf %lf %lf %lf\n", piRatio, kRatio[0], kRatio[1], thetaRatio[0], thetaRatio[1] ) ; + + continue ; + } + + break ; + } + //delete[] filteredCovRatio ; + if ( t > maxTries && piRatio > 0.999 ) + { + /*piRatio = 0.6 ; // mixture coefficient for model 0 and 1 + kRatio[0] = 0.9 ; + kRatio[1] = 0.45 ; + thetaRatio[0] = 0.05 ; + thetaRatio[1] = 1 ;*/ 
+ piRatio = 0.999 ; + } + if ( IsParametersTheSame( kRatio, thetaRatio ) || piRatio <= 1e-3 ) + piRatio = 1e-3 ; + + piCov = piRatio ; // mixture coefficient for model 0 and 1 + kCov[0] = 0.9 ; + kCov[1] = 0.45 ; + thetaCov[0] = 3 ; + thetaCov[1] = 12 ; + + // only do one iteration of EM, so that pi does not change? + // But it seems it still better to run full EM. + meanBound[0] = 1.01 ; + meanBound[1] = -1 ; + + //printf( "for coverage:\n" ) ; + //piCov = 0.001000 ; + MixtureGammaEM( cov, n, piCov, kCov, thetaCov, 0, meanBound ) ; + //printf( "for coverage done\n" ) ; + piCov = piRatio ; + + delete []buffer ; + + return 0 ; +} + +double GetPValue( double x, double *k, double *theta ) +{ + int fault ; + double p ; + p = 1 - gammad( x / theta[0], k[0], &fault ) ; + return p ; +} + +// if x's value is less than the average of (k0-1)*theta0, then we force x=(k0-1)*theta0, +// the mode of the model 0. Of course, it does not affect when k0<=1 already. +double MixtureGammaAssignmentAdjust( double x, double pi, double* k, double *theta ) +{ + if ( x < ( k[0] - 1 ) * theta[0] ) + { + x = ( k[0] - 1 ) * theta[0] ; + } + return MixtureGammaAssignment( x, pi, k, theta ) ; +} + + +// Transform the cov number for better fitting +double TransformCov( double c ) +{ + double ret ; + // original it is c-1. + // Use -2 instead of -1 is that many region covered to 1 reads will be filtered when + // build the subexons. 
+ // + //ret = sqrt( c ) - 1 ; + if ( c <= 2 + 1e-6 ) + ret = 1e-6 ; + else + ret = c - 2 ; + return ret ; + //return log( c ) / log( 2.0 ) ; +} + +int main( int argc, char *argv[] ) +{ + int i, j ; + bool noStats = false ; + if ( argc < 3 ) + { + fprintf( stderr, usage ) ; + exit( 1 ) ; + } + + gMinDepth = 2 ; + + for ( i = 3 ; i < argc ; ++i ) + { + if ( !strcmp( argv[i], "--noStats" ) ) + { + noStats = true ; + continue ; + } + else if ( !strcmp( argv[i], "--minDepth" ) ) + { + gMinDepth = atoi( argv[i + 1] ) ; + ++i ; + continue ; + } + else + { + fprintf( stderr, "Unknown argument: %s\n", argv[i] ) ; + return 0 ; + } + } + + Alignments alignments ; + alignments.Open( argv[1] ) ; + std::vector<struct _splitSite> splitSites ; // only compromised the + std::vector<struct _splitSite> allSplitSites ; + + // read in the splice site + FILE *fp ; + fp = fopen( argv[2], "r" ) ; + char chrom[50] ; + int64_t start, end ; + int support ; + char strand[3] ; + int uniqSupport, secondarySupport, uniqEditDistance, secondaryEditDistance ; + while ( fscanf( fp, "%s %" PRId64 " %" PRId64 " %d %s %d %d %d %d", chrom, &start, &end, &support, strand, + &uniqSupport, &secondarySupport, &uniqEditDistance, &secondaryEditDistance ) != EOF ) + { + if ( support <= 0 ) + continue ; + //if ( !( uniqSupport >= 1 + // || secondarySupport > 10 ) ) + //if ( uniqSupport <= 0.01 * ( uniqSupport + secondarySupport ) || ( uniqSupport == 0 && secondarySupport < 20 ) ) + //if ( uniqSupport == 0 && secondarySupport <= 10 ) + // continue ; + int chrId = alignments.GetChromIdFromName( chrom ) ; + struct _splitSite ss ; + --start ; + --end ; + ss.pos = start ; + ss.chrId = chrId ; + ss.type = 2 ; + ss.oppositePos = end ; + ss.strand = strand[0] ; + ss.support = support ; + ss.uniqSupport = uniqSupport ; + ss.mismatchSum = uniqEditDistance + secondaryEditDistance ; + splitSites.push_back( ss ) ; + + ss.pos = end ; + ss.type = 1 ; + ss.oppositePos = start ; + ss.strand = strand[0] ; + ss.support = support 
; + ss.uniqSupport = uniqSupport ; + ss.mismatchSum = uniqEditDistance + secondaryEditDistance ; + splitSites.push_back( ss ) ; + } + fclose( fp ) ; + //printf( "ss:%d\n", splitSites.size() ) ; + + //printf( "ss:%d\n", splitSites.size() ) ; + + alignments.GetGeneralInfo( true ) ; + // Build the blocks + Blocks regions ; + alignments.Rewind() ; + regions.BuildExonBlocks( alignments ) ; + //printf( "%d\n", regions.exonBlocks.size() ) ; + + FilterAndSortSplitSites( splitSites ) ; + FilterNearSplitSites( splitSites ) ; + FilterRepeatSplitSites( splitSites ) ; + regions.FilterSplitSitesInRegions( splitSites ) ; + regions.FilterGeneMergeSplitSites( splitSites ) ; + + + allSplitSites = splitSites ; + KeepUniqSplitSites( splitSites ) ; + + //for ( i = 0 ; i < splitSites.size() ; ++i ) + // printf( "%d %d\n", splitSites[i].pos + 1, splitSites[i].oppositePos + 1 ) ; + // Split the blocks using split site + regions.SplitBlocks( alignments, splitSites ) ; + //printf( "%d\n", regions.exonBlocks.size() ) ; + /*for ( i = 0 ; i < regions.exonBlocks.size() ; ++i ) + { + struct _block &e = regions.exonBlocks[i] ; + printf( "%s %" PRId64 " %" PRId64 " %d %d\n", alignments.GetChromName( e.chrId ), e.start + 1, e.end + 1, e.leftType, e.rightType ) ; + } + return 0 ;*/ + // Recompute the coverage for each block. + alignments.Rewind() ; + //printf( "Before computeDepth: %d\n", regions.exonBlocks.size() ) ; + + regions.ComputeDepth( alignments ) ; + //printf( "After computeDepth: %d\n", regions.exonBlocks.size() ) ; + + // Merge blocks that may have a hollow coverage by accident. + regions.MergeNearBlocks() ; + //printf( "After merge: %d\n", regions.exonBlocks.size() ) ; + + // Put the intron informations + regions.AddIntronInformation( allSplitSites, alignments ) ; + //printf( "After add information.\n" ) ; + + // Compute the average ratio against the left and right connected subexons. 
+ regions.ComputeRatios() ; + //printf( "After compute ratios.\n" ) ; + + //printf( "Finish building regions.\n" ) ; + if ( noStats ) + { + // just output the subexons. + if ( realpath( argv[1], buffer ) == NULL ) + { + strcpy( buffer, argv[1] ) ; + } + printf( "#%s\n", buffer ) ; + printf( "#fitted_ir_parameter_ratio: pi: -1 k0: -1 theta0: -1 k1: -1 theta1: -1\n" ) ; + printf( "#fitted_ir_parameter_cov: pi: -1 k0: -1 theta0: -1 k1: -1 theta1: -1\n" ) ; + + int blockCnt = regions.exonBlocks.size() ; + for ( int i = 0 ; i < blockCnt ; ++i ) + { + struct _block &e = regions.exonBlocks[i] ; + double avgDepth = (double)e.depthSum / ( e.end - e.start + 1 ) ; + printf( "%s %" PRId64 " %" PRId64 " %d %d %lf -1 -1 -1 -1 ", alignments.GetChromName( e.chrId ), e.start + 1, e.end + 1, e.leftType, e.rightType, avgDepth ) ; + int prevCnt = e.prevCnt ; + if ( i > 0 && e.start == regions.exonBlocks[i - 1].end + 1 && + e.leftType == regions.exonBlocks[i - 1].rightType ) + { + printf( "%d ", prevCnt + 1 ) ; + for ( j = 0 ; j < prevCnt ; ++j ) + printf( "%" PRId64 " ", regions.exonBlocks[ e.prev[j] ].end + 1 ) ; + printf( "%" PRId64 " ", regions.exonBlocks[i - 1].end + 1 ) ; + } + else + { + printf( "%d ", prevCnt ) ; + for ( j = 0 ; j < prevCnt ; ++j ) + printf( "%" PRId64 " ", regions.exonBlocks[ e.prev[j] ].end + 1 ) ; + } + + int nextCnt = e.nextCnt ; + if ( i < blockCnt - 1 && e.end == regions.exonBlocks[i + 1].start - 1 && + e.rightType == regions.exonBlocks[i + 1].leftType ) + { + printf( "%d %" PRId64 " ", nextCnt + 1, regions.exonBlocks[i + 1].start + 1 ) ; + } + else + printf( "%d ", nextCnt ) ; + for ( j = 0 ; j < nextCnt ; ++j ) + printf( "%" PRId64 " ", regions.exonBlocks[ e.next[j] ].start + 1 ) ; + printf( "\n" ) ; + + } + return 0 ; + } + + // Extract the blocks for different events. + int blockCnt = regions.exonBlocks.size() ; + std::vector<struct _block> irBlocks ; // The regions corresponds to intron retention events. 
+ double *leftClassifier = new double[ blockCnt ] ; + double *rightClassifier = new double[ blockCnt ] ; + std::vector<struct _block> overhangBlocks ; //blocks like (...[ or ]...) + std::vector<struct _block> islandBlocks ; // blocks like (....) + + for ( i = 0 ; i < blockCnt ; ++i ) + { + int ltype = regions.exonBlocks[i].leftType ; + int rtype = regions.exonBlocks[i].rightType ; + leftClassifier[i] = -1 ; + rightClassifier[i] = -1 ; + + //double avgDepth = (double)regions.exonBlocks[i].depthSum / ( regions.exonBlocks[i].end - regions.exonBlocks[i].start + 1 ) ; + + if ( ltype == 2 && rtype == 1 ) + { + // candidate intron retention. + // Note that when I compute the ratio, it is already made sure that the avgDepth>1. + double ratio = regions.PickLeftAndRightRatio( regions.exonBlocks[i] ) ; + + //printf( "%lf %lf\n", regions.exonBlocks[i].leftRatio, regions.exonBlocks[i].rightRatio ) ; + if ( ratio > 0 ) + { + regions.exonBlocks[i].ratio = ratio ; + irBlocks.push_back( regions.exonBlocks[i] ) ; + irBlocks[ irBlocks.size() - 1 ].contigId = i ; + } + } + else if ( ( ltype == 0 && rtype == 1 ) || ( ltype == 2 && rtype == 0 ) ) + { + // subexons like (...[ or ]...) + double ratio ; + if ( ltype == 0 ) + { + ratio = regions.exonBlocks[i].rightRatio ; + } + else if ( ltype == 2 ) + { + ratio = regions.exonBlocks[i].leftRatio ; + } + if ( ratio > 0 ) + { + regions.exonBlocks[i].ratio = ratio ; + overhangBlocks.push_back( regions.exonBlocks[i] ) ; + overhangBlocks[ overhangBlocks.size() - 1].contigId = i ; + } + } + else if ( ltype == 0 && rtype == 0 ) + { + islandBlocks.push_back( regions.exonBlocks[i] ) ; + islandBlocks[ islandBlocks.size() - 1].contigId = i ; + } + } + + // Compute the histogram for each intron. 
+ int irBlockCnt = irBlocks.size() ; + double *cov = new double[irBlockCnt] ; + double *covRatio = new double[ irBlockCnt ] ; + double piRatio, kRatio[2], thetaRatio[2] ; + double piCov, kCov[2], thetaCov[2] ; + for ( i = 0 ; i < irBlockCnt ; ++i ) + { + double avgDepth = regions.GetAvgDepth( irBlocks[i] ) ; + //cov[i] = ( avgDepth - 1 ) / ( flankingAvg - 1 ) ; + cov[i] = TransformCov( avgDepth ) ; + + covRatio[i] = regions.PickLeftAndRightRatio( irBlocks[i] ) ; + //cov[i] = avgDepth / anchorAvg ; + //printf( "%"PRId64" %d %d: %lf %lf\n", irBlocks[i].depthSum, irBlocks[i].start, irBlocks[i].end, avgDepth, cov[i] ) ; + } + + /*fp = fopen( "ratio.out", "r" ) ; + int irBlockCnt = 0 ; + double *cov = new double[10000] ; + while ( 1 ) + { + double r ; + if ( fscanf( fp, "%lf", &r ) == EOF ) + break ; + cov[ irBlockCnt ] = r ; + ++irBlockCnt ; + } + fclose( fp ) ;*/ + RatioAndCovEM( covRatio, cov, irBlockCnt, piRatio, kRatio, thetaRatio, piCov, kCov, thetaCov ) ; + + for ( i = 0 ; i < irBlockCnt ; ++i ) + { + //double lf0 = -k[0] * log( theta[0] ) + ( k[0] - 1 ) * log( cov[i]) - cov[i] / theta[0] - lgamma( k[0] ) ; + //double lf1 = -k[1] * log( theta[1] ) + ( k[1] - 1 ) * log( cov[i]) - cov[i] / theta[1] - lgamma( k[1] ) ; + + double p1, p2, p ; + + p1 = MixtureGammaAssignmentAdjust( covRatio[i], piRatio, kRatio, thetaRatio ) ; + p2 = MixtureGammaAssignmentAdjust( cov[i], piCov, kCov, thetaCov ) ; + + /*p1 = GetPValue( covRatio[i], kRatio, thetaRatio ) ; //1 - gammad( covRatio[i] / thetaRatio[0], kRatio[0], &fault ) ; + if ( piRatio != 0 ) + p2 = GetPValue( cov[i], kCov, thetaCov ) ;//1 - gammad( cov[i] / thetaCov[0], kCov[0], &fault ) ; + else + p2 = p1 ;*/ + //printf( "%lf %lf: %lf %lf\n", covRatio[i], cov[i], p1, p2 ) ; + p = p1 > p2 ? 
p1 : p2 ; + + + //printf( "%d %d: avg: %lf ratio: %lf p: %lf\n", irBlocks[i].start, irBlocks[i].end, irBlocks[i].depthSum / (double)( irBlocks[i].end - irBlocks[i].start + 1 ), covRatio[i], + // p ) ; + leftClassifier[ irBlocks[i].contigId ] = p ; + rightClassifier[ irBlocks[i].contigId ] = p ; + } + + // Process the classifier for overhang subexons and the subexons to see whether we need soft boundary + int overhangBlockCnt = overhangBlocks.size() ; + delete []cov ; + delete []covRatio ; + + cov = new double[ overhangBlockCnt ] ; + covRatio = new double[overhangBlockCnt] ; + double overhangPiRatio, overhangKRatio[2], overhangThetaRatio[2] ; + double overhangPiCov, overhangKCov[2], overhangThetaCov[2] ; + + for ( i = 0 ; i < overhangBlockCnt ; ++i ) + { + covRatio[i] = overhangBlocks[i].ratio ; + cov[i] = TransformCov( regions.GetAvgDepth( overhangBlocks[i] ) ) ; + } + RatioAndCovEM( covRatio, cov, overhangBlockCnt, overhangPiRatio, overhangKRatio, overhangThetaRatio, overhangPiCov, overhangKCov, overhangThetaCov ) ; + + for ( i = 0 ; i < overhangBlockCnt ; ++i ) + { + //double lf0 = -k[0] * log( theta[0] ) + ( k[0] - 1 ) * log( cov[i]) - cov[i] / theta[0] - lgamma( k[0] ) ; + //double lf1 = -k[1] * log( theta[1] ) + ( k[1] - 1 ) * log( cov[i]) - cov[i] / theta[1] - lgamma( k[1] ) ; + + double p1, p2, p ; + p1 = MixtureGammaAssignmentAdjust( covRatio[i], overhangPiRatio, overhangKRatio, overhangThetaRatio ) ; + p2 = MixtureGammaAssignmentAdjust( cov[i], overhangPiCov, overhangKCov, overhangThetaCov ) ; + + /*p1 = GetPValue( covRatio[i], overhangKRatio, overhangThetaRatio ) ; //1 - gammad( covRatio[i] / thetaRatio[0], kRatio[0], &fault ) ; + if ( overhangPiRatio != 0) + p2 = GetPValue( cov[i], overhangKCov, overhangThetaCov ) ;//1 - gammad( cov[i] / thetaCov[0], kCov[0], &fault ) ; + else + p2 = p1 ;*/ + + //p = p1 < p2 ? 
p1 : p2 ; + p = sqrt( p1 * p2 ) ; + + int idx = overhangBlocks[i].contigId ; + if ( regions.exonBlocks[idx].rightType == 0 ) + leftClassifier[ idx ] = rightClassifier[ idx ] = p ; + else + leftClassifier[ idx ] = rightClassifier[ idx ] = p ; + } + + for ( i = 0 ; i < blockCnt ; ++i ) + { + struct _block &e = regions.exonBlocks[i] ; + //if ( ( e.leftType == 0 && e.rightType == 1 ) || + // variance-stabailizing transformation of poisson distribution. But we are more conservative here. + // The multiply 2 before that is because we ignore the region below 0, so we need to somehow renormalize the distribution. + if ( e.leftType == 1 ) + { + if ( e.leftRatio >= 0 ) + leftClassifier[i] = 2 * alnorm( e.leftRatio * 2.0 , true ) ; + else + leftClassifier[i] = 1 ; + } + /*else if ( e.leftType == 0 ) + { + // If this region is a start of a gene, the other sample might introduce + // a new intron before it. So we want to test whether this region can still + // be a start of a gene even there is an intron before it. + for ( j = i + 1 ; j < blockCnt ; ++j ) + { + if ( regions.exonBlocks[j].contigId != regions.exonBlocks[i].contigId ) + break ; + } + + for ( k = i ; k < j ; ++k ) + if ( regions.exonBlocks[j].leftType == 1 ) + break ; + if ( k >= j ) + { + leftClassifier[i] = alnorm( ) + } + }*/ + + //if ( ( e.rightType == 0 && e.leftType == 2 ) || + if ( e.rightType == 2 ) + { + if ( e.rightRatio >= 0 ) + rightClassifier[i] = 2 * alnorm( e.rightRatio * 2.0, true ) ; + else + rightClassifier[i] = 1 ; + } + } + + // Process the result for subexons seems like single-exon transcript (...) 
+ int islandBlockCnt = islandBlocks.size() ; + //std::sort( islandBlocks.begin(), islandBlocks.end(), CompBlocksByAvgDepth ) ; + for ( i = 0, j = 0 ; i < islandBlockCnt ; ++i ) + { + /*if ( regions.GetAvgDepth( islandBlocks[i] ) != regions.GetAvgDepth( islandBlocks[j] ) ) + j = i ; + leftClassifier[ islandBlocks[i].contigId ] = 1 - (j + 1) / (double)( islandBlockCnt + 1 ) ; + rightClassifier[ islandBlocks[i].contigId ] = 1 - (j + 1) / (double)( islandBlockCnt + 1 ) ;*/ + double p = GetPValue( TransformCov( regions.GetAvgDepth( islandBlocks[i] ) ), kCov, thetaCov ) ; + leftClassifier[ islandBlocks[i].contigId ] = p ; + rightClassifier[ islandBlocks[i].contigId ] = p ; + } + + + // Output the result. + if ( realpath( argv[1], buffer ) == NULL ) + { + strcpy( buffer, argv[1] ) ; + } + printf( "#%s\n", buffer ) ; + // TODO: higher precision. + printf( "#fitted_ir_parameter_ratio: pi: %lf k0: %lf theta0: %lf k1: %lf theta1: %lf\n", piRatio, kRatio[0], thetaRatio[0], kRatio[1], thetaRatio[1] ) ; + printf( "#fitted_ir_parameter_cov: pi: %lf k0: %lf theta0: %lf k1: %lf theta1: %lf\n", piCov, kCov[0], thetaCov[0], kCov[1], thetaCov[1] ) ; + + printf( "#fitted_overhang_parameter_ratio: pi: %lf k0: %lf theta0: %lf k1: %lf theta1: %lf\n", overhangPiRatio, overhangKRatio[0], overhangThetaRatio[0], overhangKRatio[1], overhangThetaRatio[1] ) ; + printf( "#fitted_overhang_parameter_cov: pi: %lf k0: %lf theta0: %lf k1: %lf theta1: %lf\n", overhangPiCov, overhangKCov[0], overhangThetaCov[0], overhangKCov[1], overhangThetaCov[1] ) ; + + + for ( int i = 0 ; i < blockCnt ; ++i ) + { + struct _block &e = regions.exonBlocks[i] ; + double avgDepth = regions.GetAvgDepth( e ) ; + printf( "%s %" PRId64 " %" PRId64 " %d %d %c %c %lf %lf %lf %lf %lf ", alignments.GetChromName( e.chrId ), e.start + 1, e.end + 1, e.leftType, e.rightType, + e.leftStrand, e.rightStrand, avgDepth, + e.leftRatio, e.rightRatio, leftClassifier[i], rightClassifier[i] ) ; + int prevCnt = e.prevCnt ; + if ( i > 0 && 
e.start == regions.exonBlocks[i - 1].end + 1 ) + //&& e.leftType == regions.exonBlocks[i - 1].rightType ) + { + printf( "%d ", prevCnt + 1 ) ; + for ( j = 0 ; j < prevCnt ; ++j ) + printf( "%" PRId64 " ", regions.exonBlocks[ e.prev[j] ].end + 1 ) ; + printf( "%" PRId64 " ", regions.exonBlocks[i - 1].end + 1 ) ; + } + else + { + printf( "%d ", prevCnt ) ; + for ( j = 0 ; j < prevCnt ; ++j ) + printf( "%" PRId64 " ", regions.exonBlocks[ e.prev[j] ].end + 1 ) ; + } + + int nextCnt = e.nextCnt ; + if ( i < blockCnt - 1 && e.end == regions.exonBlocks[i + 1].start - 1 ) + //&& e.rightType == regions.exonBlocks[i + 1].leftType ) + { + printf( "%d %" PRId64 " ", nextCnt + 1, regions.exonBlocks[i + 1].start + 1 ) ; + } + else + printf( "%d ", nextCnt ) ; + for ( j = 0 ; j < nextCnt ; ++j ) + printf( "%" PRId64 " ", regions.exonBlocks[ e.next[j] ].start + 1 ) ; + printf( "\n" ) ; + } + + delete[] cov ; + delete[] covRatio ; + delete[] leftClassifier ; + delete[] rightClassifier ; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/TranscriptDecider.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,3492 @@ +#include "TranscriptDecider.hpp" + +void TranscriptDecider::OutputTranscript( int sampleId, struct _subexon *subexons, struct _transcript &transcript ) +{ + int i, j ; + // determine the strand + std::vector<int> subexonInd ; + transcript.seVector.GetOnesIndices( subexonInd ) ; + + // Determine the strand + char strand[2] = "." ; + int size = subexonInd.size() ; + if ( size > 1 ) + { + // locate the intron showed up in this transcript. + for ( i = 0 ; i < size - 1 ; ++i ) + { + /*int nextCnt = subexons[ subexonInd[i] ].nextCnt ; + if ( nextCnt == 0 ) + continue ; + + for ( j = 0 ; j < nextCnt ; ++j ) + { + int a = subexons[ subexonInd[i] ].next[j] ; + if ( subexonInd[i + 1] == a + && subexons[ subexonInd[i] ].end + 1 < subexons[a].start ) // avoid the case like ..(...[... + { + break ; + } + } + if ( j < nextCnt )*/ + + if ( subexons[ subexonInd[i] ].end + 1 < subexons[ subexonInd[i + 1] ].start ) + { + if ( subexons[ subexonInd[i] ].rightStrand == 1 ) + strand[0] = '+' ; + else if ( subexons[ subexonInd[i] ].rightStrand == -1 ) + strand[0] = '-' ; + break ; + } + } + } + + // TODO: transcript_id + char *chrom = alignments.GetChromName( subexons[0].chrId ) ; + char prefix[10] = "" ; + struct _subexon *catSubexons = new struct _subexon[ size + 1 ] ; + // Concatenate adjacent subexons + catSubexons[0] = subexons[ subexonInd[0] ] ; + j = 1 ; + for ( i = 1 ; i < size ; ++i ) + { + if ( subexons[ subexonInd[i] ].start == catSubexons[j - 1].end + 1 ) + { + catSubexons[j - 1].end = subexons[ subexonInd[i] ].end ; + } + else + { + catSubexons[j] = subexons[ subexonInd[i] ] ; + ++j ; + } + } + size = j ; + + int gid = GetTranscriptGeneId( subexonInd, subexons ) ; + if ( 0 ) //numThreads <= 1 ) + { + fprintf( outputFPs[sampleId], "%s\tCLASSES\ttranscript\t%d\t%d\t1000\t%s\t.\tgene_id \"%s%s.%d\"; transcript_id \"%s%s.%d.%d\"; Abundance 
\"%.6lf\";\n", + chrom, catSubexons[0].start + 1, catSubexons[size - 1].end + 1, strand, + prefix, chrom, gid, + prefix, chrom, gid, transcriptId[ gid - baseGeneId ], transcript.FPKM ) ; + for ( i = 0 ; i < size ; ++i ) + { + fprintf( outputFPs[ sampleId ], "%s\tCLASSES\texon\t%d\t%d\t1000\t%s\t.\tgene_id \"%s%s.%d\"; " + "transcript_id \"%s%s.%d.%d\"; exon_number \"%d\"; Abundance \"%.6lf\"\n", + chrom, catSubexons[i].start + 1, catSubexons[i].end + 1, strand, + prefix, chrom, gid, + prefix, chrom, gid, transcriptId[ gid - baseGeneId ], + i + 1, transcript.FPKM ) ; + } + } + else + { + struct _outputTranscript t ; + int len = 0 ; + t.chrId = subexons[0].chrId ; + t.geneId = gid ; + t.transcriptId = transcriptId[ gid - baseGeneId ] ; + t.FPKM = transcript.FPKM ; + t.sampleId = sampleId ; + t.exons = new struct _pair32[size] ; + for ( i = 0 ; i < size ; ++i ) + { + t.exons[i].a = catSubexons[i].start + 1 ; + t.exons[i].b = catSubexons[i].end + 1 ; + len += t.exons[i].b - t.exons[i].a + 1 ; + } + t.cov = transcript.abundance * alignments.readLen / len ; + t.ecnt = size ; + t.strand = strand[0] ; + //printf( "%lf\n", transcript.correlationScore ) ; + + if ( numThreads > 1 ) + outputHandler->Add( t ) ; + else + outputHandler->Add_SingleThread( t ) ; + } + ++transcriptId[ gid - baseGeneId ] ; + + delete[] catSubexons ; +} + +int TranscriptDecider::GetFather( int f, int *father ) +{ + if ( father[f] != f ) + return father[f] = GetFather( father[f], father ) ; + return f ; +} + +int TranscriptDecider::GetTranscriptGeneId( std::vector<int> &subexonInd, struct _subexon *subexons ) +{ + int i ; + int size = subexonInd.size() ; + + for ( i = 0 ; i < size ; ++i ) + if ( subexons[ subexonInd[i] ].geneId != -2 ) + return subexons[ subexonInd[i] ].geneId ; + + // Some extreme case, where all the regions are mixture regions. 
+ for ( i = 0 ; i < size - 1 ; ++i ) + if ( subexons[ subexonInd[i] ].end + 1 < subexons[ subexonInd[i + 1] ].start ) + { + return defaultGeneId[ ( subexons[ subexonInd[i] ].rightStrand + 1 ) / 2 ] ; + } + return defaultGeneId[0] ; +} + +int TranscriptDecider::GetTranscriptGeneId( struct _transcript &t, struct _subexon *subexons ) +{ + if ( subexons[ t.first ].geneId != -2 ) + return subexons[ t.first ].geneId ; + if ( subexons[ t.last ].geneId != -2 ) + return subexons[ t.last ].geneId ; + std::vector<int> subexonInd ; + t.seVector.GetOnesIndices( subexonInd ) ; + return GetTranscriptGeneId( subexonInd, subexons ) ; +} + +void TranscriptDecider::InitTranscriptId() +{ + int i ; + for ( i = 0 ; i < usedGeneId - baseGeneId ; ++i ) + transcriptId[i] = 0 ; +} + +bool TranscriptDecider::IsStartOfMixtureStrandRegion( int tag, struct _subexon *subexons, int seCnt ) +{ + int j, k ; + int leftStrandCnt[2] = {0, 0}, rightStrandCnt[2] = {0, 0}; + for ( j = tag + 1 ; j < seCnt ; ++j ) + if ( subexons[j].start > subexons[j - 1].end + 1 ) + break ; + + for ( k = tag ; k < j ; ++k ) + { + if ( subexons[k].leftStrand != 0 ) + ++leftStrandCnt[ ( subexons[k].leftStrand + 1 ) / 2 ] ; + if ( subexons[k].rightStrand != 0 ) + ++rightStrandCnt[ ( subexons[k].rightStrand + 1 ) / 2 ] ; + } + + if ( rightStrandCnt[0] > 0 && leftStrandCnt[0] == 0 && leftStrandCnt[1] > 0 ) + return true ; + if ( rightStrandCnt[1] > 0 && leftStrandCnt[1] == 0 && leftStrandCnt[0] > 0 ) + return true ; + return false ; +} + +// Return 0 - uncompatible or does not overlap at all. 1 - fully compatible. 2 - Head of the constraints compatible with the tail of the transcript +// the partial compatible case (return 2) mostly likely happen in DP where we have partial transcript. 
+int TranscriptDecider::IsConstraintInTranscript( struct _transcript transcript, struct _constraint &c ) +{ + //printf( "%d %d, %d %d\n", c.first, c.last, transcript.first, transcript.last ) ; + if ( c.first < transcript.first || c.first > transcript.last + || !transcript.seVector.Test( c.first ) + || ( !transcript.partial && !transcript.seVector.Test( c.last ) ) ) // no overlap or starts too early or some chosen subexons does not compatible + return 0 ; + + // Extract the subexons we should focus on. + int s, e ; + s = c.first ; + e = c.last ; + bool returnPartial = false ; + if ( e > transcript.last ) // constraints ends after the transcript. + { + if ( transcript.partial ) + { + e = transcript.last ; + returnPartial = true ; + } + else + return 0 ; + } + /*printf( "%s: %d %d: (%d %d) (%d %d)\n", __func__, s, e, + transcript.seVector.Test(0), transcript.seVector.Test(1), + c.vector.Test(0), c.vector.Test(1) ) ;*/ + + compatibleTestVectorT.Assign( transcript.seVector ) ; + //compatibleTestVectorT.MaskRegionOutsideInRange( s, e, transcript.first, transcript.last ) ; + compatibleTestVectorT.MaskRegionOutside( s, e ) ; + + compatibleTestVectorC.Assign( c.vector ) ; + if ( c.last > transcript.last ) + { + //compatibleTestVectorC.MaskRegionOutsideInRange( s, e, c.first, c.last ) ; + //compatibleTestVectorC.MaskRegionOutside( s, e ) ; + compatibleTestVectorC.MaskRegionOutside( 0, e ) ; // Because the bits before s are already all 0s in C. + } + /*printf( "after masking %d %d. %d %d %d %d:\n", s, e, transcript.first, transcript.last, c.first, c.last ) ; + compatibleTestVectorT.Print() ; + compatibleTestVectorC.Print() ; */ + // Test compatible. 
+ int ret = 0 ; + if ( compatibleTestVectorT.IsEqual( compatibleTestVectorC ) ) + { + if ( returnPartial ) + ret = 2 ; + else + ret = 1 ; + } + + return ret ; +} + +int TranscriptDecider::IsConstraintInTranscriptDebug( struct _transcript transcript, struct _constraint &c ) +{ + //printf( "%d %d, %d %d\n", c.first, c.last, transcript.first, transcript.last ) ; + if ( c.first < transcript.first || c.first > transcript.last ) // no overlap or starts too early. + return 0 ; + printf( "hi\n" ) ; + // Extract the subexons we should focus on. + int s, e ; + s = c.first ; + e = c.last ; + bool returnPartial = false ; + if ( e > transcript.last ) // constraints ends after the transcript. + { + if ( transcript.partial ) + { + e = transcript.last ; + returnPartial = true ; + } + else + return 0 ; + } + /*printf( "%s: %d %d: (%d %d) (%d %d)\n", __func__, s, e, + transcript.seVector.Test(0), transcript.seVector.Test(1), + c.vector.Test(0), c.vector.Test(1) ) ;*/ + + compatibleTestVectorT.Assign( transcript.seVector ) ; + compatibleTestVectorT.MaskRegionOutside( s, e ) ; + + compatibleTestVectorC.Assign( c.vector ) ; + if ( e > transcript.last ) + compatibleTestVectorC.MaskRegionOutside( s, e ) ; + /*printf( "after masking: (%d %d) (%d %d)\n", + compatibleTestVectorT.Test(0), compatibleTestVectorT.Test(1), + compatibleTestVectorC.Test(0), compatibleTestVectorC.Test(1) ) ;*/ + + // Test compatible. 
+ int ret = 0 ; + if ( compatibleTestVectorT.IsEqual( compatibleTestVectorC ) ) + { + if ( returnPartial ) + ret = 2 ; + else + ret = 1 ; + } + compatibleTestVectorT.Print() ; + compatibleTestVectorC.Print() ; + printf( "ret=%d\n", ret ) ; + return ret ; +} +int TranscriptDecider::SubTranscriptCount( int tag, struct _subexon *subexons, int *f ) +{ + if ( f[tag] != -1 ) + return f[tag] ; + + int ret = 0 ; + int i ; + if ( subexons[tag].canBeEnd ) + ret = 1 ; + for ( i = 0 ; i < subexons[tag].nextCnt ; ++i ) + { + ret += SubTranscriptCount( subexons[tag].next[i], subexons, f ) ; + } + + if ( ret == 0 ) + ret = 1 ; + return f[tag] = ret ; +} + +void TranscriptDecider::CoalesceSameTranscripts( std::vector<struct _transcript> &t ) +{ + int i, k ; + if ( t.size() == 0 ) + return ; + + std::sort( t.begin(), t.end(), CompSortTranscripts ) ; + + int size = t.size() ; + k = 0 ; + for ( i = 1 ; i < size ; ++i ) + { + if ( t[k].seVector.IsEqual( t[i].seVector ) ) + { + t[k].abundance += t[i].abundance ; + t[i].seVector.Release() ; + } + else + { + ++k ; + if ( i != k ) + t[k] = t[i] ; + } + } + t.resize( k + 1 ) ; +} + +void TranscriptDecider::EnumerateTranscript( int tag, int strand, int visit[], int vcnt, struct _subexon *subexons, SubexonCorrelation &correlation, double correlationScore, std::vector<struct _transcript> &alltranscripts, int &atcnt ) +{ + int i ; + visit[ vcnt ] = tag ; + //printf( "%s: %d %d %d %d. 
%d %d\n", __func__, vcnt, tag, subexons[tag].nextCnt, strand, subexons[tag].start, subexons[tag].end ) ; + // Compute the correlation score + double minCor = correlationScore ; + for ( i = 0 ; i < vcnt ; ++i ) + { + double tmp = correlation.Query( visit[i], visit[vcnt] ) ; + if ( tmp < minCor ) + minCor = tmp ; + } + + if ( subexons[tag].canBeEnd ) + { + struct _transcript &txpt = alltranscripts[atcnt] ; + for ( i = 0 ; i <= vcnt ; ++i ) + txpt.seVector.Set( visit[i] ) ; + + txpt.first = visit[0] ; + txpt.last = visit[vcnt] ; + txpt.partial = false ; + txpt.correlationScore = minCor ; + + //printf( "%lf %d %d ", txpt.correlationScore, vcnt, visit[0] ) ; + //txpt.seVector.Print() ; + ++atcnt ; + } + + for ( i = 0 ; i < subexons[tag].nextCnt ; ++i ) + { + int a = subexons[tag].next[i] ; + if ( !SubexonGraph::IsSameStrand( subexons[tag].rightStrand, strand ) + && subexons[a].start > subexons[tag].end + 1 ) + continue ; + int backupStrand = strand ; + if ( subexons[a].start > subexons[tag].end + 1 && strand == 0 ) + strand = subexons[tag].rightStrand ; + EnumerateTranscript( subexons[tag].next[i], strand, visit, vcnt + 1, subexons, correlation, minCor, alltranscripts, atcnt ) ; + strand = backupStrand ; + } +} + +void TranscriptDecider::SearchSubTranscript( int tag, int strand, int parents[], int pcnt, struct _dp &pdp, int visit[], int vcnt, int extends[], int extendCnt, +std::vector<struct _constraint> &tc, int tcStartInd, struct _dpAttribute &attr ) +{ + int i ; + int size ; + double cover ; + bool keepSearch = true ; + bool belowMin = false ; + + struct _subexon *subexons = attr.subexons ; + + visit[vcnt] = tag ; + ++vcnt ; + struct _dp visitdp ; + + visitdp.cover = -1 ; + + struct _transcript &subTxpt = attr.bufferTxpt ; + subTxpt.seVector.Reset() ; + for ( i = 0 ; i < pcnt ; ++i ) + subTxpt.seVector.Set( parents[i] ) ; + subTxpt.first = parents[0] ; + subTxpt.last = parents[ pcnt - 1] ; + for ( i = 0 ; i < vcnt ; ++i ) + subTxpt.seVector.Set( visit[i] ) ; + 
subTxpt.last = visit[ vcnt - 1 ] ; + subTxpt.partial = true ; + + // Adjust the extendsCnt + /*printf( "%s: %d %d %d\n", __func__, vcnt , extendCnt, extends[ extendCnt - 1] ) ; + subTxpt.seVector.Print() ; + tc[extends[extendCnt - 1]].vector.Print() ; + printf( "Adjust extend:\n") ;*/ + for ( i = extendCnt - 1 ; i >= 0 ; --i ) + { + if ( tc[ extends[i] ].last <= tag || ( tc[ extends[i] ].vector.Test( tag ) && IsConstraintInTranscript( subTxpt, tc[ extends[i] ] ) != 0 ) ) + break ; + } + extendCnt = i + 1 ; + + // If the extension ends. + subTxpt.partial = false ; + if ( subexons[tag].nextCnt > 0 && ( extendCnt == 0 || tag >= tc[ extends[ extendCnt - 1 ] ].last ) ) + { + // Solve the subtranscript beginning with visit. + // Now we got the optimal transcript for visit. + visitdp = SolveSubTranscript( visit, vcnt, strand, tc, tcStartInd, attr ) ; + keepSearch = false ; + } + //printf( "%s %d %d: visitdp.cover=%lf\n", __func__, parents[0], tag, visitdp.cover ) ; + + // the constraints across the parents and visit. + size = tc.size() ; + if ( visitdp.cover >= 0 ) + { + cover = visitdp.cover ; + // Reset the subTxpt, since its content is modofitied in SolveSubTxpt called above. 
+ subTxpt.seVector.Reset() ; + for ( i = 0 ; i < pcnt ; ++i ) + subTxpt.seVector.Set( parents[i] ) ; + subTxpt.seVector.Or( visitdp.seVector ) ; + subTxpt.first = parents[0] ; + subTxpt.last = visitdp.last ; + subTxpt.partial = false ; + + if ( !attr.forAbundance && attr.minAbundance > 0 ) + { + for ( i = 0 ; i < pcnt - 1 ; ++i ) + { + if ( attr.uncoveredPair.find( parents[i] * attr.seCnt + parents[i + 1] ) != attr.uncoveredPair.end() ) + belowMin = true ; + } + for ( i = -1 ; i < vcnt - 1 ; ++i ) + { + if ( i == -1 && pcnt >= 1 ) + { + if ( attr.uncoveredPair.find( parents[pcnt - 1] * attr.seCnt + visit[0] ) != attr.uncoveredPair.end() ) + belowMin = true ; + } + else + { + if ( attr.uncoveredPair.find( visit[i] * attr.seCnt + visit[i + 1] ) != attr.uncoveredPair.end() ) + belowMin = true ; + } + } + if ( attr.forAbundance && belowMin ) + cover = 1e-6 ; + } + + for ( i = tcStartInd ; i < size ; ++i ) + { + if ( tc[i].first > parents[ pcnt - 1] ) + break ; + + if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 1 ) + { + if ( tc[i].normAbund <= attr.minAbundance ) + { + belowMin = true ; + cover = -2 ; + break ; + } + + if ( tc[i].abundance <= 0 ) + continue ; + + if ( attr.forAbundance ) + { + if ( tc[i].normAbund < cover || cover == 0 ) + cover = tc[i].normAbund ; + } + else + { + ++cover ; + } + } + } + if ( belowMin && pdp.cover == -1 ) + { + pdp.cover = -2 ; + pdp.seVector.Assign( subTxpt.seVector ) ; + pdp.first = subTxpt.first ; + pdp.last = subTxpt.last ; + pdp.strand = strand ; + } + else if ( cover > pdp.cover ) + { + pdp.cover = cover ; + pdp.seVector.Assign( subTxpt.seVector ) ; + pdp.first = subTxpt.first ; + pdp.last = subTxpt.last ; + pdp.strand = strand ; + } + } + else if ( visitdp.cover == -2 && pdp.cover == -1 ) // no valid extension from visit + { + subTxpt.seVector.Reset() ; + for ( i = 0 ; i < pcnt ; ++i ) + subTxpt.seVector.Set( parents[i] ) ; + subTxpt.seVector.Or( visitdp.seVector ) ; + subTxpt.first = parents[0] ; + subTxpt.last = 
visitdp.last ; + + pdp.cover = -2 ; + pdp.seVector.Assign( subTxpt.seVector ) ; + pdp.first = subTxpt.first ; + pdp.last = subTxpt.last ; + pdp.strand = strand ; + } + + if ( subexons[tag].canBeEnd && ( visitdp.cover < 0 || attr.forAbundance ) ) + // This works is because that the extension always covers more constraints. So we only go this branch if the extension does not work + // and it goes this branch if it violates minAbundance + // But we need to go here when we want to compute the maxAbundance transcript. + // This part also works as the exit point of the recurive function. + { + bool belowMin = false ; + subTxpt.seVector.Reset() ; + for ( i = 0 ; i < pcnt ; ++i ) + subTxpt.seVector.Set( parents[i] ) ; + for ( i = 0 ; i < vcnt ; ++i ) + subTxpt.seVector.Set( visit[i] ) ; + subTxpt.first = parents[0] ; + subTxpt.last = visit[ vcnt - 1] ; + subTxpt.partial = false ; + + cover = 0 ; + if ( attr.forAbundance || attr.minAbundance > 0 ) + { + for ( i = 0 ; i < pcnt - 1 ; ++i ) + { + if ( attr.uncoveredPair.find( parents[i] * attr.seCnt + parents[i + 1] ) != attr.uncoveredPair.end() ) + belowMin = true ; + } + for ( i = -1 ; i < vcnt - 1 ; ++i ) + { + if ( i == -1 && pcnt >= 1 ) + { + if ( attr.uncoveredPair.find( parents[pcnt - 1] * attr.seCnt + visit[0] ) != attr.uncoveredPair.end() ) + belowMin = true ; + } + else + { + if ( attr.uncoveredPair.find( visit[i] * attr.seCnt + visit[i + 1] ) != attr.uncoveredPair.end() ) + belowMin = true ; + } + } + + //if ( belowMin == true ) + // printf( "turned belowMin. %d. %d %d: %d %d %d\n", attr.uncoveredPair.size(), pcnt, vcnt, parents[0], visit[0], visit[ vcnt - 1] ) ; + + if ( attr.forAbundance && belowMin ) + cover = 1e-6 ; + } + + for ( i = tcStartInd ; i < size ; ++i ) + { + // note that the value is parents[ pcnt - 1], because + // in above the part of "visit" is computed in SolveSubTranscript( visit ). 
+ if ( tc[i].first > visit[ vcnt - 1] ) + break ; + if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 1 ) + { + if ( tc[i].normAbund <= attr.minAbundance ) + { + belowMin = true ; + cover = -2 ; + break ; + } + + if ( tc[i].abundance <= 0 ) + continue ; + if ( attr.forAbundance ) + { + if ( tc[i].normAbund < cover || cover == 0 ) + cover = tc[i].normAbund ; + } + else + { + ++cover ; + } + } + } + + if ( belowMin && pdp.cover == -1 ) + { + pdp.cover = -2 ; + pdp.seVector.Assign( subTxpt.seVector ) ; + pdp.first = subTxpt.first ; + pdp.last = subTxpt.last ; + pdp.strand = strand ; + } + else if ( cover > pdp.cover ) + { + pdp.cover = cover ; + pdp.seVector.Assign( subTxpt.seVector ) ; + pdp.first = subTxpt.first ; + pdp.last = subTxpt.last ; + pdp.strand = strand ; + } + } + //printf( "%s %d: pdp.cover=%lf\n", __func__, tag, pdp.cover ) ; + + // keep searching. + if ( keepSearch ) + { + for ( i = 0 ; i < subexons[tag].nextCnt ; ++i ) + { + int b = subexons[tag].next[i] ; + if ( ( SubexonGraph::IsSameStrand( subexons[tag].rightStrand, strand ) + && SubexonGraph::IsSameStrand( subexons[b].leftStrand, strand ) ) || + subexons[b].start == subexons[tag].end + 1 ) + { + int backupStrand = strand ; + if ( subexons[b].start > subexons[tag].end + 1 ) + strand = subexons[tag].rightStrand ; + + SearchSubTranscript( subexons[tag].next[i], strand, parents, pcnt, pdp, visit, vcnt, + extends, extendCnt, tc, tcStartInd, attr ) ; + strand = backupStrand ; + } + } + + } + + return ; +} + +struct _dp TranscriptDecider::SolveSubTranscript( int visit[], int vcnt, int strand, std::vector<struct _constraint> &tc, int tcStartInd, struct _dpAttribute &attr ) +{ + int i ; + int size ; + /*printf( "%s: ", __func__ ) ; + for ( i = 0 ; i < vcnt ; ++i ) + printf( "%d ", visit[i] ) ; + printf( ": %lf %d %d", attr.f1[ visit[0] ].cover, attr.f1[ visit[0] ].timeStamp, attr.timeStamp ) ; + printf( "\n" ) ;*/ + // Test whether it is stored in dp + if ( vcnt == 1 ) + { + if ( attr.f1[ visit[0] ].cover 
!= -1 && attr.f1[ visit[0] ].strand == strand && ( attr.f1[ visit[0] ].timeStamp == attr.timeStamp || + ( attr.f1[ visit[0] ].minAbundance < attr.minAbundance && attr.f1[visit[0]].cover == -2 ) ) ) //even given lower minAbundance threshold, it fails + { + return attr.f1[ visit[0] ] ; + } + } + else if ( vcnt == 2 && attr.f2 ) + { + int a = visit[0] ; + int b = visit[1] ; + + if ( attr.f2[a][b].cover != -1 && attr.f2[a][b].strand == strand && ( attr.f2[a][b].timeStamp == attr.timeStamp || + ( attr.f2[a][b].minAbundance < attr.minAbundance && attr.f2[a][b].cover == -2 ) ) ) + { + return attr.f2[a][b] ; + } + } + else + { + int key = 0 ; + for ( i = 0 ; i < vcnt ; ++i ) + key = ( key * attr.seCnt + visit[i] ) % hashMax ; + if ( key < 0 ) + key += hashMax ; + + if ( attr.hash[key].cover != -1 && attr.hash[key].cnt == vcnt && attr.hash[key].strand == strand && + ( attr.hash[key].first == visit[0] ) && + ( attr.hash[key].timeStamp == attr.timeStamp || + ( attr.hash[key].minAbundance < attr.minAbundance && attr.hash[key].cover == -2 ) ) ) + { + struct _transcript subTxpt = attr.bufferTxpt ; + subTxpt.seVector.Reset() ; + for ( i = 0 ; i < vcnt ; ++i ) + subTxpt.seVector.Set( visit[i] ) ; + //subTxpt.seVector.Print() ; + //attr.hash[key].seVector.Print() ; + subTxpt.seVector.Xor( attr.hash[key].seVector ) ; + subTxpt.seVector.MaskRegionOutside( visit[0], visit[ vcnt - 1] ) ; + //printf( "hash test: %d %d\n", key, subTxpt.seVector.IsAllZero() ) ; + if ( subTxpt.seVector.IsAllZero() ) + { + return attr.hash[key] ; + } + + // Can't use the code below, because vcnt is the header of subexons. 
+ /*for ( i = 0 ; i < vcnt ; ++i ) + if ( !attr.hash[key].seVector.Test( visit[i] ) ) + break ; + if ( i >= vcnt ) + return attr.hash[key] ;*/ + + } + } + // adjust tcStartInd + size = tc.size() ; + for ( i = tcStartInd ; i < size ; ++i ) + if ( tc[i].first >= visit[0] ) + break ; + tcStartInd = i ; + + + struct _subexon *subexons = attr.subexons ; + struct _dp visitdp ; + visitdp.seVector.Init( attr.seCnt ) ; + visitdp.cover = -1 ; + + struct _transcript &subTxpt = attr.bufferTxpt ; + // This happens when it is called from PickTranscriptsByDP, the first subexon might be the end. + subTxpt.seVector.Reset() ; + for ( i = 0 ; i < vcnt ; ++i ) + subTxpt.seVector.Set( visit[i] ) ; + subTxpt.first = visit[0] ; + subTxpt.last = visit[vcnt - 1] ; + + if ( subexons[ visit[vcnt - 1] ].canBeEnd ) + { + subTxpt.partial = false ; + double cover = 0 ; + for ( i = tcStartInd ; i < size ; ++i ) + { + if ( tc[i].first > subTxpt.last ) + break ; + + if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 1 ) + { + if ( tc[i].normAbund <= attr.minAbundance ) + { + cover = -2 ; + break ; + } + + if ( tc[i].abundance <= 0 ) + continue ; + if ( attr.forAbundance ) + { + if ( tc[i].normAbund < cover || cover == 0 ) + cover = tc[i].normAbund ; + } + else + ++cover ; + } + } + + visitdp.seVector.Assign( subTxpt.seVector ) ; + visitdp.cover = cover ; + visitdp.first = subTxpt.first ; + visitdp.last = subTxpt.last ; + visitdp.strand = strand ; + } + + // Now we extend. + size = tc.size() ; + int *extends = new int[tc.size() - tcStartInd + 1] ; + int extendCnt = 0 ; + subTxpt.partial = true ; + for ( i = tcStartInd ; i < size ; ++i ) + { + if ( tc[i].first > subTxpt.last ) + break ; + if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 2 ) + { + extends[extendCnt] = i ; + ++extendCnt ; + } + } + + // Sort the extend by the index of the last subexon. 
+ if ( extendCnt > 0 ) + { + struct _pair32 *extendsPairs = new struct _pair32[extendCnt] ; + + for ( i = 0 ; i < extendCnt ; ++i ) + { + extendsPairs[i].a = extends[i] ; + extendsPairs[i].b = tc[ extends[i] ].last ; + } + qsort( extendsPairs, extendCnt, sizeof( struct _pair32 ), CompPairsByB ) ; + + for ( i = 0 ; i < extendCnt ; ++i ) + extends[i] = extendsPairs[i].a ; + + delete[] extendsPairs ; + } + + size = subexons[ visit[vcnt - 1] ].nextCnt ; + int nextvCnt = 1 ; + if ( extendCnt > 0 && tc[ extends[ extendCnt - 1 ] ].last - visit[ vcnt - 1 ] > 1 ) + nextvCnt = tc[ extends[ extendCnt - 1 ] ].last - visit[ vcnt - 1 ] ; + int *nextv = new int[ nextvCnt ] ; + for ( i = 0 ; i < size ; ++i ) + { + int a = visit[vcnt - 1] ; + int b = subexons[a].next[i] ; + if ( ( SubexonGraph::IsSameStrand( subexons[a].rightStrand, strand ) + && SubexonGraph::IsSameStrand( subexons[b].leftStrand, strand ) ) + || + subexons[b].start == subexons[a].end + 1 ) + { + int backupStrand = strand ; + if ( subexons[b].start > subexons[a].end + 1 ) + strand = subexons[a].rightStrand ; + SearchSubTranscript( subexons[ visit[vcnt - 1] ].next[i], strand, visit, vcnt, visitdp, nextv, 0, extends, extendCnt, tc, tcStartInd, attr ) ; + strand = backupStrand ; + + } + } + //printf( "%s %d(%d) %d %d %d: %lf\n", __func__, visit[0], subexons[ visit[vcnt - 1] ].canBeEnd, size, extendCnt, strand, visitdp.cover ) ; + delete[] nextv ; + delete[] extends ; + + // store the result in the dp structure. + // We return the structure stored in dp to simplify the memory access pattern. 
+ // In other words, we assume the structure returned from this function always uses the memory from attr.dp + if ( vcnt == 1 ) + { + SetDpContent( attr.f1[ visit[0] ], visitdp, attr ) ; + visitdp.seVector.Release() ; + return attr.f1[ visit[0] ] ; + } + else if ( vcnt == 2 && attr.f2 ) + { + SetDpContent( attr.f2[ visit[0] ][ visit[1] ], visitdp, attr ) ; + visitdp.seVector.Release() ; + return attr.f2[ visit[0] ][ visit[1] ] ; + } + else + { + int key = 0 ; + for ( i = 0 ; i < vcnt ; ++i ) + key = ( key * attr.seCnt + visit[i] ) % hashMax ; + if ( key < 0 ) + key += hashMax ; + + //static int hashUsed = 0 ; + //if ( attr.hash[key].cover == -1 ) + // ++hashUsed ; + //printf( "%d/%d\n", hashUsed, HASH_MAX) ; + //printf( "hash write: %d\n", key ) ; + SetDpContent( attr.hash[key], visitdp, attr ) ; + attr.hash[key].cnt = vcnt ; + visitdp.seVector.Release() ; + return attr.hash[key] ; + } +} + + +void TranscriptDecider::PickTranscriptsByDP( struct _subexon *subexons, int seCnt, int iterBound, Constraints &constraints, SubexonCorrelation &correlation, struct _dpAttribute &attr, std::vector<struct _transcript> &alltranscripts ) +{ + int i, j, k ; + + std::vector<struct _transcript> transcripts ; + std::vector<struct _constraint> &tc = constraints.constraints ; + int tcCnt = tc.size() ; + int coalesceThreshold = 1024 ; + + //printf( "tcCnt=%d\n", tcCnt ) ; + + attr.timeStamp = 1 ; + attr.bufferTxpt.seVector.Init( seCnt ) ; + attr.subexons = subexons ; + attr.seCnt = seCnt ; + + double maxAbundance = -1 ; + // Initialize the dp data structure + /*memset( attr.f1, -1, sizeof( struct _dp ) * seCnt ) ; + for ( i = 0 ; i < seCnt ; ++i ) + memset( attr.f2[i], -1, sizeof( struct _dp ) * seCnt ) ; + memset( attr.hash, -1, sizeof( struct _dp ) * HASH_MAX ) ;*/ + for ( i = 0 ; i < seCnt ; ++i ) + ResetDpContent( attr.f1[i] ) ; + for ( i = 0 ; i < seCnt && attr.f2 ; ++i ) + for ( j = i ; j < seCnt ; ++j ) + ResetDpContent( attr.f2[i][j] ) ; + for ( i = 0 ; i < hashMax ; ++i ) + 
ResetDpContent( attr.hash[i] ) ; + + // Set the uncovered pair + attr.uncoveredPair.clear() ; + BitTable bufferTable( seCnt ) ; + k = 0 ; + for ( i = 0 ; i < seCnt ; ++i ) + { + for ( ; k < tcCnt ; ++k ) + { + if ( tc[k].last >= i ) + break ; + } + + if ( k >= tcCnt || tc[k].first > i ) + { + for ( j = 0 ; j < subexons[i].nextCnt ; ++j ) + { + attr.uncoveredPair[i * seCnt + subexons[i].next[j] ] = 1 ; + } + continue ; + } + + for ( j = 0 ; j < subexons[i].nextCnt ; ++j ) + { + bool covered = false ; + int l, n ; + + n = subexons[i].next[j] ; + for ( l = k ; l < tcCnt ; ++l ) + { + if ( tc[l].first > i ) + break ; + if ( tc[l].vector.Test( i ) && tc[l].vector.Test( n ) ) + { + if ( n == i + 1 ) + { + covered = true ; + break ; + } + else + { + bufferTable.Assign( tc[l].vector ) ; + bufferTable.MaskRegionOutside( i + 1, n - 1 ) ; + if ( bufferTable.IsAllZero() ) + { + covered = true ; + break ; + } + } + } + } + + if ( !covered ) + { + //printf( "set!: (%d: %d %d) (%d: %d %d)\n", i, subexons[i].start, subexons[i].end, n, subexons[n].start, subexons[n].end ) ; + attr.uncoveredPair[ i * seCnt + n ] = 1 ; + } + } + } + bufferTable.Release() ; + + + // Find the max abundance + attr.forAbundance = true ; + attr.minAbundance = 0 ; + for ( i = 0 ; i < seCnt ; ++i ) + { + if ( subexons[i].canBeStart ) + { + int visit[1] = {i} ; + struct _dp tmp ; + + tmp = SolveSubTranscript( visit, 1, 0, tc, 0, attr ) ; + + if ( tmp.cover > maxAbundance ) + maxAbundance = tmp.cover ; + } + } + //PrintLog( "maxAbundance=%lf", maxAbundance ) ; + //exit( 1 ) ; + + // Pick the transcripts. Quantative Set-Cover + // Notice that by the logic in SearchSubTxpt and SolveSubTxpt, we don't need to reinitialize the data structure. 
+ attr.forAbundance = false ; + int *coveredTc = new int[tcCnt] ; + int coveredTcCnt ; + struct _dp maxCoverDp ; + struct _dp bestDp ; + std::map<double, struct _dp> cachedCoverResult ; + + maxCoverDp.seVector.Init( seCnt ) ; + bestDp.seVector.Init( seCnt ) ; + int iterCnt = 0 ; + + while ( 1 ) + { + double bestScore ; + + // iterately assign constraints + attr.minAbundance = 0 ; + + // Find the best candidate transcript. + bestDp.cover = -1 ; + bestScore = -1 ; + while ( 1 ) + { + // iterate the change of minAbundance + if ( cachedCoverResult.find( attr.minAbundance ) != cachedCoverResult.end() ) + { + struct _dp tmp = cachedCoverResult[ attr.minAbundance ] ; + SetDpContent( maxCoverDp, tmp, attr ) ; + } + else + { + maxCoverDp.cover = -1 ; + ++attr.timeStamp ; + for ( i = 0 ; i < seCnt ; ++i ) + { + if ( subexons[i].canBeStart == false ) + continue ; + int visit[1] = {i} ; + struct _dp tmp ; + tmp = SolveSubTranscript( visit, 1, 0, tc, 0, attr ) ; + + if ( tmp.cover > maxCoverDp.cover && tmp.cover > 0 ) + { + SetDpContent( maxCoverDp, tmp, attr ) ; + } + //if ( subexons[i].start == 6870264 || subexons[i].start == 6872237 ) + // printf( "%d: %lf\n", i, tmp.cover ) ; + } + + if ( maxCoverDp.cover == -1 ) + break ; + struct _dp ccr ; + ccr.seVector.Init( seCnt ) ; + SetDpContent( ccr, maxCoverDp, attr ) ; + cachedCoverResult[ attr.minAbundance ] = ccr ; + } + // the abundance for the max cover txpt. 
+ double min = -1 ; + struct _transcript &subTxpt = attr.bufferTxpt ; + subTxpt.seVector.Assign( maxCoverDp.seVector ) ; + subTxpt.first = maxCoverDp.first ; + subTxpt.last = maxCoverDp.last ; + + for ( i = 0 ; i < tcCnt ; ++i ) + { + if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 1 ) + { + if ( tc[i].normAbund < min || min == -1 ) + min = tc[i].normAbund ; + } + } + + if ( attr.minAbundance == 0 ) + { + std::vector<int> subexonIdx ; + maxCoverDp.seVector.GetOnesIndices( subexonIdx ) ; + int size = subexonIdx.size() ; + for ( i = 0 ; i < size - 1 ; ++i ) + if ( attr.uncoveredPair.find( subexonIdx[i] * seCnt + subexonIdx[i + 1] ) != attr.uncoveredPair.end() ) + { + min = 1e-6 ; + break ; + } + } + + double score = ComputeScore( maxCoverDp.cover, 1.0, min, maxAbundance, 0 ) ; + if ( bestScore == -1 || score > bestScore ) + { + bestScore = score ; + SetDpContent( bestDp, maxCoverDp, attr ) ; + } + else if ( score < bestScore ) + { + if ( ComputeScore( maxCoverDp.cover, 1.0, maxAbundance, maxAbundance, 0 ) < bestScore ) + break ; + } + //PrintLog( "normAbund=%lf maxCoverDp.cover=%lf score=%lf timeStamp=%d", min, maxCoverDp.cover, score, attr.timeStamp ) ; + attr.minAbundance = min ; + } // end of iteration for minAbundance. + + if ( bestDp.cover == -1 ) + break ; + // Assign the constraints. 
+ coveredTcCnt = 0 ; + double update = -1 ; + struct _transcript &subTxpt = attr.bufferTxpt ; + subTxpt.seVector.Assign( bestDp.seVector ) ; + subTxpt.first = bestDp.first ; + subTxpt.last = bestDp.last ; + subTxpt.partial = false ; + for ( i = 0 ; i < tcCnt ; ++i ) + { + if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 1 ) + { + if ( tc[i].abundance > 0 && + ( tc[i].abundance < update || update == -1 ) ) + { + update = tc[i].abundance ; + } + coveredTc[ coveredTcCnt ] = i ; + ++coveredTcCnt ; + } + /*else + { + printf( "%d: ", i ) ; + tc[i].vector.Print() ; + if ( i == 127 ) + { + printf( "begin debug:\n" ) ; + IsConstraintInTranscriptDebug( subTxpt, tc[i] ) ; + } + }*/ + } + update *= ( 1 + iterCnt / 50 ) ;//* ( 1 + iterCnt / 50 ) ; + + //PrintLog( "%d: update=%lf %d %d. %d %d %d", iterCnt, update, coveredTcCnt, tcCnt, + // bestDp.first, bestDp.last, subexons[ bestDp.first ].start ) ; + //bestDp.seVector.Print() ; + + struct _transcript nt ; + nt.seVector.Duplicate( bestDp.seVector ) ; + nt.first = bestDp.first ; + nt.last = bestDp.last ; + nt.partial = false ; + nt.abundance = 0 ; + for ( i = 0 ; i < coveredTcCnt ; ++i ) + { + j = coveredTc[i] ; + if ( tc[j].abundance > 0 ) + { + double tmp = ( tc[j].abundance > update ? 
update : tc[j].abundance ) ; + tc[j].abundance -= tmp ; + double factor = 1 ; + + nt.abundance += ( tc[j].support * update / tc[j].normAbund * factor ) ; + + if ( tc[j].abundance <= 0 ) + { + std::vector<double> removeKey ; + for ( std::map<double, struct _dp>::iterator it = cachedCoverResult.begin() ; it != cachedCoverResult.end() ; ++it ) + { + subTxpt.seVector.Assign( it->second.seVector ) ; + subTxpt.first = it->second.first ; + subTxpt.last = it->second.last ; + subTxpt.partial = false ; + if ( IsConstraintInTranscript( subTxpt, tc[j] ) == 1 ) + { + it->second.seVector.Release() ; + removeKey.push_back( it->first ) ; + } + } + int size = removeKey.size() ; + int l ; + for ( l = 0 ; l < size ; ++l ) + cachedCoverResult.erase( removeKey[l] ) ; + } + } + + if ( tc[j].abundance < 0 ) + tc[j].abundance = 0 ; + } + + transcripts.push_back( nt ) ; + if ( transcripts.size() >= transcripts.capacity() && (int)transcripts.size() >= coalesceThreshold ) + { + CoalesceSameTranscripts( transcripts ) ; + if ( transcripts.size() >= transcripts.capacity() / 2 ) + coalesceThreshold *= 2 ; + } + ++iterCnt ; + + if ( iterCnt >= iterBound ) + break ; + } + CoalesceSameTranscripts( transcripts ) ; + int size = transcripts.size() ; + // Compute the correlation score + for ( i = 0 ; i < size ; ++i ) + { + std::vector<int> subexonInd ; + transcripts[i].seVector.GetOnesIndices( subexonInd ) ; + double cor = 2.0 ; + int s = subexonInd.size() ; + for ( j = 0 ; j < s ; ++j ) + for ( k = j + 1 ; k < s ; ++k ) + { + double tmp = correlation.Query( subexonInd[j], subexonInd[k] ) ; + if ( tmp < cor ) + cor = tmp ; + } + if ( cor > 1 ) + cor = 0 ; + transcripts[i].correlationScore = cor ; + } + + // store the result + for ( i = 0 ; i < size ; ++i ) + alltranscripts.push_back( transcripts[i] ) ; + + // Release the memory + for ( std::map<double, struct _dp>::iterator it = cachedCoverResult.begin() ; it != cachedCoverResult.end() ; ++it ) + { + it->second.seVector.Release() ; + } + 
attr.bufferTxpt.seVector.Release() ; + + delete[] coveredTc ; + maxCoverDp.seVector.Release() ; + bestDp.seVector.Release() ; +} + + +// Add the preifx/suffix of transcripts to the list +void TranscriptDecider::AugmentTranscripts( struct _subexon *subexons, std::vector<struct _transcript> &alltranscripts, int limit, bool extend ) +{ + int i, j, k ; + int size = alltranscripts.size() ; + if ( size >= limit ) + return ; + + // Augment suffix, prefix transcripts + for ( i = 0 ; i < size ; ++i ) + { + std::vector<int> subexonIdx ; + alltranscripts[i].seVector.GetOnesIndices( subexonIdx ) ; + int seIdxCnt = subexonIdx.size() ; + // suffix + for ( j = 1 ; j < seIdxCnt ; ++j ) + { + if ( subexons[ subexonIdx[j] ].canBeStart ) + { + struct _transcript nt ; + nt.first = subexonIdx[j] ; + nt.last = alltranscripts[i].last ; + nt.seVector.Duplicate( alltranscripts[i].seVector ) ; + nt.seVector.MaskRegionOutside( nt.first, nt.last ) ; + nt.partial = false ; + nt.correlationScore = 0 ; + nt.abundance = 0 ; + nt.constraintsSupport = NULL ; + + alltranscripts.push_back( nt ) ; + if ( alltranscripts.size() >= limit ) + return ; + } + } + + // prefix + for ( j = 0 ; j < seIdxCnt - 1 ; ++j ) + { + if ( subexons[ subexonIdx[j] ].canBeEnd ) + { + struct _transcript nt ; + nt.first = alltranscripts[i].first ; + nt.last = subexonIdx[j] ; + nt.seVector.Duplicate( alltranscripts[i].seVector ) ; + nt.seVector.MaskRegionOutside( nt.first, nt.last ) ; + nt.partial = false ; + nt.correlationScore = 0 ; + nt.abundance = 0 ; + nt.constraintsSupport = NULL ; + + alltranscripts.push_back( nt ) ; + if ( alltranscripts.size() >= limit ) + return ; + } + } + + if ( extend ) + { + //Extentions right. 
+ for ( j = 0 ; j < seIdxCnt ; ++j ) + { + if ( subexons[ subexonIdx[j] ].nextCnt > 1 ) + { + for ( k = 0 ; k < subexons[ subexonIdx[j] ].nextCnt ; ++k ) + { + int idx = subexons[ subexonIdx[j] ].next[k] ; + + if ( alltranscripts[i].seVector.Test( idx ) ) + continue ; + int l ; + std::vector<int> visited ; + while ( 1 ) + { + if ( subexons[idx].nextCnt > 1 || subexons[idx].prevCnt > 1 ) + { + break ; + } + + visited.push_back( idx ) ; + if ( subexons[idx].canBeEnd && subexons[idx].nextCnt == 0 ) + { + struct _transcript nt ; + nt.first = alltranscripts[i].first ; + nt.last = idx ; + nt.seVector.Duplicate( alltranscripts[i].seVector ) ; + nt.seVector.MaskRegionOutside( nt.first, subexonIdx[j] ) ; + int visitedSize = visited.size() ; + for ( l = 0 ; l < visitedSize ; ++l ) + nt.seVector.Set( visited[l] ) ; + nt.partial = false ; + nt.correlationScore = 0 ; + nt.abundance = 0 ; + nt.constraintsSupport = NULL ; + + alltranscripts.push_back( nt ) ; + if ( alltranscripts.size() >= limit ) + return ; + } + + if ( subexons[idx].nextCnt == 1 ) + idx = subexons[idx].next[0] ; + else + break ; + } + } + } + } + + // Extension towards left + for ( j = 0 ; j < seIdxCnt ; ++j ) + { + if ( subexons[ subexonIdx[j] ].prevCnt > 1 ) + { + for ( k = 0 ; k < subexons[ subexonIdx[j] ].prevCnt ; ++k ) + { + int idx = subexons[ subexonIdx[j] ].prev[k] ; + + if ( alltranscripts[i].seVector.Test( idx ) ) + continue ; + int l ; + std::vector<int> visited ; + while ( 1 ) + { + if ( subexons[idx].nextCnt > 1 || subexons[idx].prevCnt > 1 ) + { + break ; + } + + visited.push_back( idx ) ; + if ( subexons[idx].canBeStart && subexons[idx].prevCnt == 0 ) + { + struct _transcript nt ; + nt.first = idx ; + nt.last = alltranscripts[i].last ; + nt.seVector.Duplicate( alltranscripts[i].seVector ) ; + nt.seVector.MaskRegionOutside( subexonIdx[j], nt.last ) ; + int visitedSize = visited.size() ; + for ( l = 0 ; l < visitedSize ; ++l ) + nt.seVector.Set( visited[l] ) ; + nt.partial = false ; + 
nt.correlationScore = 0 ; + nt.abundance = 0 ; + nt.constraintsSupport = NULL ; + + alltranscripts.push_back( nt ) ; + if ( alltranscripts.size() >= limit ) + return ; + } + + if ( subexons[idx].prevCnt == 1 ) + idx = subexons[idx].prev[0] ; + else + break ; + } + } + } + } + } // for if-extend + } + + CoalesceSameTranscripts( alltranscripts ) ; +} + +// Pick the transcripts from given transcripts. +void TranscriptDecider::PickTranscripts( struct _subexon *subexons, std::vector<struct _transcript> &alltranscripts, Constraints &constraints, + SubexonCorrelation &seCorrelation, std::vector<struct _transcript> &transcripts ) +{ + int i, j, k ; + std::vector<int> chosen ; + std::vector<struct _matePairConstraint> &tc = constraints.matePairs ; + int atcnt = alltranscripts.size() ; + int tcCnt = tc.size() ; // transcript constraints + int seCnt = 0 ; + + if ( tcCnt == 0 ) + return ; + if ( atcnt > 0 ) + seCnt = alltranscripts[0].seVector.GetSize() ; + else + return ; + + double inf = -1 ; // infinity + int coalesceThreshold = 1024 ; + int *transcriptSeCnt = new int[ atcnt ] ; + int *transcriptLength = new int[atcnt] ; + double *transcriptAbundance = new double[atcnt] ; // the roughly estimated abundance based on constraints. + double *avgTranscriptAbundance = new double[atcnt] ; // the average normAbund from the compatible constraints. + + BitTable *btable = new BitTable[ atcnt ] ; + //BitTable lowCovSubexon ; // force the abundance to 0 for the transcript contains the subexon. 
+ double *coveredPortion = new double[atcnt] ; + + memset( avgTranscriptAbundance, 0 ,sizeof( double ) * atcnt ) ; + for ( i = 0 ; i < atcnt ; ++i ) + btable[i].Init( tcCnt ) ; + for ( j = 0 ; j < tcCnt ; ++j ) + { + int a = constraints.matePairs[j].i ; + int b = constraints.matePairs[j].j ; + + if ( constraints.constraints[a].support > inf ) + inf = constraints.constraints[a].support ; + if ( constraints.constraints[b].support > inf ) + inf = constraints.constraints[b].support ; + + if ( tc[j].normAbund > inf ) + inf = tc[j].normAbund ; + + tc[j].abundance = tc[j].normAbund ; + } + ++inf ; + bool btableSet = false ; + for ( i = 0 ; i < atcnt ; ++i ) + { + //printf( "correlation %d: %lf\n", i, alltranscripts[i].correlationScore ) ; + /*for ( int l = 0 ; l < subexonInd.size() ; ++l ) + { + for ( int m = l ; m < subexonInd.size() ; ++m ) + printf( "%lf ", seCorrelation.Query( l, m ) ) ; + printf( "\n" ) ; + }*/ + + for ( j = 0 ; j < tcCnt ; ++j ) + { + int a = tc[j].i ; + int b = tc[j].j ; + + //printf( "try set btble[ %d ].Set( %d ): %d %d\n", i, j, a, b ) ; + //alltranscripts[i].seVector.Print() ; + //constraints.constraints[a].vector.Print() ; + //constraints.constraints[b].vector.Print() ; + if ( IsConstraintInTranscript( alltranscripts[i], constraints.constraints[a] ) == 1 + && IsConstraintInTranscript( alltranscripts[i], constraints.constraints[b] ) == 1 ) + { + //printf( "set btble[ %d ].Set( %d ): %d %d\n", i, j, a, b ) ; + btable[i].Set( j ) ; + btableSet = true ; + } + } + transcriptSeCnt[i] = alltranscripts[i].seVector.Count() ; + } + if ( btableSet == false ) + { + for ( i = 0 ; i < atcnt ; ++i ) + btable[i].Release() ; + delete[] btable ; + return ; + } + + double maxAbundance = -1 ; // The abundance of the most-abundant transcript + double *adjustScore = new double[atcnt] ; + memset( adjustScore, 0, sizeof( double ) * atcnt ) ; + if ( atcnt > 0 /*&& alltranscripts[0].abundance == -1*/ ) + { + struct _pair32 *chain = new struct _pair32[seCnt] ; + bool 
*covered = new bool[seCnt] ; + bool *usedConstraints = new bool[constraints.constraints.size() ] ; + std::vector<BitTable> togetherChain ; // those subexons is more likely to show up in the same transcript, like an IR with overhang, should be together to represent a 3'/5'-end + + /*lowCovSubexon.Init( seCnt ) ; + double *avgDepth = new double[seCnt ] ; + + memset( avgDepth, 0, sizeof( double ) * seCnt ) ; + int size = constraints.constraints.size() ; + for ( i = 0 ; i < size ; ++i ) + { + std::vector<int> subexonIdx ; + constraints.constraints[i].GetOnesIndices( subexonIdx ) ; + int seIdxCnt = subexonidx.size() ; + for ( j = 0 ; j < seIdxCnt ; ++j ) + avgDepth[ subexonidx[j] ] += constraints.constraints[i].support ; + } + for ( i = 0 ; i < seCnt ; ++i ) + { + if ( avgDepth[i] * alignments.readLen / (double)( subexons[i].end - subexons[i].start + 1 ) < 1 ) + }*/ + + struct _pair32 firstRegion, lastRegion ; + + for ( i = 0 ; i < seCnt ; ) + { + for ( j = i + 1 ; j < seCnt ; ++j ) + { + if ( subexons[j].start > subexons[j - 1].end + 1 ) + break ; + } + + + int cnt = 0 ; + for ( k = i ; k < j ; ++k ) + { + if ( ( subexons[k].leftType == 2 && subexons[k].rightType == 1 ) + || ( subexons[k].leftType == 0 && subexons[k].rightType == 1 ) + || ( subexons[k].leftType == 2 && subexons[k].rightType == 0 ) ) + ++cnt ; + } + + if ( cnt <= 1 ) + { + i = j ; + continue ; + } + + BitTable tmpTable( seCnt ) ; + for ( k = i ; k < j ; ++k ) + { + if ( ( subexons[k].leftType == 2 && subexons[k].rightType == 1 ) + || ( subexons[k].leftType == 0 && subexons[k].rightType == 1 ) + || ( subexons[k].leftType == 2 && subexons[k].rightType == 0 ) ) + tmpTable.Set( k ) ; + } + togetherChain.push_back( tmpTable ) ; + i = j ; + } + + for ( i = 0 ; i < atcnt ; ++i ) + { + double value = inf ; + int tag = -1 ; + + alltranscripts[i].abundance = 0 ; + alltranscripts[i].constraintsSupport = new double[tcCnt] ; + + std::vector<int> subexonIdx ; + alltranscripts[i].seVector.GetOnesIndices( subexonIdx ) 
; + int seIdxCnt = subexonIdx.size() ; + transcriptLength[i] = 0 ; + + firstRegion.a = subexonIdx[0] ; + for ( j = 1 ; j < seIdxCnt ; ++j ) + { + if ( subexons[ subexonIdx[j] ].start > subexons[ subexonIdx[j - 1] ].end + 1 ) + break ; + } + firstRegion.b = subexonIdx[j - 1] ; + lastRegion.b = subexonIdx[ seIdxCnt - 1 ] ; + for ( j = seIdxCnt - 2 ; j >= 0 ; --j ) + { + if ( subexons[ subexonIdx[j] ].end < subexons[ subexonIdx[j + 1] ].start - 1 ) + break ; + } + lastRegion.a = subexonIdx[j + 1] ; + + for ( j = 0 ; j < seIdxCnt ; ++j ) + transcriptLength[i] += subexons[ subexonIdx[j] ].end - subexons[ subexonIdx[j] ].start + 1 ; + + //for ( j = firstRegion.b ; j < lastRegion.a ; ++j ) + for ( j = 0 ; j < seIdxCnt - 1 ; ++j ) + { + chain[j].a = subexonIdx[j] ; + chain[j].b = subexonIdx[j + 1] ; + covered[j] = false ; + } + memset( usedConstraints, false, sizeof( bool ) * constraints.constraints.size() ) ; + int compatibleCnt = 0 ; + for ( j = 0 ; j < tcCnt ; ++j ) + { + alltranscripts[i].constraintsSupport[j] = 0 ; + if ( btable[i].Test(j) && tc[j].abundance > 0 ) + { + ++compatibleCnt ; + double adjustAbundance = tc[j].abundance ; + if ( seIdxCnt > 1 ) + { + if ( tc[j].i == tc[j].j + && ( constraints.constraints[ tc[j].i ].first + + constraints.constraints[ tc[j].i ].last == 2 * alltranscripts[i].first + || constraints.constraints[ tc[j].i ].first + + constraints.constraints[ tc[j].i ].last == 2 * alltranscripts[i].last ) ) + { + adjustAbundance = inf ; + } + else if ( tc[j].i != tc[j].j + && ( constraints.constraints[ tc[j].i ].first + + constraints.constraints[ tc[j].i ].last == 2 * alltranscripts[i].first + || constraints.constraints[ tc[j].i ].first + + constraints.constraints[ tc[j].i ].last == 2 * alltranscripts[i].last ) ) + { + adjustAbundance = constraints.constraints[ tc[j].j ].normAbund ; + } + else if ( tc[j].i != tc[j].j + && ( constraints.constraints[ tc[j].j ].first + + constraints.constraints[ tc[j].j ].last == 2 * alltranscripts[i].first + || 
constraints.constraints[ tc[j].j ].first + + constraints.constraints[ tc[j].j ].last == 2 * alltranscripts[i].last ) ) + { + adjustAbundance = constraints.constraints[ tc[j].i ].normAbund ; + } + } + if ( adjustAbundance < value ) + /*!( seIdxCnt > 1 + && ( ( ( constraints.constraints[ tc[j].i ].first >= firstRegion.a && constraints.constraints[ tc[j].i ].last <= firstRegion.b ) + && ( constraints.constraints[ tc[j].j ].first >= firstRegion.a && constraints.constraints[ tc[j].j ].last <= firstRegion.b ) ) + || ( ( constraints.constraints[ tc[j].i ].first >= lastRegion.a && constraints.constraints[ tc[j].i ].last <= lastRegion.b ) + && ( constraints.constraints[ tc[j].j ].first >= lastRegion.a && constraints.constraints[ tc[j].j ].last <= lastRegion.b ) ) ) ) + )*/ + { + // Not use the constraints totally within the 3'/5'-end in the transcript + value = adjustAbundance ; + tag = j ; + } + avgTranscriptAbundance[i] += tc[j].abundance ; + + if ( !usedConstraints[ tc[j].i ] ) + { + struct _constraint &c = constraints.constraints[ tc[j].i ] ; + for ( k = 0 ; k < seIdxCnt - 1 ; ++k ) + { + // Note that since the constraint is already compatible with the txpt, + // chain[k].a/b must be also adjacent in this constraint. 
+ if ( c.vector.Test( chain[k].a ) && c.vector.Test( chain[k].b ) ) + covered[k] = true ; + } + usedConstraints[ tc[j].i ] = true ; + } + + if ( !usedConstraints[ tc[j].j ] ) + { + struct _constraint &c = constraints.constraints[ tc[j].j ] ; + for ( k = 0 ; k < seIdxCnt - 1 ; ++k ) + { + if ( c.vector.Test( chain[k].a ) && c.vector.Test( chain[k].b ) ) + covered[k] = true ; + } + usedConstraints[ tc[j].j ] = true ; + } + } + } + + // Get some penalty if something should together did not show up together + int size = togetherChain.size() ; + if ( size > 0 ) + { + BitTable bufferTable( seCnt ) ; + for ( j = 0 ; j < size ; ++j ) + { + bufferTable.Assign( togetherChain[j] ) ; + bufferTable.And( alltranscripts[i].seVector ) ; + //if ( !bufferTable.IsAllZero() && !bufferTable.IsEqual( togetherChain[j] ) ) + // value /= 2 ; + + if ( !bufferTable.IsAllZero() ) + { + if ( bufferTable.IsEqual( togetherChain[j] ) ) + //printf( "nice together!\n" ) ; + ; + else + value /= 2 ; + //printf( "bad together!\n" ) ; + } + } + bufferTable.Release() ; + } + + + // Every two-subexon chain should be covered by some reads if a transcript is expressed highly enough + int cnt = 0 ; + for ( j = 0 ; j < seIdxCnt - 1 ; ++j ) + if ( covered[j] == false ) // && j >= firstRegion.b && j <= lastRegion.a - 1 ) + { + value = 0 ; + } + else + ++cnt ; + if ( seIdxCnt > 1 ) + coveredPortion[i] = (double)cnt / (double)( seIdxCnt - 1 ) ; + else + coveredPortion[i] = 1 ; + if ( coveredPortion[i] == 0 ) + coveredPortion[i] = (double)0.5 / ( seIdxCnt ) ; + + // For short subexon (readLength-subexon_length-1>30), we further require a constraint cover three conseuctive subexon + /*memset( usedConstraints, false, sizeof( bool ) * constraints.constraints.size() ) ; + for ( j = 1 ; j < seIdxCnt - 1 ; ++j ) + { + int k = subexonIdx[j] ; + if ( alignments.readLen - ( subexons[k].end - subexons[k].start + 1 ) - 1 <= 30 ) + continue ; + // We need at least one of the side subexons are adjacent to the center one. 
+ if ( subexons[ subexonIdx[j - 1] ].end + 1 < subexons[k].start && subexons[k].end + 1 < subexons[ subexonIdx[j + 1] ].start ) + continue ; + + int l = 0 ; + for ( l = 0 ; l < tcCnt ; ++l ) + { + if ( btable[i].Test(l) && tc[l].abundance > 0 ) + { + if ( !usedConstraints[ tc[l].i ] ) + { + struct _constraint &c = constraints.constraints[ tc[l].i ] ; + if ( c.vector.Test( subexonIdx[j - 1] ) && c.vector.Test( subexonIdx[j] ) && + c.vector.Test( subexonIdx[j + 1] ) ) + break ; + usedConstraints[ tc[l].i ] = true ; + } + + if ( !usedConstraints[ tc[l].j ] ) + { + struct _constraint &c = constraints.constraints[ tc[l].j ] ; + if ( c.vector.Test( subexonIdx[j - 1] ) && c.vector.Test( subexonIdx[j] ) && + c.vector.Test( subexonIdx[j + 1] ) ) + break ; + usedConstraints[ tc[l].j ] = true ; + } + } + } + // It is not covered + if ( l >= tcCnt ) + { + int residual = alignments.readLen - ( subexons[k].end - subexons[k].start + 1 ) - 1 ; + //printf( "residual: %d %d %lf\n", k, residual, value ) ; + if ( value * residual > 2 ) + { + value = 1 / (double)residual ; + } + } + }*/ + + if ( tag == -1 ) + value = 0 ; + if ( value > maxAbundance ) + maxAbundance = value ; + transcriptAbundance[i] = value ; + if ( tag != -1 ) + avgTranscriptAbundance[i] /= compatibleCnt ; + + //printf( "abundance %d: %lf %lf ", i, value, avgTranscriptAbundance[i] ) ; + //alltranscripts[i].seVector.Print() ; + } + if ( maxAbundance == 0 ) + { + for ( i = 0 ; i < atcnt ; ++i ) + { + transcriptAbundance[i] = coveredPortion[i] ; + } + maxAbundance = 1 ; + } + //printf( "%s: %lf\n", __func__, maxAbundance ) ; + int size = togetherChain.size() ; + for ( j = 0 ; j < size ; ++j ) + togetherChain[j].Release() ; + delete[] usedConstraints ; + delete[] covered ; + delete[] chain ; + } + else + { + for ( i = 0 ; i < atcnt ; ++i ) + { + transcriptAbundance[i] = alltranscripts[i].abundance ; + if ( transcriptAbundance[i] > maxAbundance ) + maxAbundance = transcriptAbundance[i] ; + coveredPortion[i] = 1 ; + } + if 
( maxAbundance == 0 ) + maxAbundance = 1 ; + } + + // Obtain the prefix, suffix information of the transcripts. + int *nextSuffix, *nextPrefix ; + struct _pair32 *txptRank ; + nextSuffix = new int[atcnt] ; + nextPrefix = new int[atcnt] ; + txptRank = new struct _pair32[atcnt] ; + memset( nextSuffix, -1, sizeof( int ) * atcnt ) ; + memset( nextPrefix, -1, sizeof( int ) * atcnt ) ; + /*for ( i = 0 ; i < atcnt ; ++i ) + { + std::vector<int> subexonIdx ; + txptRank[i].a = i ; + alltranscripts[i].seVector.GetOnesIndices( subexonIdx ) ; + txptRank[i].b = subexonIdx.size() ; + } + qsort( txptRank, atcnt, sizeof( struct _pair32 ), CompPairsByB) ; + BitTable bufferTable( seCnt ) ; + for ( i = atcnt - 1 ; i >= 0 ; --i ) + { + int a = txptRank[i].a ; + for ( j = i - 1 ; j >= 0 ; --j ) + { + if ( txptRank[i].b == txptRank[j].b ) + continue ; + + int b = txptRank[j].a ; + + if ( alltranscripts[b].last != alltranscripts[a].last ) + continue ; + + bufferTable.Assign( alltranscripts[a].seVector ) ; + bufferTable.MaskRegionOutside( alltranscripts[b].first, alltranscripts[b].last ) ; + if ( bufferTable.IsEqual( alltranscripts[b].seVector ) ) + { + nextSuffix[a] = b ; + break ; + } + } + } + for ( i = atcnt - 1 ; i >= 0 ; --i ) + { + int a = txptRank[i].a ; + for ( j = i - 1 ; j >= 0 ; --j ) + { + if ( txptRank[i].b == txptRank[j].b ) + continue ; + + int b = txptRank[j].a ; + + if ( alltranscripts[b].first != alltranscripts[a].first ) + continue ; + + bufferTable.Assign( alltranscripts[a].seVector ) ; + bufferTable.MaskRegionOutside( alltranscripts[b].first, alltranscripts[b].last ) ; + if ( bufferTable.IsEqual( alltranscripts[b].seVector ) ) + { + nextPrefix[a] = b ; + break ; + } + } + } + + bufferTable.Release() ;*/ + delete[] txptRank ; + + // Quantative Set-Cover + int iterCnt = -1 ; + double *coverCnt = new double[atcnt] ; + for ( i = 0 ; i < atcnt ; ++i ) + coverCnt[i] = -1 ; + int *list = new int[atcnt] ; + int listCnt ; + + while ( 1 ) + { + double max = -1 ; + int maxtag = 
-1 ; + double maxcnt = -1 ; + ++iterCnt ; + + // Find the optimal candidate. + for ( i = 0 ; i < atcnt ; ++i ) + { + double value = inf ; + double cnt = 0 ; + + if ( coverCnt[i] == -1 ) + { + for ( j = 0 ; j < tcCnt ; ++j ) + { + if ( tc[j].abundance > 0 && btable[i].Test( j ) ) + { + cnt += tc[j].effectiveCount ; + } + } + /*else + { + std::vector<int> tcIdx ; + btable[i].GetOnesIndices( tcIdx ) ; + int size = tcIdx.size() ; + for ( j = 0 ; j < size ; ++j ) + { + if ( tc[ tcIdx[j] ].abundance > 0 ) + { + cnt += tc[ tcIdx[j] ].effectiveCount ; + } + } + }*/ + coverCnt[i] = cnt ; + } + else + { + cnt = coverCnt[i] ; + } + + value = transcriptAbundance[i] ; + if ( cnt < 1 ) // This transcript does not satisfy any undepleted constraints. + continue ; + cnt *= coveredPortion[i] ; + + double weight = 1 ; //* seCnt / transcriptSeCnt[i] ; + //if ( maxAbundance >= 1 && value / maxAbundance >= 0.2 ) + // seCntAdjust = sqrt( (double)( transcriptSeCnt[i] ) / seCnt ) ;//< 0.5 ? 0.5 : (double)( transcriptSeCnt[i] ) / seCnt ; + + if ( alltranscripts[i].FPKM > 0 && sampleCnt > 1 ) + weight = ( 1 + alltranscripts[i].FPKM / sampleCnt ) ; + + double score = ComputeScore( cnt, weight, value, maxAbundance, alltranscripts[i].correlationScore ) ; + if ( cnt > maxcnt ) + maxcnt = cnt ; + score += adjustScore[i] ; + if ( score > max ) + { + max = score ; + maxtag = i ; + } + else if ( score == max ) + { + if ( avgTranscriptAbundance[maxtag] < avgTranscriptAbundance[i] ) + { + max = score ; + maxtag = i ; + } + } + //printf( "score: %d %lf -> %lf\n", i, cnt, score ) ; + } + + if ( maxcnt == 0 || maxtag == -1 ) + break ; + + // Find the constraint that should be depleted. + double update = inf ; + int updateTag = 0 ; + for ( j = 0 ; j < tcCnt ; ++j ) + { + if ( btable[ maxtag ].Test( j ) && tc[j].abundance > 0 && + tc[j].abundance <= update ) + { + update = tc[j].abundance ; + updateTag = j ; + } + } + + // Search suffix and prefix to see whether these fit better. 
+ int p = nextSuffix[ maxtag] ; + while ( p != -1 ) + { + if ( transcriptAbundance[p] >= 10.0 * transcriptAbundance[maxtag] + && btable[p].Test( updateTag ) ) + { + //printf( "%d\n", p ) ; + maxtag = p ; + break ; + } + p = nextSuffix[p] ; + } + p = nextPrefix[maxtag] ; + while ( p != -1 ) + { + if ( transcriptAbundance[p] >= 10.0 * transcriptAbundance[maxtag] + && btable[p].Test( updateTag ) ) + { + maxtag = p ; + break ; + } + p = nextPrefix[p] ; + } + + + // Update the abundance. + int supportCnt = 0 ; + for ( j = 0 ; j < tcCnt ; ++j ) + { + if ( btable[maxtag].Test( j ) ) + { + if ( tc[j].abundance > 0 ) + { + tc[j].abundance -= 1 * update ; + double factor = tc[j].effectiveCount ; + double tmp = ( tc[j].support * update / tc[j].normAbund * factor ) ; + alltranscripts[maxtag].constraintsSupport[j] += tmp ; + alltranscripts[maxtag].abundance += tmp ; + + if ( tc[j].abundance <= 0 ) + { + int l ; + for ( l = 0 ; l < atcnt ; ++l ) + { + if ( btable[l].Test(j) ) + coverCnt[l] -= tc[j].effectiveCount ; + } + } + ++supportCnt ; + } + else if ( alltranscripts[maxtag].constraintsSupport[j] == 0 ) + { + double sum = 0 ; + double takeOut = 0 ; + double factor = tc[j].effectiveCount ; + listCnt = 0 ; + for ( i = 0 ; i < atcnt ; ++i ) + { + if ( i == maxtag ) + continue ; + + if ( alltranscripts[i].abundance > 0 && btable[i].Test(j) ) + { + sum += alltranscripts[i].constraintsSupport[j] ; + + double tmp = ( alltranscripts[i].constraintsSupport[j] + alltranscripts[maxtag].constraintsSupport[j] ) * + transcriptAbundance[maxtag] / ( transcriptAbundance[maxtag] + transcriptAbundance[i] ) + - alltranscripts[maxtag].constraintsSupport[j] ; + if ( tmp > 0 ) + { + list[ listCnt ] = i ; + ++listCnt ; + takeOut += tmp ; //alltranscripts[i].constraintsSupport[j] * transcriptAbundance[maxtag] / ( transcriptAbundance[maxtag] + transcriptAbundance[i] ) ; + } + } + } + + double ratio = 1 ; + double takeOutFactor = 0.5 ; + if ( update < tc[j].normAbund ) + { + if ( takeOut > ( 
tc[j].support * update / tc[j].normAbund * factor ) * takeOutFactor ) + ratio = ( tc[j].support * update / tc[j].normAbund * factor ) * takeOutFactor / takeOut ; + } + else + { + if ( takeOut > ( tc[j].support * factor ) * takeOutFactor ) + ratio = ( tc[j].support * factor ) * takeOutFactor / takeOut ; + } + + if ( 1 ) //update < tc[j].normAbund ) + { + for ( i = 0 ; i < listCnt ; ++i ) + { + //double tmp = ( tc[j].support * update / tc[j].normAbund * factor ) * + // ( alltranscripts[ list[i] ].constraintsSupport[j] / sum ) ; + //if ( alltranscripts[ list[i] ].constraintsSupport[j] < tmp ) + // printf( "WARNING! %lf %lf, %lf\n", alltranscripts[ list[i] ].constraintsSupport[j], sum, tmp ) ; + + //double tmp = alltranscripts[ list[i] ].constraintsSupport[j] * transcriptAbundance[maxtag] / ( transcriptAbundance[maxtag] + transcriptAbundance[ list[i] ] ) * ratio ; + double tmp = ( ( alltranscripts[ list[i] ].constraintsSupport[j] + alltranscripts[maxtag].constraintsSupport[j] ) * + transcriptAbundance[maxtag] / ( transcriptAbundance[maxtag] + transcriptAbundance[ list[i] ] ) + - alltranscripts[maxtag].constraintsSupport[j] ) * ratio ; + + + alltranscripts[ list[i] ].constraintsSupport[j] -= tmp ; + alltranscripts[ list[i] ].abundance -= tmp ; + } + //double tmp = ( tc[j].support * update / tc[j].normAbund * factor ) ; + //printf( "%lf %lf. 
%lf %lf\n", takeOut, ratio, update, tc[j].normAbund ) ; + double tmp = takeOut * ratio ; + alltranscripts[maxtag].constraintsSupport[j] += tmp ; + alltranscripts[maxtag].abundance += tmp ; + } + /*else + { + double tmp = ( tc[j].support / (double)( listCnt + 1 ) ) * factor ; + for ( i = 0 ; i < listCnt ; ++i ) + { + alltranscripts[ list[i] ].abundance -= alltranscripts[ list[i] ].constraintsSupport[j] ; + + alltranscripts[ list[i] ].constraintsSupport[j] = tmp ; + alltranscripts[ list[i] ].abundance += tmp ; + } + alltranscripts[maxtag].constraintsSupport[j] += tmp ; + alltranscripts[maxtag].abundance += tmp ; + }*/ + + } + } + + if ( tc[j].abundance < 0 ) + { + tc[j].abundance = 0 ; + + } + } + tc[ updateTag ].abundance = 0 ; + if ( supportCnt == 0 ) + break ; + //adjustScore[maxtag] += 1 / (double)tcCnt ; + //printf( "maxtag=%d %lf %d\n", maxtag, update, updateTag ) ; + } + + for ( i = 0 ; i < atcnt ; ++i ) + { + if ( alltranscripts[i].abundance > 0 ) + { + struct _transcript nt = alltranscripts[i] ; + nt.seVector.Nullify() ; + nt.seVector.Duplicate( alltranscripts[i].seVector ) ; + nt.constraintsSupport = NULL ; + if ( transcriptAbundance[i] == 0 ) + nt.correlationScore = -1 ; + else + nt.correlationScore = 0 ; + nt.id = i ; + transcripts.push_back( nt ) ; + } + } + + // Release the memory of btable. + for ( i = 0 ; i < atcnt ; ++i ) + { + delete[] alltranscripts[i].constraintsSupport ; + btable[i].Release() ; + } + delete[] btable ; + + delete[] list ; + delete[] transcriptSeCnt ; + delete[] transcriptLength ; + delete[] transcriptAbundance ; + delete[] avgTranscriptAbundance ; + delete[] coveredPortion ; + delete[] adjustScore ; + delete[] coverCnt ; + + delete[] nextPrefix ; + delete[] nextSuffix ; + + // Redistribute weight if there is some constraints that are unbalanced. 
+ /*tcnt = transcripts.size() ; + for ( i = 0 ; i < tcnt ; ++i ) + { + int maxRatio = -1 ; + for ( j = 0 ; j < tcCnt ; ++j ) + if ( transcripts[i].constraintsSupport[j] > 0 ) + { + double factor = tc[j].effectiveCount ; + if ( transcripts[]) + } + }*/ +} + +void TranscriptDecider::AbundanceEstimation( struct _subexon *subexons, int seCnt, Constraints &constraints, std::vector<struct _transcript> &transcripts ) +{ + int tcnt = transcripts.size() ; + int size ; + int i, j ; + if ( tcnt <= 0 ) + return ; + + std::vector<struct _matePairConstraint> &tc = constraints.matePairs ; + int tcCnt = tc.size() ; // transcript constraints + + BitTable *btable = new BitTable[ tcnt ] ; + int *transcriptLength = new int[tcnt] ; + int *compatibleList = new int[tcnt] ; + double *rho = new double[tcnt] ; // the abundance. + int iterCnt = 0 ; + + for ( i = 0 ; i < tcnt ; ++i ) + transcripts[i].constraintsSupport = new double[ tcCnt ] ; + + for ( i = 0 ; i < tcnt ; ++i ) + { + btable[i].Init( tcCnt ) ; + double min = -1 ; + for ( j = 0 ; j < tcCnt ; ++j ) + { + int a = tc[j].i ; + int b = tc[j].j ; + + if ( IsConstraintInTranscript( transcripts[i], constraints.constraints[a] ) == 1 + && IsConstraintInTranscript( transcripts[i], constraints.constraints[b] ) == 1 ) + { + //printf( "set btble[ %d ].Set( %d ): %d %d\n", i, j, a, b ) ; + btable[i].Set( j ) ; + + if ( min == -1 || tc[j].normAbund < min ) + min = tc[j].normAbund ; + } + } + + std::vector<int> subexonIdx ; + transcripts[i].seVector.GetOnesIndices( subexonIdx ) ; + int subexonIdxCnt = subexonIdx.size() ; + int len = 0 ; + for ( j = 0 ; j < subexonIdxCnt ; ++j ) + len += subexons[ subexonIdx[j] ].end - subexons[ subexonIdx[j] ].start + 1 ; + transcriptLength[i] = len - alignments.fragLen + 2 * alignments.fragStdev ; + if ( transcriptLength[i] < 1 ) + transcriptLength[i] = 1 ; + rho[i] = transcripts[i].abundance / transcriptLength[i] ; // use the rough estimation generated before. 
+ if ( transcripts[i].correlationScore == -1 && rho[i] > 0.1 / (double)alignments.readLen ) + rho[i] = 0.1 / (double)alignments.readLen ; + } + + while ( 1 ) + { + for ( i = 0 ; i < tcnt ; ++i ) + for ( j = 0 ; j < tcCnt ; ++j ) + { + transcripts[i].constraintsSupport[j] = 0 ; + } + for ( j = 0 ; j < tcCnt ; ++j ) + { + int clCnt = 0 ; + double sum = 0 ; + for ( i = 0 ; i < tcnt ; ++i ) + { + if ( btable[i].Test(j) ) + { + compatibleList[ clCnt ] = i ; + ++clCnt ; + sum += rho[i] ; + } + } + + for ( i = 0 ; i < clCnt ; ++i ) + { + double factor = tc[j].effectiveCount ; + transcripts[ compatibleList[i] ].constraintsSupport[j] = ( rho[ compatibleList[i] ] / sum ) * tc[j].support * factor ; + } + } + + double diff = 0 ; + for ( i = 0 ; i < tcnt ; ++i ) + { + double newAbund = 0 ; + for ( j = 0 ; j < tcCnt ; ++j ) + newAbund += transcripts[i].constraintsSupport[j] ; + double old = rho[i] ; + rho[i] = newAbund / transcriptLength[i] ; + //printf( "rho[%d]=%lf\n", i, rho[i] ) ; + if ( transcripts[i].correlationScore == -1 && rho[i] > 0.1 / (double)alignments.readLen ) + rho[i] = 0.1 / (double)alignments.readLen ; + + double tmp = ( old - rho[i] ) ; + diff += tmp < 0 ? -tmp : tmp ; + } + //printf( "%lf\n", diff ) ; + if ( diff < 1e-3) + break ; + + ++iterCnt ; + if ( iterCnt >= 1000 ) + break ; + } + + for ( i = 0 ; i < tcnt ; ++i ) + { + //printf( "%lf=>", transcripts[i].abundance ) ; + transcripts[i].abundance = 0 ; + for ( j = 0 ; j < tcCnt ; ++j ) + { + transcripts[i].abundance += transcripts[i].constraintsSupport[j] ; + } + //printf( "%lf. (%lf)\n", transcripts[i].abundance, transcripts[i].correlationScore ) ; + //transcripts[i].seVector.Print() ; + } + + for ( i = 0 ; i < tcnt ; ++i ) + delete[] transcripts[i].constraintsSupport ; + + // Release the memory of btable. 
+ for ( i = 0 ; i < tcnt ; ++i ) + { + btable[i].Release() ; + } + delete[] compatibleList ; + delete[] btable ; + delete[] transcriptLength ; + delete[] rho ; +} + +int TranscriptDecider::RefineTranscripts( struct _subexon *subexons, int seCnt, bool aggressive, + std::map<int, int> *subexonChainSupport, int *txptSampleSupport, std::vector<struct _transcript> &transcripts, Constraints &constraints ) +{ + int i, j, k ; + int tcnt = transcripts.size() ; + if ( tcnt == 0 ) + return 0 ; + int tcCnt = constraints.matePairs.size() ; + + std::vector<struct _matePairConstraint> &tc = constraints.matePairs ; + std::vector<struct _constraint> &scc = constraints.constraints ; //single-end constraints.constraints + + // Remove transcripts whose FPKM are too small. + //printf( "%d %d\n", usedGeneId, baseGeneId ) ; + double *geneMaxFPKM = new double[usedGeneId - baseGeneId ] ; + int *geneMaxFPKMTag = new int[usedGeneId - baseGeneId ] ; + double *nonOverlapMaxFPKM = new double[ usedGeneId - baseGeneId ] ; // the max FPKM among all the transcripts not overlapping with maxFPKMTag transcripts. + memset( geneMaxFPKM, 0, sizeof( double ) * ( usedGeneId - baseGeneId ) ) ; + memset( geneMaxFPKMTag, 0, sizeof( int ) * ( usedGeneId - baseGeneId ) ) ; + memset( nonOverlapMaxFPKM, 0, sizeof( double ) * ( usedGeneId - baseGeneId ) ) ; + + double *geneMaxCov = new double[ usedGeneId - baseGeneId ] ; + memset( geneMaxCov, 0, sizeof( double ) * ( usedGeneId - baseGeneId ) ) ; + int *txptGid = new int[tcnt] ; + + /*for ( i = 0 ; i < tcnt ; ++i ) + { + printf( "%d: %lf ", i, transcripts[i].FPKM ) ; + transcripts[i].seVector.Print() ; + }*/ + + /*================================================================== + Remove transcripts that has too few relative FPKM. 
(-f) + ====================================================================*/ + for ( i = 0 ; i < tcnt ; ++i ) + { + int gid = GetTranscriptGeneId( transcripts[i], subexons ) ; + int len = GetTranscriptLengthFromAbundanceAndFPKM( transcripts[i].abundance, transcripts[i].FPKM ) ; + //printf( "gid=%d\n", gid ) ; + //printf( "%lf %lf %d\n", transcripts[i].abundance, transcripts[i].FPKM, len ) ; + if ( transcripts[i].FPKM > geneMaxFPKM[gid - baseGeneId ] ) + { + geneMaxFPKM[ gid - baseGeneId ] = transcripts[i].FPKM ; + geneMaxFPKMTag[ gid - baseGeneId ] = i ; + } + if ( transcripts[i].abundance * alignments.readLen / len > geneMaxCov[gid - baseGeneId ] ) + geneMaxCov[gid - baseGeneId] = ( transcripts[i].abundance * alignments.readLen ) / len ; + txptGid[i] = gid ; + } + + for ( i = 0 ; i < tcnt ; ++i ) + { + int tag = txptGid[i] - baseGeneId ; + if ( ( transcripts[i].last < transcripts[ geneMaxFPKMTag[ tag ] ].first + || transcripts[i].first > transcripts[ geneMaxFPKMTag[tag] ].last ) && transcripts[i].FPKM > nonOverlapMaxFPKM[tag] ) + nonOverlapMaxFPKM[tag] = transcripts[i].FPKM ; + } + BitTable bufferTable ; + bufferTable.Duplicate( transcripts[0].seVector ) ; + + if ( !aggressive ) + { + // Rescue the transcripts covering unique constraints. 
+ int cnt = 0 ; + int tag = 0 ; + int *uniqCount = new int[tcnt] ; + memset( uniqCount, 0, sizeof( int ) * tcnt ) ; + for ( j = 0 ; j < tcCnt ; ++j ) + { + cnt = 0 ; + if ( tc[j].uniqSupport <= 5 ) + continue ; + for ( i = 0 ; i < tcnt ; ++i ) + { + if ( IsConstraintInTranscript( transcripts[i], scc[ tc[j].i ] ) && + IsConstraintInTranscript( transcripts[i], scc[ tc[j].j] ) ) + { + tag = i ; + ++cnt ; + } + if ( cnt >= 2 ) + break ; + } + if ( cnt == 1 ) + { + ++uniqCount[tag] ; + } + } + for ( i = 0 ; i < tcnt ; ++i ) + { + if ( uniqCount[i] >= 2 ) + { + transcripts[i].abundance *= 4 ; + transcripts[i].FPKM *= 4 ; + } + } + + delete[] uniqCount ; + } + + int sccCnt = scc.size() ; + double filterFactor = 1.0 ; + + for ( i = 0 ; i < tcnt ; ++i ) + { + //printf( "%d: %lf %lf\n", txptGid[i], transcripts[i].abundance, geneMaxFPKM[ txptGid[i] - baseGeneId ] ) ; + + if ( transcripts[i].FPKM < filterFactor * FPKMFraction * geneMaxFPKM[ txptGid[i] - baseGeneId ] ) + { + /*int cnt = 0 ; + int coverCnt = 0 ; + for ( j = 0 ; j < tcCnt ; ++j ) + { + if ( transcripts[i].constraintsSupport[j] > 0 ) + ++coverCnt ; + double factor = tc[j].effectiveCount ; + if ( transcripts[i].constraintsSupport[j] >= factor * tc[j].support - 1e-3 + && tc[j].support >= 10 + && tc[j].uniqSupport >= 0.95 * tc[j].support ) + { + ++cnt ; + } + } + //cnt = 0 ; + if ( cnt >= 2 ) + { + ; + } + else*/ + transcripts[i].abundance = -transcripts[i].abundance ; + } + //if ( transcripts[i].FPKM >= 0.8 * geneMaxFPKM[ txptGid[i] - baseGeneId ] && geneMaxCov[ txptGid[i] - baseGeneId ] >= txptMinReadDepth ) + // continue ; + } + + if ( nonOverlapMaxFPKM != 0 ) + { + // Go two iterations to rescue, the first iteration should be just for marking. 
+ std::vector<int> rescueList ; + for ( i = 0 ; i < tcnt ; ++i ) + { + if ( transcripts[i].abundance >= 0 ) + continue ; + + for ( j = 0 ; j < tcnt ; ++j ) + { + if ( transcripts[j].abundance < 0 || txptGid[i] != txptGid[j] ) + continue ; + if ( transcripts[i].first <= transcripts[j].last && transcripts[i].last >= transcripts[j].first ) + /*bufferTable.Assign( transcripts[i].seVector ) ; + bufferTable.And( transcripts[j].seVector ) ; + + if ( !bufferTable.IsAllZero() )*/ + break ; + } + if ( j >= tcnt && transcripts[i].FPKM >= FPKMFraction * nonOverlapMaxFPKM[ txptGid[i] - baseGeneId ] ) + { + //transcripts[i].abundance = -transcripts[i].abundance ; + rescueList.push_back( i ) ; + } + } + + int size = rescueList.size() ; + for ( i = 0 ; i < size ; ++i ) + transcripts[ rescueList[i] ].abundance *= -1 ; + } + + /*================================================================== + Remove transcripts that has too few read coverage (-d) + ====================================================================*/ + for ( i = 0 ; i < tcnt ; ++i ) + { + if ( transcripts[i].abundance >= 0 ) + { + int len = GetTranscriptLengthFromAbundanceAndFPKM( transcripts[i].abundance, transcripts[i].FPKM ) ; + double cov = ( transcripts[i].abundance * alignments.readLen ) / len ; + //printf( "%d: %d %d %lf %lf\n", i, len, transcripts[i].seVector.Count(), cov, geneMaxCov[ txptGid[i] - baseGeneId ] ) ; + + if ( ( tcnt > 1 || len <= 1000 || transcripts[i].seVector.Count() <= 3 ) && cov < txptMinReadDepth ) + { + //if ( usedGeneId == baseGeneId + 1 && /*transcripts[i].seVector.Count() > 3 + // && len > 1000 &&*/ geneMaxCov[ txptGid[i] - baseGeneId ] == cov ) + if ( geneMaxCov[ txptGid[i] - baseGeneId ] == cov ) + continue ; + + // Test whether it has some very abundant constraints. 
+ /*int cnt = 0 ; + for ( j = 0 ; j < tcCnt ; ++j ) + { + if ( transcripts[i].constraintsSupport[j] >= tc[j].support / 2.0 + && tc[j].support >= 10 + && tc[j].uniqSupport >= 0.95 * tc[j].support + && tc[j].normAbund >= 1 ) + { + ++cnt ; + } + } + + if ( cnt >= 1 ) + { + continue ; + }*/ + + // Test whether this transcript is fully covered. If so ,we can filter it. + + if ( geneMaxCov[ txptGid[i] - baseGeneId ] <= 5 ) + { + bufferTable.Reset() ; + for ( j = 0 ; j < sccCnt ; ++j ) + { + if ( !IsConstraintInTranscript( transcripts[i], scc[j] ) ) + continue ; + bufferTable.Or( scc[j].vector ) ; + } + if ( bufferTable.IsEqual( transcripts[i].seVector ) ) + transcripts[i].abundance = -transcripts[i].abundance ; + } + else + transcripts[i].abundance = -transcripts[i].abundance ; + + /*else + { + transcripts[i].seVector.Print() ; + bufferTable.Print() ; + OutputTranscript( stderr, subexons, transcripts[i] ) ; + }*/ + } + } + } + + /*================================================================== + Remove transcripts that is too short + ====================================================================*/ + for ( i = 0 ; i < tcnt ; ++i ) + { + if ( transcripts[i].abundance <= 0 ) + continue ; + + int len = GetTranscriptLengthFromAbundanceAndFPKM( transcripts[i].abundance, transcripts[i].FPKM ) ; + if ( len < 200 ) + { + transcripts[i].abundance = -transcripts[i].abundance ; + } + } + + // Rescue transcripts that showed up in many samples. + /*for ( i = 0 ; i < tcnt ; ++i ) + { + if ( transcripts[i].abundance > 0 ) + continue ; + if ( txptSampleSupport[ transcripts[i].id ] >= 3 && + txptSampleSupport[transcripts[i].id ] >= (int)( sampleCnt / 2 ) ) + transcripts[i].abundance = -transcripts[i].abundance ; + }*/ + + // Rescue some transcripts covering subexon chains showed up in many samples, but missing after filtration. 
+ struct _constraint tmpC ; + tmpC.vector.Init( seCnt ) ; + + std::vector< struct _pair32 > missingChain ; + std::vector<int> recoverCandidate ; + bool *used = new bool[tcnt] ; + memset( used, false, sizeof( bool ) * tcnt ) ; + + // Obtain the list of transcripts that should be recovered. + for ( i = 0 ; i < seCnt && sampleCnt > 1 ; ++i ) + { + + double maxFPKM = -1 ; + for ( std::map<int, int>::iterator it = subexonChainSupport[i].begin() ; + it != subexonChainSupport[i].end() ; ++it ) + { + if ( sampleCnt >= 0 && ( it->second < 3 || it->second < (int)( 0.5 * sampleCnt ) ) && it->second <= sampleCnt / 2 ) + continue ; + + bool recover = true ; + tmpC.vector.Reset() ; + tmpC.vector.Set( i ) ; + tmpC.vector.Set( it->first ) ; + tmpC.first = i ; + tmpC.last = it->first ; + + + for ( j = 0 ; j < tcnt ; ++j ) + { + if ( transcripts[j].abundance < 0 ) + continue ; + + if ( IsConstraintInTranscript( transcripts[j], tmpC ) ) + { + recover = false ; + break ; + } + + if ( recover ) + { + for ( j = 0 ; j < tcnt ; ++j ) + { + if ( transcripts[j].abundance > 0 ) + continue ; + //printf( "%d %lf\n", IsConstraintInTranscript( transcripts[j], tmpC ), transcripts[j].FPKM ) ; + if ( IsConstraintInTranscript( transcripts[j], tmpC ) ) + { + /*if ( maxTag == -1 ) + maxTag = j ; + else + { + if ( txptSampleSupport[ transcripts[j].id ] > txptSampleSupport[ transcripts[maxTag ].id ] ) + maxTag = j ; + else if ( txptSampleSupport[ transcripts[j].id ] == txptSampleSupport[ transcripts[maxTag ].id ]) + { + if ( transcripts[j].FPKM > transcripts[maxTag].FPKM ) + maxTag = j ; + } + }*/ + + struct _pair32 np ; + np.a = i ; np.b = it->first ; + missingChain.push_back( np ) ; + + if ( !used[j] ) + { + recoverCandidate.push_back( j ) ; + used[j] = true ; + } + } + } + + /*if ( maxTag != -1 && txptSampleSupport[ transcripts[maxTag].id ] > 1 ) + { + //printf( "recover %d %d\n", maxTag, txptSampleSupport[ transcripts[maxTag].id ] ) ; + transcripts[maxTag].abundance *= -1 ; + }*/ + } + } + + } + } + 
+ int size = recoverCandidate.size() ; + memset( used, false, sizeof( bool ) * tcnt ) ; + // Recover the candidates in the order of reliability + int *geneRecoverCnt = new int[ usedGeneId - baseGeneId ] ; + memset( geneRecoverCnt, 0, sizeof( int ) * ( usedGeneId - baseGeneId ) ) ; + int round = 1 ; + if ( aggressive && size > 1 ) + round = 1 ; + + for ( i = 0 ; i < size ; ++i ) + { + int maxTag = -1 ; + int maxCover = -1 ; + for ( j = 0 ; j < size ; ++j ) + { + if ( !used[ recoverCandidate[j] ] ) + { + /*int cover = 0 ; + + k = missingChain.size() ; + int l ; + for ( l = 0 ; l < k ; ++l ) + { + if ( missingChain[l].a == -1 ) + continue ; + + tmpC.vector.Reset() ; + tmpC.vector.Set( missingChain[l].a ) ; + tmpC.vector.Set( missingChain[l].b ) ; + tmpC.first = missingChain[l].a ; + tmpC.last = missingChain[l].b ; + + if ( IsConstraintInTranscript( transcripts[ recoverCandidate[j] ], tmpC ) ) + { + ++cover ; + } + }*/ + + if ( maxTag == -1 ) + { + maxTag = recoverCandidate[j] ; + //maxCover = cover ; + continue ; + } + + /*if ( cover > maxCover ) + { + maxTag = recoverCandidate[j] ; + maxCover = cover ; + } + else if ( cover == maxCover ) + {*/ + if ( txptSampleSupport[ transcripts[ recoverCandidate[j] ].id ] > + txptSampleSupport[ + transcripts[ maxTag ].id + ] ) + maxTag = recoverCandidate[j] ; + else if ( txptSampleSupport[ transcripts[ recoverCandidate[j] ].id ] == + txptSampleSupport[ transcripts[ maxTag ].id ] ) + { + if ( transcripts[ recoverCandidate[j] ].FPKM > transcripts[ maxTag ].FPKM ) + maxTag = recoverCandidate[j] ; + } + + /*else if ( transcripts[ recoverCandidate[j] ].FPKM > transcripts[ maxTag ].FPKM ) + maxTag = recoverCandidate[j] ; + else if ( transcripts[ recoverCandidate[j] ].FPKM == transcripts[ maxTag ].FPKM ) + { + if ( txptSampleSupport[ transcripts[ recoverCandidate[j] ].id ] > + txptSampleSupport[ transcripts[ maxTag ].id ] ) + maxTag = recoverCandidate[j] ; + }*/ + //} + } + } + + if ( maxTag == -1 || txptSampleSupport[ transcripts[ 
maxTag ].id ] <= 2 + || txptSampleSupport[ transcripts[maxTag].id ] < 0.5 * sampleCnt ) + break ; + + used[maxTag] = true ; + if ( geneRecoverCnt[ txptGid[maxTag] - baseGeneId ] >= round ) + continue ; + ++geneRecoverCnt[ txptGid[maxTag] - baseGeneId ] ; + + k = missingChain.size() ; + int cnt = 0 ; + for ( j = 0 ; j < k ; ++j ) + { + if ( missingChain[j].a == -1 ) + continue ; + + tmpC.vector.Reset() ; + tmpC.vector.Set( missingChain[j].a ) ; + tmpC.vector.Set( missingChain[j].b ) ; + tmpC.first = missingChain[j].a ; + tmpC.last = missingChain[j].b ; + + if ( IsConstraintInTranscript( transcripts[maxTag], tmpC ) ) + { + missingChain[j].a = -1 ; + ++cnt ; + } + } + + int len = GetTranscriptLengthFromAbundanceAndFPKM( transcripts[maxTag].abundance, transcripts[maxTag].FPKM ) ; + double cov = ( transcripts[maxTag].abundance * alignments.readLen ) / len ; + if ( cnt >= 1 && cov > 1.0 ) + { + transcripts[maxTag].abundance *= -1 ; + } + } + delete[] used ; + delete[] geneRecoverCnt ; + tmpC.vector.Release() ; + + + tcnt = RemoveNegativeAbundTranscripts( transcripts ) ; + + + + delete []geneMaxCov ; + bufferTable.Release() ; + delete []geneMaxFPKM ; + delete []geneMaxFPKMTag ; + delete []nonOverlapMaxFPKM ; + delete []txptGid ; + + /*================================================================== + Remove transcripts that seems duplicated + ====================================================================*/ + for ( i = 0 ; i < tcnt ; ++i ) + { + int support = 0 ; + int uniqSupport = 0 ; + + for ( j = 0 ; j < tcCnt ; ++j ) + { + if ( !IsConstraintInTranscript( transcripts[i], scc[ tc[j].i ] ) || !IsConstraintInTranscript( transcripts[i], scc[ tc[j].j ] ) ) + continue ; + //support += scc[ tc[j].i ].support + scc[ tc[j].j ].support ; + //uniqSupport += scc[ tc[j].i ].uniqSupport + scc[ tc[j].j ].uniqSupport ; + support += tc[j].support ; + uniqSupport += tc[j].uniqSupport ; + + //printf( "constraint uniqness: %d: %d %d\n", i, tc[j].uniqSupport, tc[j].support ) ; + } 
+ //printf( "%d: %d %d\n", i, uniqSupport, support ) ; + if ( (double)uniqSupport < 0.03 * support ) + transcripts[i].abundance = -1 ; + } + tcnt = RemoveNegativeAbundTranscripts( transcripts ) ; + + + /*================================================================== + Remove shadow transcripts, the abnormal 2-exon txpt whose intron is very close to the true one or one of the anchor exon is shorter than 25bp.... + ====================================================================*/ + int minusCnt = 0, plusCnt = 0 ; + int mainStrand ; + for ( i = 0 ; i < seCnt ; ++i ) + { + if ( subexons[i].rightStrand == 1 ) + ++plusCnt ; + else if ( subexons[i].rightStrand == -1 ) + ++minusCnt ; + } + if ( plusCnt > minusCnt ) + mainStrand = 1 ; + else + mainStrand = -1 ; + + for ( i = 0 ; i < tcnt ; ++i ) + { + std::vector<int> subexonIdx ; + transcripts[i].seVector.GetOnesIndices( subexonIdx ) ; + int size = subexonIdx.size() ; + int intronCnt = 0 ; + int anchorIdx = 0 ; // the subexon adjacent to the only intron. 
+ + for ( j = 0 ; j < size - 1 ; ++j ) + { + if ( subexons[ subexonIdx[j] ].end + 1 < subexons[ subexonIdx[j + 1] ].start ) + { + ++intronCnt ; + anchorIdx = j ; + } + } + if ( intronCnt != 1 ) + continue ; + + int anchorExonLength[2] = {0, 0}; + int tag = 0 ; + for ( j = 0 ; j < size ; ++j ) + { + anchorExonLength[tag] += subexons[ subexonIdx[j] ].end - subexons[ subexonIdx[j] ].start + 1 ; + if ( tag == 0 && subexons[ subexonIdx[j] ].end + 1 < subexons[ subexonIdx[j + 1] ].start ) + ++tag ; + } + + int flag = 0 ; + if ( subexons[ subexonIdx[anchorIdx] ].rightStrand == mainStrand ) + { + j = subexonIdx[ anchorIdx ] ; + if ( subexons[j].end - subexons[j].start + 1 <= 20 || + ( subexons[j+ 1].start == subexons[j].end + 1 && subexons[j + 1].end - subexons[j + 1].start + 1 <= 20 + && subexons[j + 1].rightStrand == mainStrand ) ) + ++flag ; + j = subexonIdx[ anchorIdx + 1 ] ; + if ( subexons[j].end - subexons[j].start + 1 <= 20 || + ( subexons[j].start == subexons[j - 1].end + 1 && subexons[j - 1].end - subexons[j - 1].start + 1 <= 20 + && subexons[j - 1].leftStrand == mainStrand ) ) + ++flag ; + } + + if ( anchorExonLength[0] <= 25 || anchorExonLength[1] <= 25 ) + flag = 2 ; + + // the alignment support the intron must be unique and has enough support. 
+ int support = 0 ; + int uniqSupport = 0 ; + for ( j = 0 ; j < tcCnt ; ++j ) + { + if ( !IsConstraintInTranscript( transcripts[i], scc[ tc[j].i ] ) || !IsConstraintInTranscript( transcripts[i], scc[ tc[j].j ] ) ) + continue ; + if ( ( scc[ tc[j].i ].vector.Test( subexonIdx[ anchorIdx ] ) && scc[ tc[j].i ].vector.Test( subexonIdx[ anchorIdx + 1 ] ) ) + || ( scc[ tc[j].j ].vector.Test( subexonIdx[ anchorIdx ] ) && scc[ tc[j].j ].vector.Test( subexonIdx[ anchorIdx + 1 ] ) ) ) + { + support += tc[j].support ; + uniqSupport += tc[j].uniqSupport ; + } + + } + + if ( (double)uniqSupport < 0.3 * support || support < txptMinReadDepth ) + { + flag = 2 ; + } + + if ( flag == 2 ) + transcripts[i].abundance = -1 ; + + } + tcnt = RemoveNegativeAbundTranscripts( transcripts ) ; + + return transcripts.size() ; +} + +void TranscriptDecider::ComputeTranscriptsScore( struct _subexon *subexons, int seCnt, std::map<int, int> *subexonChainSupport, std::vector<struct _transcript> &transcripts ) +{ + int i, j ; + int tcnt = transcripts.size() ; + struct _constraint tmpC ; + tmpC.vector.Init( seCnt ) ; + + for ( i = 0 ; i < tcnt ; ++i ) + transcripts[i].correlationScore = 0 ; + + for ( i = 0 ; i < seCnt ; ++i ) + { + for ( std::map<int, int>::iterator it = subexonChainSupport[i].begin() ; + it != subexonChainSupport[i].end() ; ++it ) + { + if ( sampleCnt >= 0 && ( it->second < 3 || it->second < (int)( 0.1 * sampleCnt ) ) && it->second <= sampleCnt / 2 ) + continue ; + + tmpC.vector.Reset() ; + tmpC.vector.Set( i ) ; + tmpC.vector.Set( it->first ) ; + tmpC.first = i ; + tmpC.last = it->first ; + + for ( j = 0 ; j < tcnt ; ++j ) + { + if ( IsConstraintInTranscript( transcripts[j], tmpC ) ) + ++transcripts[j].correlationScore ; + } + } + } + + tmpC.vector.Release() ; +} + +int TranscriptDecider::Solve( struct _subexon *subexons, int seCnt, std::vector<Constraints> &constraints, SubexonCorrelation &subexonCorrelation ) +{ + int i, j, k ; + int cnt = 0 ; + int *f = new int[seCnt] ; // this is 
a general buffer for a type of usage. + bool useDP = false ; + + compatibleTestVectorT.Init( seCnt ) ; // this is the bittable used in compatible test function. + compatibleTestVectorC.Init( seCnt ) ; + + for ( i = 0 ; i < seCnt ; ++i ) + { + subexons[i].canBeStart = subexons[i].canBeEnd = false ; + + if ( subexons[i].prevCnt == 0 ) + subexons[i].canBeStart = true ; + else if ( subexons[i].leftClassifier < canBeSoftBoundaryThreshold && subexons[i].leftClassifier != -1 + && subexons[i].leftStrand != 0 ) // The case of overhang. + { + // We then look into whether there is a left-side end already showed up before this subexon in this region of subexons. + bool flag = true ; + for ( j = i - 1 ; j >= 0 ; --j ) + { + if ( subexons[j].end + 1 != subexons[j + 1].start ) + break ; + if ( subexons[i].canBeStart == true ) + { + flag = false ; + break ; + } + } + subexons[i].canBeStart = flag ; + } + + if ( subexons[i].nextCnt == 0 ) + subexons[i].canBeEnd = true ; + else if ( subexons[i].rightClassifier < canBeSoftBoundaryThreshold && subexons[i].rightClassifier != -1 + && subexons[i].rightStrand != 0 ) + { + subexons[i].canBeEnd = true ; + } + // Remove other soft end already showed up in this region of subexons. + if ( subexons[i].canBeEnd == true ) + { + for ( j = i - 1 ; j >= 0 ; --j ) + { + if ( subexons[j].end + 1 != subexons[j + 1].start ) + break ; + if ( subexons[j].canBeEnd == true ) + { + subexons[j].canBeEnd = false ; + break ; + } + } + } + //printf( "%d: %d %lf\n", subexons[i].canBeStart, subexons[i].prevCnt, subexons[i].leftClassifier ) ; + } + + // Go through the cases of mixture region to set canBeStart/End. + // e.g: +[...]+_____+[....]-...]+____+[..)_____-[...]- + // ^ then we need to force a start point here. + // Do we need to associate a strand information with canBeStart, canBeEnd? + for ( i = 0 ; i < seCnt ; ) + { + // [i, j) is a region. 
+ for ( j = i + 1 ; j < seCnt ; ++j ) + if ( subexons[j].start > subexons[j - 1].end + 1 ) + break ; + if ( subexons[i].canBeStart == false ) // then subexons[i] must has a hard left boundary. + { + int leftStrandCnt[2] = {0, 0} ; + for ( k = i ; k < j ; ++k ) + { + if ( !SubexonGraph::IsSameStrand( subexons[k].rightStrand, subexons[i].leftStrand ) ) + break ; + if ( subexons[k].leftStrand != 0 ) + ++leftStrandCnt[ ( subexons[k].leftStrand + 1 ) / 2 ] ; + } + if ( k < j && leftStrandCnt[ ( subexons[k].rightStrand + 1 ) / 2 ] == 0 ) + subexons[i].canBeStart = true ; + } + + if ( subexons[j - 1].canBeEnd == false ) + { + int rightStrandCnt[2] = {0, 0} ; + for ( k = j - 1 ; k >= i ; --k ) + { + if ( !SubexonGraph::IsSameStrand( subexons[k].leftStrand, subexons[j - 1].rightStrand ) ) + break ; + if ( subexons[k].rightStrand != 0 ) + ++rightStrandCnt[ ( subexons[k].rightStrand + 1 ) / 2 ] ; + } + if ( k >= i && rightStrandCnt[ ( subexons[k].leftStrand + 1 ) / 2 ] == 0 ) + subexons[j - 1].canBeEnd = true ; + } + + //if ( subexons[i].start == 6870264) + // printf( "hi %d %d\n",i , subexons[i].canBeStart ) ; + i = j ; + } + /*for ( i = 0 ; i < seCnt ; ++i ) + { + printf( "%d %d: %d %d\n", subexons[i].start, subexons[i].end, subexons[i].canBeStart, subexons[i].canBeEnd ) ; + }*/ + + // Find the gene ids. 
+ baseGeneId = subexons[0].lcCnt ; + usedGeneId = subexons[0].rcCnt ; + defaultGeneId[0] = defaultGeneId[1] = -1 ; + for ( i = 0 ; i < seCnt ; ++i ) + { + if ( subexons[i].geneId < 0 ) + continue ; + + //if ( baseGeneId == -1 || subexons[i].geneId < baseGeneId ) + // baseGeneId = subexons[i].geneId ; + //if ( subexons[i].geneId > usedGeneId ) + // usedGeneId = subexons[i].geneId ; + + if ( ( subexons[i].rightStrand == -1 || subexons[i].leftStrand == -1 ) && + ( defaultGeneId[0] == -1 || subexons[i].geneId < defaultGeneId[0] ) ) + defaultGeneId[0] = subexons[i].geneId ; + if ( ( subexons[i].rightStrand == 1 || subexons[i].leftStrand == 1 ) && + ( defaultGeneId[1] == -1 || subexons[i].geneId < defaultGeneId[1] ) ) + defaultGeneId[1] = subexons[i].geneId ; + } + if ( defaultGeneId[0] == -1 ) + defaultGeneId[0] = baseGeneId ; + if ( defaultGeneId[1] == -1 ) + defaultGeneId[1] = usedGeneId - 1 ; + + // Go through the constraints to find the chain of subexons that should be kept. + std::map<int, int> *subexonChainSupport = new std::map<int, int>[ seCnt ] ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + std::vector<int> subexonIdx ; + std::vector<struct _pair32> chain ; + + int tcCnt = constraints[i].constraints.size() ; + int size ; + for ( j = 0 ; j < tcCnt ; ++j ) + { + struct _constraint c = constraints[i].constraints[j] ; + if ( c.uniqSupport < 0.95 * c.support || c.support < 3 ) + continue ; + + subexonIdx.clear() ; + c.vector.GetOnesIndices( subexonIdx ) ; + size = subexonIdx.size() ; + + for ( k = 0 ; k < size - 1 ; ++k ) + { + struct _pair32 p ; + + p.a = subexonIdx[k] ; + p.b = subexonIdx[k + 1] ; + //if ( subexons[p.a].end + 1 == 113235898 && subexons[ p.b ].start + 1 == 113236121 ) + // printf( "bad bad %d %d %d\n", i, c.uniqSupport, c.support ) ; + + if ( subexons[ p.a ].end + 1 < subexons[ p.b ].start ) + chain.push_back( p ) ; + } + } + // Remove redundancy. 
+ sort( chain.begin(), chain.end(), CompSortPairs ) ; + size = chain.size() ; + k = 0 ; + for ( j = 1 ; j < size ; ++j ) + { + if ( chain[j].a == chain[k].a && chain[j].b == chain[k].b ) + continue ; + else + { + ++k ; + chain[k] = chain[j] ; + } + } + chain.resize( k + 1 ) ; + + // Add those to sample count + size = k + 1 ; + for ( j = 0 ; j < size ; ++j ) + { + if ( subexonChainSupport[ chain[j].a ].count( chain[j].b ) ) + { + ++subexonChainSupport[ chain[j].a ][ chain[j].b ] ; + } + else + subexonChainSupport[ chain[j].a ][ chain[j].b ] = 1 ; + } + } + + /*for ( i = 0 ; i < seCnt ; ++i ) + { + printf( "%d:", i ) ; + for ( std::map<int, int>::iterator it = subexonChainSupport[i].begin() ; it != subexonChainSupport[i].end() ; ++it ) + printf( " (%d %d) ", it->first, it->second ) ; + printf( "\n" ) ; + }*/ + + //printf( "%d %d %d\n", defaultGeneId[0], baseGeneId, usedGeneId ) ; + cnt = 0 ; + memset( f, -1, sizeof( int ) * seCnt ) ; + for ( i = 0 ; i < seCnt ; ++i ) + { + if ( subexons[i].canBeStart ) + { + cnt += SubTranscriptCount( i, subexons, f ) ; + } + } + if ( cnt <= USE_DP ) + { + for ( i = 0 ; i < seCnt ; ++i ) + if ( f[i] > USE_DP ) + { + useDP = true ; + break ; + } + } + else + useDP = true ; + if ( !useDP ) + { + for ( i = 0 ; i < sampleCnt ; ++i ) + { + double msize = constraints[i].matePairs.size() ; + double csize = constraints[i].constraints.size() ; + if ( cnt > ( csize / msize ) * ( csize / msize ) * seCnt + && cnt > USE_DP / ( msize * msize ) && cnt > 50 ) + { + useDP = true ; + break ; + } + } + } + + int atCnt = cnt ; + printf( "%d: atCnt=%d seCnt=%d %d %d %d\n", subexons[0].start + 1, atCnt, seCnt, useDP, (int)constraints[0].constraints.size(), (int)constraints[0].matePairs.size() ) ; + fflush( stdout ) ; + std::vector<struct _transcript> alltranscripts ; + + if ( !useDP ) + { + int origSize = atCnt ; + alltranscripts.resize( atCnt ) ; + for ( i = 0 ; i < atCnt ; ++i ) + { + alltranscripts[i].seVector.Init( seCnt ) ; + 
alltranscripts[i].correlationScore = 1 ; + } + + atCnt = 0 ; + for ( i = 0 ; i < seCnt ; ++i ) + { + if ( subexons[i].canBeStart ) + EnumerateTranscript( i, 0, f, 0, subexons, subexonCorrelation, 1, alltranscripts, atCnt ) ; + } + + for ( i = atCnt ; i < origSize ; ++i ) + alltranscripts[i].seVector.Release() ; + + alltranscripts.resize( atCnt ) ; + //printf( "transcript cnt: %d\n", atCnt ) ; + //printf( "%d %d\n", alltranscripts[0].seVector.Test( 1 ), constraints[0].matePairs.size() ) ; + } + else // Use dynamic programming to pick a set of candidate transcript. + { + std::vector<struct _transcript> sampleTranscripts ; + + // pre allocate the memory. + struct _dpAttribute attr ; + attr.f1 = new struct _dp[seCnt] ; + if ( seCnt <= 10000 ) + { + attr.f2 = new struct _dp*[seCnt] ; + for ( i = 0 ; i < seCnt ; ++i ) + attr.f2[i] = new struct _dp[seCnt] ; + } + else + attr.f2 = NULL ; + + hashMax = HASH_MAX ; + if (seCnt > 500) + hashMax = 1000003 ; + else if (seCnt > 1000) + hashMax = 10000019 ; + else if (seCnt > 1500) + hashMax = 20000003 ; + + attr.hash = dpHash ; + if ( hashMax != HASH_MAX ) + attr.hash = new struct _dp[hashMax] ; + + for ( i = 0 ; i < seCnt ; ++i ) + { + attr.f1[i].seVector.Nullify() ; + attr.f1[i].seVector.Init( seCnt ) ; + for ( j = i ; j < seCnt ; ++j ) + { + attr.f2[i][j].seVector.Nullify() ; + attr.f2[i][j].seVector.Init( seCnt ) ; + } + } + for ( i = 0 ; i < hashMax ; ++i ) + { + attr.hash[i].seVector.Nullify() ; + attr.hash[i].seVector.Init( seCnt ) ; + } + + // select candidate transcripts from each sample. 
+ struct _pair32 *sampleComplexity = new struct _pair32[ sampleCnt ] ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + sampleComplexity[i].a = i ; + sampleComplexity[i].b = constraints[i].constraints.size() ; + } + qsort( sampleComplexity, sampleCnt, sizeof( sampleComplexity[0] ), CompPairsByB ) ; + int downsampleCnt = -1 ; + + for ( i = sampleCnt - 1 ; i >= 0 ; --i ) + { + sampleTranscripts.clear() ; + int iterBound = constraints[ sampleComplexity[i].a ].constraints.size() ; + if ( i < sampleCnt - 1 ) + iterBound = 100 ; + + if ( i < sampleCnt - 10 && alltranscripts.size() > 1000 ) + iterBound = 10 ; + //printf( "%d %d: %d %d %d %d\n", subexons[0].start + 1, sampleComplexity[i].a, constraints[ sampleComplexity[i].a ].constraints.size(), constraints[ sampleComplexity[i].a ].matePairs.size(), + // alltranscripts.size(), iterBound ) ; fflush( stdout ) ; + if ( maxDpConstraintSize > 0 ) + { + Constraints truncatedConstraints ; + truncatedConstraints.TruncateConstraintsCoverFrom( constraints[ sampleComplexity[i].a ], seCnt, maxDpConstraintSize ) ; + PickTranscriptsByDP( subexons, seCnt, iterBound, truncatedConstraints, + subexonCorrelation, attr, sampleTranscripts ) ; + } + else if ( ( constraints[ sampleComplexity[i].a ].constraints.size() > 1000 + && constraints[ sampleComplexity[i].a ].constraints.size() * 10 < constraints[ sampleComplexity[i].a ].matePairs.size() ) + || ( downsampleCnt > 0 && (int)constraints[ sampleComplexity[i].a ].constraints.size() >= downsampleCnt ) + || seCnt >= 1500 ) + { + Constraints downsampledConstraints ; + int stride = (int)constraints[ sampleComplexity[i].a ].matePairs.size() / (int)constraints[ sampleComplexity[i].a ].constraints.size() ; + if ( downsampleCnt > 0 ) + stride = (int)constraints[ sampleComplexity[i].a ].constraints.size() / downsampleCnt ; + if ( stride < 1 ) + stride = 1 ; + downsampledConstraints.DownsampleConstraintsFrom( constraints[ sampleComplexity[i].a ], stride ) ; + if ( downsampleCnt <= 0 ) + downsampleCnt = 
downsampledConstraints.constraints.size() ; + if ( iterBound <= 10 ) + continue ; + PickTranscriptsByDP( subexons, seCnt, iterBound, downsampledConstraints, subexonCorrelation, attr, sampleTranscripts ) ; + } + else + { + PickTranscriptsByDP( subexons, seCnt, iterBound, constraints[ sampleComplexity[i].a ], subexonCorrelation, attr, sampleTranscripts ) ; + } + int size = sampleTranscripts.size() ; + for ( j = 0 ; j < size ; ++j ) + alltranscripts.push_back( sampleTranscripts[j] ) ; + + // we can further pick a smaller subsets of transcripts here if the number is still to big. + CoalesceSameTranscripts( alltranscripts ) ; + + AugmentTranscripts( subexons, alltranscripts, 1000, false ) ; + } + + // release the memory. + delete[] sampleComplexity ; + for ( i = 0 ; i < seCnt ; ++i ) + { + attr.f1[i].seVector.Release() ; + for ( j = i ; j < seCnt && attr.f2 ; ++j ) + attr.f2[i][j].seVector.Release() ; + } + for ( i = 0 ; i < hashMax ; ++i ) + attr.hash[i].seVector.Release() ; + + delete[] attr.f1 ; + for ( i = 0 ; i < seCnt && attr.f2 ; ++i ) + delete[] attr.f2[i] ; + delete[] attr.f2 ; + if (hashMax != HASH_MAX) + delete[] attr.hash ; + + } + + transcriptId = new int[usedGeneId - baseGeneId] ; + std::vector<struct _transcript> *predTranscripts = new std::vector<struct _transcript>[sampleCnt] ; + + atCnt = alltranscripts.size() ; + for ( i = 0 ; i < atCnt ; ++i ) + alltranscripts[i].FPKM = 0 ; + + for ( i = 0 ; i < sampleCnt ; ++i ) + { + int size = alltranscripts.size() ; + for ( j = 0 ; j < size ; ++j ) + alltranscripts[j].abundance = -1 ; + //printf( "pick: %d: %d %d\n", i, constraints[i].matePairs.size(), alltranscripts.size() ) ; + PickTranscripts( subexons, alltranscripts, constraints[i], subexonCorrelation, predTranscripts[i] ) ; + + /*double tmp = FPKMFraction ; + FPKMFraction = 0 ; + size = predTranscripts.size() ; + for ( j = 0 ; j < size ; ++j ) + { + ConvertTranscriptAbundanceToFPKM( subexons, predTranscripts[j] ) ; + } + RefineTranscripts( subexons, seCnt, 
predTranscripts, constraints[i] ) ; + FPKMFraction = tmp ;*/ + + } + + atCnt = alltranscripts.size() ; + int *txptSampleSupport = new int[atCnt] ; + memset( txptSampleSupport, 0, sizeof( int ) * atCnt ) ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + int size = predTranscripts[i].size() ; + for ( j = 0 ; j < size ; ++j ) + { + ++txptSampleSupport[ predTranscripts[i][j].id ] ; + ++alltranscripts[ predTranscripts[i][j].id ].FPKM ; + } + } + + for ( i = 0 ; i < sampleCnt ; ++i ) + { + int size = alltranscripts.size() ; + for ( j = 0 ; j < size ; ++j ) + alltranscripts[j].abundance = -1 ; + //printf( "pick: %d: %d %d\n", i, constraints[i].matePairs.size(), alltranscripts.size() ) ; + + size = predTranscripts[i].size() ; + for ( j = 0 ; j < size ; ++j ) + { + predTranscripts[i][j].seVector.Release() ; + } + predTranscripts[i].clear() ; + PickTranscripts( subexons, alltranscripts, constraints[i], subexonCorrelation, predTranscripts[i] ) ; + } + + std::vector<int> *rawPredTranscriptIds = new std::vector<int>[sampleCnt] ; + std::vector<double> *rawPredTranscriptAbundance = new std::vector<double>[sampleCnt] ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + int size = predTranscripts[i].size() ; + + for ( j = 0 ; j < size ; ++j ) + { + rawPredTranscriptIds[i].push_back( predTranscripts[i][j].id ) ; + rawPredTranscriptAbundance[i].push_back( predTranscripts[i][j].abundance ) ; + } + } + + // Do the filtration. + for ( i = 0 ; i < sampleCnt ; ++i ) + { + int size = predTranscripts[i].size() ; + for ( j = 0 ; j < size ; ++j ) + { + ConvertTranscriptAbundanceToFPKM( subexons, predTranscripts[i][j] ) ; + } + size = RefineTranscripts( subexons, seCnt, false, subexonChainSupport, txptSampleSupport, predTranscripts[i], constraints[i] ) ; + + // Recompute the abundance. 
+ AbundanceEstimation( subexons, seCnt, constraints[i], predTranscripts[i] ) ; + for ( j = 0 ; j < size ; ++j ) + ConvertTranscriptAbundanceToFPKM( subexons, predTranscripts[i][j] ) ; + size = RefineTranscripts( subexons, seCnt, true, subexonChainSupport, txptSampleSupport, predTranscripts[i], constraints[i] ) ; + + //ComputeTranscriptsScore( subexons, seCnt, subexonChainSupport, predTranscripts[i] ) ; + } + + // Rescue some filtered transcripts + memset( txptSampleSupport, 0, sizeof( int ) * atCnt ) ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + int size = predTranscripts[i].size() ; + for ( j = 0 ; j < size ; ++j ) + { + ++txptSampleSupport[ predTranscripts[i][j].id ] ; + } + } + + bool *predicted = new bool[atCnt] ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + memset( predicted, false, sizeof( bool ) * atCnt ) ; + if ( predTranscripts[i].size() != rawPredTranscriptIds[i].size() ) + { + int psize = predTranscripts[i].size() ; + int rsize = rawPredTranscriptIds[i].size() ; + int tcCnt = constraints[i].matePairs.size() ; + + for ( j = 0 ; j < psize ; ++j ) + predicted[ predTranscripts[i][j].id ] = true ; + + for ( j = 0 ; j < rsize ; ++j ) + { + int id = rawPredTranscriptIds[i][j] ; + if ( predicted[ id ] == false && + ( txptSampleSupport[ id ] >= 3 && txptSampleSupport[id] >= 0.25 * sampleCnt ) ) + { + struct _transcript nt = alltranscripts[id] ; + nt.seVector.Nullify() ; + nt.seVector.Duplicate( alltranscripts[id].seVector ) ; + nt.constraintsSupport = NULL ; + nt.correlationScore = -1 ; + nt.abundance = rawPredTranscriptAbundance[i][j] ; + nt.id = id ; + predTranscripts[i].push_back( nt ) ; + } + } + if ( psize != predTranscripts[i].size() ) + AbundanceEstimation( subexons, seCnt, constraints[i], predTranscripts[i] ) ; + } + + int size = predTranscripts[i].size() ; + + if ( 0 ) //size == 1 ) + { + //AugmentTranscripts( subexons, predTranscripts[i], false ) ; + + int l = predTranscripts[i].size() ; + int tcCnt = constraints[i].matePairs.size() ; + for ( j = 0 ; j < 
l ; ++j ) + { + predTranscripts[i][j].abundance = 1.0 / alignments.readLen ; + } + AbundanceEstimation( subexons, seCnt, constraints[i], predTranscripts[i] ) ; + + std::vector<int> subexonIdx ; + for ( j = 0 ; j < l ; ++j ) + { + subexonIdx.clear() ; + predTranscripts[i][j].seVector.GetOnesIndices( subexonIdx ) ; + int subexonIdxCnt = subexonIdx.size() ; + int len = 0 ; + for ( k = 0 ; k < subexonIdxCnt ; ++k ) + len += subexons[ subexonIdx[k] ].end - subexons[ subexonIdx[k] ].start + 1 ; + + if ( predTranscripts[i][j].abundance * alignments.readLen / len < 2.0 ) + predTranscripts[i][j].abundance = -1 ; + else + ConvertTranscriptAbundanceToFPKM( subexons, predTranscripts[i][j] ) ; + + } + RemoveNegativeAbundTranscripts( predTranscripts[i] ) ; + } + + // Output + size = predTranscripts[i].size() ; + InitTranscriptId() ; + for ( j = 0 ; j < size ; ++j ) + { + OutputTranscript( i, subexons, predTranscripts[i][j] ) ; + } + for ( j = 0 ; j < size ; ++j ) + { + predTranscripts[i][j].seVector.Release() ; + } + } + + delete []predicted ; + delete []transcriptId ; + delete []predTranscripts ; + delete []rawPredTranscriptIds ; + delete []rawPredTranscriptAbundance ; + delete []txptSampleSupport ; + + atCnt = alltranscripts.size() ; + for ( i = 0 ; i < atCnt ; ++i ) + alltranscripts[i].seVector.Release() ; + compatibleTestVectorT.Release() ; + compatibleTestVectorC.Release() ; + delete[] f ; + delete[] subexonChainSupport ; + return 0 ; +} + +void *TranscriptDeciderSolve_Wrapper( void *a ) +{ + int i ; + + struct _transcriptDeciderThreadArg &arg = *( (struct _transcriptDeciderThreadArg *)a ) ; + TranscriptDecider transcriptDecider( arg.FPKMFraction, arg.classifierThreshold, arg.txptMinReadDepth, arg.sampleCnt, *( arg.alignments ) ) ; + transcriptDecider.SetNumThreads( arg.numThreads + 1 ) ; + transcriptDecider.SetMultiThreadOutputHandler( arg.outputHandler ) ; + transcriptDecider.SetMaxDpConstraintSize( arg.maxDpConstraintSize ) ; + transcriptDecider.Solve( arg.subexons, 
arg.seCnt, arg.constraints, arg.subexonCorrelation ) ; + + int start = arg.subexons[0].start ; + int end = arg.subexons[ arg.seCnt - 1 ].end ; + int chrId = arg.subexons[0].chrId ; + // Release memory + for ( i = 0 ; i < arg.seCnt ; ++i ) + { + delete[] arg.subexons[i].prev ; + delete[] arg.subexons[i].next ; + } + delete[] arg.subexons ; + + // Put the work id back to the free threads queue. + pthread_mutex_lock( arg.ftLock ) ; + arg.freeThreads[ *( arg.ftCnt ) ] = arg.tid ; + ++*( arg.ftCnt ) ; + if ( *( arg.ftCnt ) == 1 ) + pthread_cond_signal( arg.fullWorkCond ) ; + pthread_mutex_unlock( arg.ftLock) ; + printf( "Thread %d: %s %d %d finished.\n", arg.tid, arg.alignments->GetChromName(chrId), start + 1, end + 1 ) ; + fflush( stdout ) ; + + + pthread_exit( NULL ) ; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/TranscriptDecider.hpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,544 @@ +#ifndef _MOURISL_CLASSES_TRANSCRIPTDECIDER_HEADER +#define _MOURISL_CLASSES_TRANSCRIPTDECIDER_HEADER + +#include <pthread.h> +#include <map> +#include <time.h> +#include <stdarg.h> + +#include "alignments.hpp" +#include "SubexonGraph.hpp" +#include "SubexonCorrelation.hpp" +#include "BitTable.hpp" +#include "Constraints.hpp" + +#define HASH_MAX 100003 // default HASH_MAX +#define USE_DP 200000 + +struct _transcript +{ + BitTable seVector ; + double abundance ; + double correlationScore ; + double FPKM ; + double *constraintsSupport ; // record the assign ment of constraints. + + int first, last ; // indicate the index of the first and last subexons. + bool partial ; // wehther this is a partial transcript. + int id ; // the id for various usage: i.e transcript index in the alltranscripts. +} ; + +struct _outputTranscript +{ + int chrId ; + int geneId, transcriptId ; + struct _pair32 *exons ; + int ecnt ; + char strand ; + int sampleId ; + + double FPKM ; + double TPM ; + double cov ; +} ; + +struct _dp +{ + BitTable seVector ; + int first, last ; + // The "cnt" is for the hash structure. + // the first cnt set bits represent the subexons that are the key of the hash + // the remaining set bits are the optimal subtranscript follow the key. 
+ int cnt ; + double cover ; + + double minAbundance ; + int timeStamp ; + int strand ; +} ; + + +struct _dpAttribute +{ + struct _dp *f1, **f2 ; + struct _dp *hash ; + + struct _transcript bufferTxpt ; + + bool forAbundance ; + + struct _subexon *subexons ; + int seCnt ; + + std::map<uint64_t, int> uncoveredPair ; + + double minAbundance ; + int timeStamp ; +} ; + +class MultiThreadOutputTranscript ; + +struct _transcriptDeciderThreadArg +{ + int tid ; + struct _subexon *subexons ; + int seCnt ; + int sampleCnt ; + int numThreads ; + + int maxDpConstraintSize ; + double FPKMFraction, classifierThreshold, txptMinReadDepth ; + Alignments *alignments ; + std::vector<Constraints> constraints ; + SubexonCorrelation subexonCorrelation ; + MultiThreadOutputTranscript *outputHandler ; + + int *freeThreads ; // the stack for free threads + int *ftCnt ; + pthread_mutex_t *ftLock ; + pthread_cond_t *fullWorkCond ; +} ; + +class MultiThreadOutputTranscript +{ +private: + std::vector<struct _outputTranscript> outputQueue ; + pthread_t *threads ; + pthread_mutex_t outputLock ; + int sampleCnt ; + int numThreads ; + std::vector<FILE *> outputFPs ; + Alignments &alignments ; + +public: + static int CompTranscripts( const struct _outputTranscript &a, const struct _outputTranscript &b ) + { + int i ; + if ( a.geneId != b.geneId ) + return a.geneId - b.geneId ; + if ( a.ecnt != b.ecnt ) + return a.ecnt - b.ecnt ; + + for ( i = 0 ; i < a.ecnt ; ++i ) + { + if ( a.exons[i].a != b.exons[i].a ) + return a.exons[i].a - b.exons[i].a ; + + if ( a.exons[i].b != b.exons[i].b ) + return a.exons[i].b - b.exons[i].b ; + } + return 0 ; + } + + static bool CompSortTranscripts( const struct _outputTranscript &a, const struct _outputTranscript &b ) + { + int tmp = CompTranscripts( a, b ) ; + if ( tmp < 0 ) + return true ; + else if ( tmp > 0 ) + return false ; + else + return a.sampleId < b.sampleId ; + } + + MultiThreadOutputTranscript( int cnt, Alignments &a ): alignments( a ) + { + sampleCnt = 
cnt ; + pthread_mutex_init( &outputLock, NULL ) ; + } + ~MultiThreadOutputTranscript() + { + pthread_mutex_destroy( &outputLock ) ; + int i ; + for ( i = 0 ; i < sampleCnt ; ++i ) + fclose( outputFPs[i] ) ; + } + + void SetThreadsPointer( pthread_t *t, int n ) + { + threads = t ; + numThreads = n ; + } + + void SetOutputFPs( char *outputPrefix ) + { + int i ; + char buffer[1024] ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + if ( outputPrefix[0] ) + sprintf( buffer, "%s_sample_%d.gtf", outputPrefix, i ) ; + else + sprintf( buffer, "sample_%d.gtf", i ) ; + FILE *fp = fopen( buffer, "w" ) ; + outputFPs.push_back( fp ) ; + } + } + + void Add( struct _outputTranscript &t ) + { + pthread_mutex_lock( &outputLock ) ; + outputQueue.push_back( t ) ; + pthread_mutex_unlock( &outputLock ) ; + } + + void Add_SingleThread( struct _outputTranscript &t ) + { + outputQueue.push_back( t ) ; + } + + void ComputeFPKMTPM( std::vector<Alignments> &alignmentFiles ) + { + int i ; + int qsize = outputQueue.size() ; + double *totalFPK = new double[ sampleCnt ] ; + memset( totalFPK, 0, sizeof( double ) * sampleCnt ) ; + for ( i = 0 ; i < qsize ; ++i ) + { + totalFPK[ outputQueue[i].sampleId ] += outputQueue[i].FPKM ; + } + + for ( i = 0 ; i < qsize ; ++i ) + { + outputQueue[i].TPM = outputQueue[i].FPKM / ( totalFPK[ outputQueue[i].sampleId ] / 1000000.0 ) ; + outputQueue[i].FPKM /= ( alignmentFiles[ outputQueue[i].sampleId ].totalReadCnt / 1000000.0 ) ; + } + + delete[] totalFPK ; + } + + void OutputCommandInfo( int argc, char *argv[] ) + { + int i ; + int j ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + fprintf( outputFPs[i], "#PsiCLASS_v1.0.1\n#" ) ; + for ( j = 0 ; j < argc - 1 ; ++j ) + { + fprintf( outputFPs[i], "%s ", argv[j] ) ; + } + fprintf( outputFPs[i], "%s\n", argv[j] ) ; + } + } + + void OutputCommentToSampleGTF( int sampleId, char *s ) + { + fprintf( outputFPs[ sampleId ], "#%s\n", s ) ; + } + + void Flush() + { + std::sort( outputQueue.begin(), outputQueue.end(), 
CompSortTranscripts ) ; + int i, j ; + int qsize = outputQueue.size() ; + char prefix[10] = "" ; + + // Recompute the transcript id + int gid = -1 ; + int tid = 0 ; + for ( i = 0 ; i < qsize ; ) + { + for ( j = i + 1 ; j < qsize ; ++j ) + { + if ( CompTranscripts( outputQueue[i], outputQueue[j] ) ) + break ; + } + int l ; + if ( outputQueue[i].geneId != gid ) + { + gid = outputQueue[i].geneId ; + tid = 0 ; + } + else + ++tid ; + + for ( l = i ; l < j ; ++l ) + outputQueue[l].transcriptId = tid ; + + i = j ; + } + + // output + for ( i = 0 ; i < qsize ; ++i ) + { + struct _outputTranscript &t = outputQueue[i] ; + char *chrom = alignments.GetChromName( t.chrId ) ; + + fprintf( outputFPs[t.sampleId], "%s\tPsiCLASS\ttranscript\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; transcript_id \"%s%s.%d.%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"%.6lf\";\n", + chrom, t.exons[0].a, t.exons[t.ecnt - 1].b, t.strand, + prefix, chrom, t.geneId, + prefix, chrom, t.geneId, t.transcriptId, t.FPKM, t.TPM, t.cov ) ; + for ( j = 0 ; j < t.ecnt ; ++j ) + { + fprintf( outputFPs[ t.sampleId ], "%s\tPsiCLASS\texon\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; " + "transcript_id \"%s%s.%d.%d\"; exon_number \"%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"\%.6lf\";\n", + chrom, t.exons[j].a, t.exons[j].b, t.strand, + prefix, chrom, t.geneId, + prefix, chrom, t.geneId, t.transcriptId, + j + 1, t.FPKM, t.TPM, t.cov ) ; + } + delete []t.exons ; + } + } +} ; + +class TranscriptDecider +{ +private: + int sampleCnt ; + int numThreads ; + double FPKMFraction ; + double txptMinReadDepth ; + int hashMax ; + int maxDpConstraintSize ; + + Constraints *constraints ; + //struct _subexon *subexons ; + //int seCnt ; + + int usedGeneId ; + int baseGeneId, defaultGeneId[2] ; + + int *transcriptId ; // the next transcript id for each gene id (we shift the gene id to 0 in this array.) + Alignments &alignments ; // for obtain the chromosome names. 
+ + std::vector<FILE *> outputFPs ; + + BitTable compatibleTestVectorT, compatibleTestVectorC ; + double canBeSoftBoundaryThreshold ; + + MultiThreadOutputTranscript *outputHandler ; + + // Test whether subexon tag is a start subexon in a mixture region that corresponds to the start of a gene on another strand. + bool IsStartOfMixtureStrandRegion( int tag, struct _subexon *subexons, int seCnt ) ; + + // The functions to pick transcripts through dynamic programming + struct _dp *dpHash ; + void SearchSubTranscript( int tag, int strand, int parents[], int pcnt, struct _dp &pdp, int visit[], int vcnt, int extends[], int extendCnt, std::vector<struct _constraint> &tc, int tcStartInd, struct _dpAttribute &attr ) ; + struct _dp SolveSubTranscript( int visit[], int vcnt, int strand, std::vector<struct _constraint> &tc, int tcStartInd, struct _dpAttribute &attr ) ; + void PickTranscriptsByDP( struct _subexon *subexons, int seCnt, int iterBound, Constraints &constraints, SubexonCorrelation &correlation, struct _dpAttribute &attr, std::vector<struct _transcript> &allTranscripts ) ; + + void SetDpContent( struct _dp &a, struct _dp &b, const struct _dpAttribute &attr ) + { + a.seVector.Assign( b.seVector ) ; + a.first = b.first ; + a.last = b.last ; + a.cnt = b.cnt ; + a.cover = b.cover ; + + a.strand = b.strand ; + a.minAbundance = attr.minAbundance ; + a.timeStamp = attr.timeStamp ; + } + + void ResetDpContent( struct _dp &d ) + { + d.seVector.Reset() ; + d.first = -1 ; + d.last = -1 ; + d.cnt = -1 ; + d.cover = -1 ; + d.minAbundance = -1 ; + d.timeStamp = -1 ; + } + + void AugmentTranscripts( struct _subexon *subexons, std::vector<struct _transcript> &alltranscripts, int limit, bool extend ) ; + // Test whether a constraints is compatible with the transcript. + // Return 0 - uncompatible or does not overlap at all. 1 - fully compatible. 
2 - Head of the constraints compatible with the tail of the transcript + int IsConstraintInTranscript( struct _transcript transcript, struct _constraint &c ) ; + int IsConstraintInTranscriptDebug( struct _transcript transcript, struct _constraint &c ) ; + + // Count how many transcripts are possible starting from subexons[tag]. + int SubTranscriptCount( int tag, struct _subexon *subexons, int f[] ) ; + + // The methods when there is no need for DP + void EnumerateTranscript( int tag, int strand, int visit[], int vcnt, struct _subexon *subexons, SubexonCorrelation &correlation, double correlationScore, std::vector<struct _transcript> &alltranscripts, int &atcnt ) ; + // For the simpler case, we can pick sample by sample. + void PickTranscripts( struct _subexon *subexons, std::vector<struct _transcript> &alltranscripts, Constraints &constraints, SubexonCorrelation &seCorrelation, std::vector<struct _transcript> &transcripts ) ; + + static bool CompSortTranscripts( const struct _transcript &a, const struct _transcript &b ) + { + if ( a.first < b.first ) + return true ; + else if ( a.first > b.first ) + return false ; + + int diffPos = a.seVector.GetFirstDifference( b.seVector ) ; + if ( diffPos == -1 ) + return false ; + if ( a.seVector.Test( diffPos ) ) + return true ; + else + return false ; + } + + static bool CompSortPairs( const struct _pair32 &x, const struct _pair32 &y ) + { + if ( x.a != y.a ) + return x.a < y.a ; + else + return x.b < y.b ; + } + + static bool CompSortPairsByB( const struct _pair32 &x, const struct _pair32 &y ) + { + return x.b < y.b ; + } + + static int CompPairsByB( const void *p1, const void *p2 ) + { + return ((struct _pair32 *)p1)->b - ((struct _pair32 *)p2)->b ; + } + + double ComputeScore( double cnt, double weight, double a, double A, double correlation ) + { + if ( a > A * 0.1 ) + return ( cnt * weight ) * ( 1 + pow( a / A, 0.25 ) ) + correlation ; + else + return ( cnt * weight ) * ( 1 + a / A ) + correlation ; + //return ( cnt ) * 
( exp( 1 + a / A ) ) + correlation ; + } + + int GetFather( int f, int *father ) ; + + void ConvertTranscriptAbundanceToFPKM( struct _subexon *subexons, struct _transcript &t, int readCnt = 1000000 ) + { + int txptLen = 0 ; + int i, size ; + + std::vector<int> subexonInd ; + t.seVector.GetOnesIndices( subexonInd ) ; + size = subexonInd.size() ; + for ( i = 0 ; i < size ; ++i ) + txptLen += ( subexons[ subexonInd[i] ].end - subexons[ subexonInd[i] ].start + 1 ) ; + double factor = 1 ; + if ( alignments.matePaired ) + factor = 0.5 ; + t.FPKM = t.abundance * factor / ( ( readCnt / 1000000.0 ) * ( txptLen / 1000.0 ) ) ; + } + + int GetTranscriptLengthFromAbundanceAndFPKM( double abundance, double FPKM, int readCnt = 1000000 ) + { + double factor = 1 ; + if ( alignments.matePaired ) + factor = 0.5 ; + return int( abundance * factor / ( FPKM / 1000.0 ) / ( readCnt / 1000000.0 ) + 0.5 ) ; + } + + void CoalesceSameTranscripts( std::vector<struct _transcript> &t ) ; + + + // Initialize the structure to store transcript id + void InitTranscriptId() ; + + int GetTranscriptGeneId( std::vector<int> &subexonInd, struct _subexon *subexons ) ; + int GetTranscriptGeneId( struct _transcript &t, struct _subexon *subexons ) ; + int RemoveNegativeAbundTranscripts( std::vector<struct _transcript> &transcripts ) + { + int i, j ; + int tcnt = transcripts.size() ; + j = 0 ; + for ( i = 0 ; i < tcnt ; ++i ) + { + if ( transcripts[i].abundance < 0 ) + { + transcripts[i].seVector.Release() ; // Don't forget release the memory. 
+ continue ; + } + transcripts[j] = transcripts[i] ; + ++j ; + } + transcripts.resize( j ) ; + return j ; + } + + void AbundanceEstimation( struct _subexon *subexons, int seCnt, Constraints &constraints, std::vector<struct _transcript> &transcripts ) ; + + int RefineTranscripts( struct _subexon *subexons, int seCnt, bool aggressive, std::map<int, int> *subexonChainSupport, int *txptSampleSupport, std::vector<struct _transcript> &transcripts, Constraints &constraints ) ; + + void ComputeTranscriptsScore( struct _subexon *subexons, int seCnt, std::map<int, int> *subexonChainSupport, std::vector<struct _transcript> &transcripts ) ; + + void OutputTranscript( int sampleId, struct _subexon *subexons, struct _transcript &transcript ) ; + + void PrintLog( const char *fmt, ... ) + { + char buffer[10021] ; + va_list args ; + va_start( args, fmt ) ; + vsprintf( buffer, fmt, args ) ; + + time_t mytime = time(NULL) ; + struct tm *localT = localtime( &mytime ) ; + char stime[500] ; + strftime( stime, sizeof( stime ), "%c", localT ) ; + fprintf( stderr, "[%s] %s\n", stime, buffer ) ; + } + + +public: + TranscriptDecider( double f, double c, double d, int sampleCnt, Alignments &a ): alignments( a ) + { + FPKMFraction = f ; + canBeSoftBoundaryThreshold = c ; + txptMinReadDepth = d ; + usedGeneId = 0 ; + defaultGeneId[0] = -1 ; + defaultGeneId[1] = -1 ; + maxDpConstraintSize = -1 ; + numThreads = 1 ; + this->sampleCnt = sampleCnt ; + dpHash = new struct _dp[ HASH_MAX ] ; // pre-allocated buffer to hold dp information. 
+ } + ~TranscriptDecider() + { + int i ; + if ( numThreads == 1 ) + { + int size = outputFPs.size() ; + for ( i = 0 ; i < size ; ++i ) + { + fclose( outputFPs[i] ) ; + } + } + delete[] dpHash ; + } + + + // @return: the number of assembled transcript + int Solve( struct _subexon *subexons, int seCnt, std::vector<Constraints> &constraints, SubexonCorrelation &subexonCorrelation ) ; + + void SetOutputFPs( char *outputPrefix ) + { + int i ; + char buffer[1024] ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + if ( outputPrefix[0] ) + sprintf( buffer, "%s_sample_%d.gtf", outputPrefix, i ) ; + else + sprintf( buffer, "sample_%d.gtf", i ) ; + FILE *fp = fopen( buffer, "w" ) ; + outputFPs.push_back( fp ) ; + } + } + + void SetMultiThreadOutputHandler( MultiThreadOutputTranscript *h ) + { + outputHandler = h ; + } + + void SetNumThreads( int t ) + { + numThreads = t ; + } + + void SetMaxDpConstraintSize(int size) + { + maxDpConstraintSize = size ; + } +} ; + +void *TranscriptDeciderSolve_Wrapper( void *arg ) ; + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/Vote.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,432 @@ +// The program that vote to pick the trusted transcripts. +#include <stdio.h> +#include <stdlib.h> +#include <map> +#include <vector> +#include <algorithm> +#include <string.h> +#include <string> + +#include "defs.h" +#include "TranscriptDecider.hpp" + +char usage[] = "./transcript-vote [OPTIONS] > output.gtf:\n" + "Required:\n" + "\t--lg: path to the list of GTF files.\n" + "Optional:\n" + "\t-d FLOAT: threshold of average coverage depth across all the samples. (default: 1)\n" + //"\t-n INT: the number of samples a transcript showed up. (default: 3)\n" + ; + + +/*struct _transcript +{ + int chrId ; + int geneId ; + char strand ; + struct _pair32 *exons ; + int ecnt ; + int sampleId ; +} ;*/ + +void GetGTFField( char *s, const char *field, char *ret ) +{ + char *p = strstr( s, field ) ; + if ( p == NULL ) + return ; + for ( ; *p != ' ' ; ++p ) + ; + p += 2 ; // add extra 1 to skip \" + sscanf( p, "%s", ret ) ; + //printf( "+%s %d\n", tid, strlen( tid ) ) ; + p = ret + strlen( ret ) ; + while ( *p != '\"' ) + --p ; + *p = '\0' ; +} + +int GetTailNumber( char *s ) +{ + int len = strlen( s ) ; + int ret = 0 ; + int i ; + int factor = 1 ; + for ( i = len - 1 ; i >= 0 && s[i] >= '0' && s[i] <= '9' ; --i, factor *= 10 ) + { + ret += factor * ( s[i] - '0' ) ; + } + return ret ; +} + +int CompDouble( const void *p1, const void *p2 ) +{ + double a = *(double *)p1 ; + double b = *(double *)p2 ; + if ( a > b ) + return 1 ; + else if ( a < b ) + return -1 ; + else + return 0 ; +} + +int main( int argc, char *argv[] ) +{ + int i, j, k ; + double minAvgDepth = 1.0 ; + double fraction = 1.0 ; + int minSampleCnt = 3 ; + std::map<std::string, int> chrNameToId ; + std::map<int, std::string> chrIdToName ; + + FILE *fpGTFlist = NULL ; + FILE *fp = NULL ; + for ( i = 1 ; i < argc ; ++i ) + { + if ( !strcmp( argv[i], "--lg" ) ) + { + fpGTFlist = fopen( argv[i + 
1], "r" ) ; + ++i ; + } + else if ( !strcmp( argv[i], "-d" ) ) + { + minAvgDepth = atof( argv[i + 1] ) ; + ++i ; + } + /*else if ( !strcmp( argv[i], "-n" ) ) + { + minSampleCnt = atoi( argv[i + 1] ) ; + ++i ; + }*/ + else + { + printf( "%s", usage ) ; + exit( 1 ) ; + } + } + if ( fpGTFlist == NULL ) + { + printf( "Must use --lg option to speicfy the list of GTF files.\n%s", usage ) ; + exit( 1 ) ; + } + + std::vector<struct _outputTranscript> transcripts ; + std::vector<struct _outputTranscript> outputTranscripts ; + + char buffer[4096] ; + char line[10000] ; + char chrom[50], tool[20], type[40], strand[3] ; + //char tid[50] ; + int start, end ; + std::vector<struct _pair32> tmpExons ; + int sampleCnt = 0 ; + int gid = -1 ; + int tid = -1 ; + int chrId = -1 ; + int chrIdUsed = 0 ; + char cStrand = '.' ; + char prefix[50] ; + double FPKM = 0, TPM = 0, cov = 0 ; + + while ( fgets( buffer, sizeof( buffer ), fpGTFlist ) != NULL ) + { + int len = strlen( buffer ) ; + if ( buffer[len - 1] == '\n' ) + { + buffer[len - 1] = '\0' ; + --len ; + } + fp = fopen( buffer, "r" ) ; + + + while ( fgets( line, sizeof( line ), fp ) != NULL ) + { + if ( line[0] == '#' ) + continue ; + + sscanf( line, "%s %s %s %d %d %s %s", chrom, tool, type, &start, &end, buffer, strand ) ; + + if ( strcmp( type, "exon" ) ) + { + if ( tmpExons.size() > 0 ) + { + struct _outputTranscript nt ; + nt.sampleId = sampleCnt ; + nt.chrId = chrId ; + nt.geneId = gid ; + nt.transcriptId = tid ; + nt.strand = cStrand ; + nt.ecnt = tmpExons.size() ; + nt.exons = new struct _pair32[nt.ecnt] ; + nt.FPKM = FPKM ; + nt.TPM = TPM ; + nt.cov = cov ; + for ( i = 0 ; i < nt.ecnt ; ++i ) + nt.exons[i] = tmpExons[i] ; + tmpExons.clear() ; + transcripts.push_back( nt ) ; + } + continue ; + } + + if ( chrNameToId.find( std::string( chrom ) ) == chrNameToId.end() ) + { + chrId = chrIdUsed ; + + std::string s( chrom ) ; + chrNameToId[s] = chrIdUsed ; + chrIdToName[ chrIdUsed ] = s ; + ++chrIdUsed ; + } + else + chrId = 
chrNameToId[ std::string( chrom ) ] ; + + GetGTFField( line, "gene_id", buffer ) ; + gid = GetTailNumber( buffer ) ; + + GetGTFField( line, "transcript_id", buffer ) ; + tid = GetTailNumber( buffer ) ; + + GetGTFField( line, "FPKM", buffer ) ; + FPKM = atof( buffer ) ; + + GetGTFField( line, "TPM", buffer ) ; + TPM = atof( buffer ) ; + + GetGTFField( line, "cov", buffer ) ; + cov = atof( buffer ) ; + + cStrand = strand[0] ; + + // Look for the user-defined prefix. + int len = strlen( buffer ) ; + j = 0 ; + for ( i = len - 1 ; i >= 0 ; --i ) + { + if ( buffer[i] == '.' ) + { + ++j ; + if ( j >= 2 ) + break ; + } + } + + for ( j = 0 ; j < i ; ++i ) + { + prefix[j] = buffer[j] ; + } + if ( i > 0 ) + { + prefix[j] = '.' ; + prefix[j + 1] = '\0' ; + } + else + prefix[0] = '\0' ; + + struct _pair32 ne ; + ne.a = start ; + ne.b = end ; + tmpExons.push_back( ne ) ; + } + if ( tmpExons.size() > 0 ) + { + struct _outputTranscript nt ; + nt.sampleId = sampleCnt ; + nt.chrId = chrId ; + nt.geneId = gid ; + nt.transcriptId = tid ; + nt.strand = cStrand ; + nt.ecnt = tmpExons.size() ; + nt.exons = new struct _pair32[nt.ecnt] ; + nt.FPKM = FPKM ; + nt.TPM = TPM ; + nt.cov = cov ; + for ( i = 0 ; i < nt.ecnt ; ++i ) + nt.exons[i] = tmpExons[i] ; + tmpExons.clear() ; + transcripts.push_back( nt ) ; + } + + ++sampleCnt ; + fclose( fp ) ; + } + fclose( fpGTFlist ) ; + + // Coalesce same transcripts + std::sort( transcripts.begin(), transcripts.end(), MultiThreadOutputTranscript::CompSortTranscripts ) ; + std::vector<int> sampleSupport ; + int size = transcripts.size() ; + if ( minSampleCnt > sampleCnt ) + minSampleCnt = sampleCnt ; + + for ( i = 0 ; i < size ; ) + { + int l ; + for ( j = i + 1 ; j < size ; ++j ) + { + if ( MultiThreadOutputTranscript::CompTranscripts( transcripts[i], transcripts[j] ) ) + break ; + } + // [i,j) are the same transcripts. 
+ /*if ( j - i >= fraction * sampleCnt && j - i >= minSampleCnt ) + { + outputTranscripts.push_back( transcripts[i] ) ; + }*/ + + double sumFPKM = 0 ; + double sumTPM = 0 ; + double sumCov = 0 ; + for ( l = i ; l < j ; ++l ) + { + sumFPKM += transcripts[l].FPKM ; + sumTPM += transcripts[l].TPM ; + sumCov += transcripts[l].cov ; + } + + transcripts[k] = transcripts[i] ; + for ( l = i + 1 ; l < j ; ++l ) + delete[] transcripts[l].exons ; + + transcripts[k].FPKM = sumFPKM / ( j - i ) ; + transcripts[k].TPM = sumTPM / ( j - i ) ; + transcripts[k].cov = sumCov / ( j - i ) ; + + if ( ( j - i < int( fraction * sampleCnt ) || j - i < minSampleCnt ) && sumCov < minAvgDepth * sampleCnt ) + //if ( transcripts[k].score * ( j - i ) < 1 && ( j - i ) < sampleCnt * 0.5 ) + //if ( sumTPM < sampleCnt || sumFPKM < sampleCnt ) + //if ( sumCov < minAvgDepthn * sampleCnt ) + { + transcripts[k].FPKM = -1 ; + } + else + sampleSupport.push_back( j - i ) ; + ++k ; + i = j ; + } + transcripts.resize( k ) ; + + // The motivation to separate the coalscing and voting is to allow + // a gene with no txpt passing the vote to pick a representative. 
+ size = k ; + for ( i = 0 ; i < size ; ) + { + int cnt = 0 ; + if ( transcripts[i].FPKM != -1 ) + ++cnt ; + + for ( j = i + 1 ; j < size ; ++j ) + { + if ( transcripts[i].geneId != transcripts[j].geneId ) + break ; + if ( transcripts[j].FPKM != -1 ) + ++cnt ; + } + int l ; + /*double *freq = new double[cnt] ; + for ( l = 0 ; l < cnt ; ++l ) + freq[l] = transcripts[l].FPKM / sampleCnt ; + qsort( freq, cnt, sizeof( double ), CompDouble ) ; + + for ( l = 0 ; l < cnt ; ++l ) + printf( "%lf\n", freq[l] ) ; + printf( "===========\n" ) ; + delete[] freq ;*/ + /*if ( cnt == 0 ) + { + int maxEcnt = -1 ; + int maxCnt = 0 ; + int maxtag ; + for ( l = i ; l < j ; ++l ) + if ( transcripts[l].ecnt > maxEcnt ) + maxEcnt = transcripts[l].ecnt ; + + if ( maxEcnt >= 2 ) + { + int maxSampleSupport = -1 ; + maxtag = i ; + for ( l = i ; l < j ; ++l ) + { + if ( transcripts[l].ecnt > 1 && transcripts[l].ecnt >= 2 ) + { + if ( sampleSupport[l] > maxSampleSupport ) + { + maxSampleSupport = sampleSupport[l] ; + maxtag = l ; + maxCnt = 1 ; + } + else if ( sampleSupport[l] == maxSampleSupport ) + ++maxCnt ; + } + } + + if ( maxSampleSupport >= 3 && maxSampleSupport >= ( fraction / 10 ) * sampleCnt && transcripts[maxtag].TPM > 0) + { + if ( maxCnt > 1 ) + { + int maxTPM = -1 ; + for ( l = i ; l < j ; ++l ) + { + if ( sampleSupport[l] != maxSampleSupport ) + continue ; + if ( transcripts[l].TPM > maxTPM ) + { + maxTPM = transcripts[l].TPM ; + maxtag = l ; + } + } + + } + transcripts[maxtag].FPKM = 0 ; + //fprintf( stderr, "recovered: %d %d\n", sampleSupport[ maxtag ], transcripts[ maxtag ].ecnt ) ; + outputTranscripts.push_back( transcripts[maxtag] ) ; + } + } + } + else*/ + { + for ( l = i ; l < j ; ++l ) + { + //if ( transcripts[l].FPKM >= fraction * sampleCnt && transcripts[l].FPKM >= minSampleCnt ) + if ( transcripts[l].FPKM != -1 ) + { + outputTranscripts.push_back( transcripts[l] ) ; + } + } + } + + i = j ; + } + + // Output + size = outputTranscripts.size() ; + //printf( "%d\n", size ) 
; + int transcriptId = 0 ; + int prevGid = -1 ; + for ( i = 0 ; i < size ; ++i ) + { + struct _outputTranscript &t = outputTranscripts[i] ; + const char *chrom = chrIdToName[t.chrId].c_str() ; + /*if ( t.geneId != prevGid ) + transcriptId = 0 ; + else + ++transcriptId ;*/ + transcriptId = outputTranscripts[i].transcriptId ; + fprintf( stdout, "%s\tPsiCLASS\ttranscript\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; transcript_id \"%s%s.%d.%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"%.6lf\"; sample_cnt \"%d\";\n", + chrom, t.exons[0].a, t.exons[t.ecnt - 1].b, t.strand, + prefix, chrom, t.geneId, + prefix, chrom, t.geneId, transcriptId, t.FPKM, t.TPM, t.cov, sampleSupport[i] ) ; + for ( j = 0 ; j < t.ecnt ; ++j ) + { + fprintf( stdout, "%s\tPsiCLASS\texon\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; " + "transcript_id \"%s%s.%d.%d\"; exon_number \"%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"%.6lf\"; sample_cnt \"%d\";\n", + chrom, t.exons[j].a, t.exons[j].b, t.strand, + prefix, chrom, t.geneId, + prefix, chrom, t.geneId, transcriptId, + j + 1, t.FPKM, t.TPM, t.cov, + sampleSupport[i] ) ; + } + + prevGid = t.geneId ; + } + return 0 ; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/alignments.hpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,549 @@ +// The class handles reading the bam file + +#ifndef _LSONG_RSCAF_ALIGNMENT_HEADER +#define _LSONG_RSCAF_ALIGNMENT_HEADER + +#include "samtools-0.1.19/sam.h" +#include <map> +#include <string> +#include <assert.h> +#include <iostream> +#include <stdlib.h> +#include <math.h> + +#include "defs.h" + +class Alignments +{ +private: + samfile_t *fpSam ; + bam1_t *b ; + + char fileName[1024] ; + bool opened ; + std::map<std::string, int> chrNameToId ; + bool allowSupplementary ; + bool allowClip ; + bool hasClipHead, hasClipTail ; + int segmentsSum ; // the sum of segments. + + bool atBegin ; + bool atEnd ; + + static int CompInt( const void *p1, const void *p2 ) + { + return (*(int *)p1 ) - (*(int *)p2 ) ; + } + + void Open() + { + fpSam = samopen( fileName, "rb", 0 ) ; + if ( !fpSam->header ) + { + fprintf( stderr, "Can not open %s.\n", fileName ) ; + exit( 1 ) ; + } + + // Collect the chromosome information + for ( int i = 0 ; i < fpSam->header->n_targets ; ++i ) + { + std::string s( fpSam->header->target_name[i] ) ; + chrNameToId[s] = i ; + } + opened = true ; + atBegin = true ; + atEnd = false ; + } +public: + struct _pair segments[MAX_SEG_COUNT] ; + int segCnt ; + + int totalReadCnt ; + int fragLen, fragStdev ; + int readLen ; + bool matePaired ; + + Alignments() + { + b = NULL ; + opened = false ; + atBegin = true ; + atEnd = false ; + allowSupplementary = false ; + allowClip = true ; + + totalReadCnt = 0 ; + fragLen = 0 ; + fragStdev = 0 ; + readLen = 0 ; + matePaired = false ; + } + ~Alignments() + { + if ( b ) + bam_destroy1( b ) ; + } + + void Open( char *file ) + { + strcpy( fileName, file ) ; + Open() ; + } + + void Rewind() + { + Close() ; + Open() ; + + atBegin = true ; + atEnd = false ; + } + + void Close() + { + samclose( fpSam ) ; + fpSam = NULL ; + } + + bool IsOpened() + { + return opened ; + } + + bool IsAtBegin() + { + 
return atBegin ; + } + + bool IsAtEnd() + { + return atEnd ; + } + + bool HasClip() + { + return hasClipHead || hasClipTail ; + } + + bool HasClipHead() + { + return hasClipHead ; + } + bool HasClipTail() + { + return hasClipTail ; + } + + int Next() + { + int i ; + int start = 0, len = 0 ; + uint32_t *rawCigar ; + + if ( atBegin == true ) + totalReadCnt = 0 ; + + atBegin = false ; + while ( 1 ) + { + while ( 1 ) + { + if ( b ) + bam_destroy1( b ) ; + b = bam_init1() ; + + if ( samread( fpSam, b ) <= 0 ) + { + atEnd = true ; + return 0 ; + } + + if ( ( b->core.flag & 0x900 ) == 0 ) + ++totalReadCnt ; + + if ( b->core.flag & 0xC ) + continue ; + + //if ( ( b->core.flag & 0x900 ) == 0 ) + break ; + } + + // to many repeat. + if ( bam_aux_get( b, "NH" ) ) + { + if ( bam_aux2i( bam_aux_get( b, "NH" ) ) >= 5 ) + continue ; + } + + // Compute the exons segments from the reads + segCnt = 0 ; + start = b->core.pos ; //+ 1 ; + rawCigar = bam1_cigar( b ) ; + // Check whether the query length is compatible with the read + if ( bam_cigar2qlen( &b->core, rawCigar ) != b->core.l_qseq ) + continue ; + + bool clipMiddle = false ; + int clipSum = 0 ; + hasClipHead = hasClipTail = false ; + len = 0 ; + segmentsSum = 0 ; + for ( i = 0 ; i < b->core.n_cigar ; ++i ) + { + int op = rawCigar[i] & BAM_CIGAR_MASK ; + int num = rawCigar[i] >> BAM_CIGAR_SHIFT ; + + switch ( op ) + { + case BAM_CMATCH: + case BAM_CDEL: + len += num ; break ; + case BAM_CSOFT_CLIP: + case BAM_CHARD_CLIP: + case BAM_CPAD: + { + if ( i == 0 ) + hasClipHead = true ; + else if ( i == b->core.n_cigar - 1 ) + hasClipTail = true ; + else + clipMiddle = true ; + + clipSum += num ; + } + case BAM_CINS: + num = 0 ; break ; + case BAM_CREF_SKIP: + { + segments[ segCnt ].a = start ; + segments[ segCnt ].b = start + len - 1 ; + ++segCnt ; + segmentsSum += len ; + start = start + len + num ; + len = 0 ; + } break ; + default: + len += num ; break ; + } + } + if ( clipMiddle ) // should never happend + continue ; + + if ( 
clipSum >= 2 && !allowClip ) + continue ; + + if ( len > 0 ) + { + segments[ segCnt ].a = start ; + segments[ segCnt ].b = start + len - 1 ; + ++segCnt ; + segmentsSum += len ; + } + /*if ( !strcmp( bam1_qname( b ), "chr1:109656301-109749401W:ENST00000490758.2:381:1480:1090:1290:X" ) ) + { + for ( i = 0 ; i < segCnt ; ++i ) + printf( "(%d %d) ", segments[i].a, segments[i].b ) ; + printf( "\n" ) ; + }*/ + + // Check whether the mates are compatible + //int mChrId = b->core.mtid ; + int64_t mPos = b->core.mpos ; + + if ( b->core.mtid == b->core.tid ) + { + for ( i = 0 ; i < segCnt - 1 ; ++i ) + { + if ( mPos >= segments[i].b && mPos <= segments[i + 1].a ) + break ; + } + if ( i < segCnt - 1 ) + continue ; + } + + break ; + } + + return 1 ; + } + + + int GetChromId() + { + return b->core.tid ; + } + + char* GetChromName( int tid ) + { + return fpSam->header->target_name[ tid ] ; + } + + int GetChromIdFromName( const char *s ) + { + std::string ss( s ) ; + if ( chrNameToId.find( ss ) == chrNameToId.end() ) + { + printf( "Unknown genome name: %s\n", s ) ; + exit( 1 ) ; + } + return chrNameToId[ss] ; + } + + int GetChromLength( int tid ) + { + return fpSam->header->target_len[ tid ] ; + } + + int GetChromCount() + { + return fpSam->header->n_targets ; + } + + void GetMatePosition( int &chrId, int64_t &pos ) + { + if ( b->core.flag & 0x8 ) + { + chrId = -1 ; + pos = -1 ; + } + else + { + chrId = b->core.mtid ; + pos = b->core.mpos ; //+ 1 ; + } + } + + int GetRepeatPosition( int &chrId, int64_t &pos ) + { + // Look at the CC field. 
+ if ( !bam_aux_get( b, "CC" ) || !bam_aux_get( b, "CP" ) ) + { + chrId = -1 ; + pos = -1 ; + return 0 ; + } + + std::string s( bam_aux2Z( bam_aux_get(b, "CC" ) ) ) ; + chrId = chrNameToId[ s ] ; + pos = bam_aux2i( bam_aux_get( b, "CP" ) ) ;// Possible error for 64bit + return 1 ; + } + + void GetFileName( char *buffer ) + { + strcpy( buffer, fileName ) ; + } + + int GetReadLength() + { + return b->core.l_qseq ; + } + + int GetRefCoverLength() + { + return segmentsSum ; + } + + bool IsFirstMate() + { + if ( b->core.flag & 0x40 ) + return true ; + return false ; + } + + bool IsReverse() + { + if ( b->core.flag & 0x10 ) + return true ; + return false ; + } + + bool IsMateReverse() + { + if ( b->core.flag & 0x20 ) + return true ; + return false ; + } + + char *GetReadId() + { + return bam1_qname( b ) ; + } + + bool IsUnique() + { + if ( bam_aux_get( b, "NH" ) ) + { + if ( bam_aux2i( bam_aux_get( b, "NH" ) ) > 1 ) + return false ; + } + if ( IsSupplementary() && GetFieldZ( (char *)"XZ" ) != NULL ) + return false ; + return true ; + } + + bool IsPrimary() + { + if ( ( b->core.flag & 0x900 ) == 0 ) + return true ; + else + return false ; + } + + // -1:minus, 0: unknown, 1:plus + int GetStrand() + { + if ( segCnt == 1 ) + return 0 ; + if ( bam_aux_get( b, "XS" ) ) + { + if ( bam_aux2A( bam_aux_get( b, "XS" ) ) == '-' ) + return -1 ; + else + return 1 ; + } + else + return 0 ; + } + + int GetFieldI( char *f ) + { + if ( bam_aux_get( b, f ) ) + { + return bam_aux2i( bam_aux_get( b, f ) ) ; + } + return -1 ; + } + + char *GetFieldZ( char *f ) + { + if ( bam_aux_get( b, f ) ) + { + return bam_aux2Z( bam_aux_get( b, f ) ) ; + } + return NULL ; + } + + int GetNumberOfHits() + { + if ( bam_aux_get( b, "NH" ) ) + { + return bam_aux2i( bam_aux_get( b, "NH" ) ) ; + } + return 1 ; + } + + bool IsSupplementary() + { + if ( ( b->core.flag & 0x800 ) == 0 ) + return false ; + else + return true ; + } + + void SetAllowSupplementary( bool in ) + { + allowSupplementary = in ; + } + + void 
SetAllowClip( bool in ) + { + allowClip = in ; + } + + bool IsGCRich( bool threshold = 0.9 ) + { + int i = 0 ; + int gcCnt = 0 ; + for ( i = 0 ; i < b->core.l_qseq ; ++i ) + { + int bit = bam1_seqi( bam1_seq( b ), i ) ; + if ( bit == 2 || bit == 4 ) + ++gcCnt ; + } + if ( gcCnt >= threshold * b->core.l_qseq ) + return true ; + } + + void GetGeneralInfo( bool stopEarly = false ) + { + int i, k ; + + const int sampleMax = 1000000 ; + int *lens = new int[sampleMax] ; + int *mateDiff = new int[sampleMax] ; + int lensCnt = 0 ; + int mateDiffCnt = 0 ; + bool end = false ; + + while ( 1 ) + { + while ( 1 ) + { + if ( b ) + bam_destroy1( b ) ; + b = bam_init1() ; + + if ( samread( fpSam, b ) <= 0 ) + { + end = true ; + break ; + } + if ( b->core.flag & 0xC ) + continue ; + + if ( ( b->core.flag & 0x900 ) == 0 ) + break ; + } + if ( end ) + break ; + + if ( lensCnt < sampleMax ) + { + lens[ lensCnt ] = b->core.l_qseq ; + ++lensCnt ; + } + + if ( mateDiffCnt < sampleMax && b->core.tid == b->core.mtid + && b->core.pos < b->core.mpos ) + { + mateDiff[ mateDiffCnt ] = b->core.mpos - b->core.pos ; + ++mateDiffCnt ; + } + ++totalReadCnt ; + if ( totalReadCnt >= sampleMax && stopEarly ) + break ; + } + + // Get the read length info and fragment length info + qsort( lens, lensCnt, sizeof( int ), CompInt ) ; + readLen = lens[ lensCnt - 1 ] ; + + if ( mateDiffCnt > 0 ) + { + matePaired = true ; + + qsort( mateDiff, mateDiffCnt, sizeof( int ), CompInt ) ; + long long int sum = 0 ; + long long int sumsq = 0 ; + + for ( i = 0 ; i < mateDiffCnt * 0.7 ; ++i ) + { + sum += ( mateDiff[i] + readLen ); + sumsq += ( mateDiff[i] + readLen ) * ( mateDiff[i] + readLen ) ; + } + k = i ; + fragLen = (int)( sum / k ) ; + fragStdev = (int)sqrt( sumsq / k - fragLen * fragLen ) ; + } + else + { + fragLen = readLen ; + fragStdev = 0 ; + } + //printf( "readLen = %d\nfragLen = %d, fragStdev = %d\n", readLen, fragLen, fragStdev ) ; + delete[] lens ; + delete[] mateDiff ; + } +} ; +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/blocks.hpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,1580 @@ +// The class manage the blocks +// Li Song + +#ifndef _LSONG_CLASSES_BLOCKS_HEADER +#define _LSONG_CLASSES_BLOCKS_HEADER + +#include <stdlib.h> +#include <vector> +#include <map> +#include <assert.h> +#include <math.h> +#include <set> +#include <inttypes.h> + +#include "defs.h" + +extern bool VERBOSE ; +extern FILE *fpOut ; + +extern int gMinDepth ; + + +struct _splitSite // means the next position belongs to another block +{ + int64_t pos ; + char strand ; + int chrId ; + int type ; // 1-start of an exon. 2-end of an exon. + + int support ; + int uniqSupport ; + + int mismatchSum ; + + int64_t oppositePos ; // the position of the other sites to form the intron. +} ; + +struct _block +{ + int chrId ; + int contigId ; + int64_t start, end ; + int64_t leftSplice, rightSplice ; // Some information about the left splice site and right splice site. + // record the leftmost and rightmost coordinates of a splice site within this block + //or the length of read alignment support the splice sites. + //Or the number of support. + int64_t depthSum ; + + int leftType, rightType ; //0-soft boundary, 1-start of an exon, 2-end of an exon. + + double leftRatio, rightRatio ; // the adjusted ratio-like value to the left and right anchor subexons. + double ratio ; + + char leftStrand, rightStrand ; + + int *depth ; + int prevCnt, nextCnt ; // Some connection information for the subexons. + int *prev ; + int *next ; +} ; + +// adjacent graph +struct _adj +{ + int ind ; + int info ; + int support ; + int next ; +} ; + +class Blocks +{ + private: + std::map<int, int> exonBlocksChrIdOffset ; + + int64_t Overlap( int64_t s0, int64_t e0, int64_t s1, int64_t e1, int64_t &s, int64_t &e ) + { + s = e = -1 ; + if ( e0 < s1 || s0 > e1 ) + return 0 ; + s = s0 > s1 ? s0 : s1 ; + e = e0 < e1 ? 
e0 : e1 ; + return e - s + 1 ; + } + + void Split( const char *s, char delimit, std::vector<std::string> &fields ) + { + int i ; + fields.clear() ; + if ( s == NULL ) + return ; + + std::string f ; + for ( i = 0 ; s[i] ; ++i ) + { + if ( s[i] == delimit || s[i] == '\n' ) + { + fields.push_back( f ) ; + f.clear() ; + } + else + f.append( 1, s[i] ) ; + } + fields.push_back( f ) ; + f.clear() ; + } + + void BuildBlockChrIdOffset() + { + // Build the map for the offsets of chr id in the exonBlock list. + exonBlocksChrIdOffset[ exonBlocks[0].chrId] = 0 ; + int cnt = exonBlocks.size() ; + for ( int i = 1 ; i < cnt ; ++i ) + { + if ( exonBlocks[i].chrId != exonBlocks[i - 1].chrId ) + exonBlocksChrIdOffset[ exonBlocks[i].chrId ] = i ; + } + } + + bool CanReach( int from, int to, struct _adj *adj, bool *visited ) + { + if ( visited[from] ) + return false ; + visited[from] = true ; + int p ; + p = adj[from].next ; + while ( p != -1 ) + { + if ( adj[p].ind == to ) + return true ; + if ( adj[p].ind < to + && CanReach( adj[p].ind, to, adj, visited ) ) + return true ; + p = adj[p].next ; + } + return false ; + } + + void AdjustAndCreateExonBlocks( int tag, std::vector<struct _block> &newExonBlocks ) + { + int i, j ; + if ( exonBlocks[tag].depth != NULL ) + { + // Convert the increment and decrement into actual depth. + int len = exonBlocks[tag].end - exonBlocks[tag].start + 1 ; + int *depth = exonBlocks[tag].depth ; + for ( i = 1 ; i < len ; ++i ) + depth[i] = depth[i - 1] + depth[i] ; + + struct _block island ; // the portion created by the hollow. + island.start = island.end = -1 ; + island.depthSum = 0 ; + + /*if ( exonBlocks[tag].start == 1562344 ) + { + for ( i = 0 ; i < len ; ++i ) + printf( "%d\n", depth[i] ) ; + }*/ + // Adjust boundary accordingly. 
+ int64_t adjustStart = exonBlocks[tag].start ; + int64_t adjustEnd = exonBlocks[tag].end ; + + if ( exonBlocks[tag].leftType == 0 && exonBlocks[tag].rightType != 0 ) + { + for ( i = len - 1 ; i >= 0 ; --i ) + if ( depth[i] < gMinDepth ) + break ; + ++i ; + if ( exonBlocks[tag].rightType == 2 && i + exonBlocks[tag].start > exonBlocks[tag].rightSplice ) + i = exonBlocks[tag].rightSplice - exonBlocks[tag].start ; + adjustStart = i + exonBlocks[tag].start ; + + if ( adjustStart > exonBlocks[tag].start ) + { + int s, e ; + // firstly [s,e] is the range of depth array. + for ( s = 0 ; s < i ; ++s ) + if ( depth[s] >= gMinDepth ) + break ; + for ( e = i - 1 ; e >= s ; -- e ) + if ( depth[e] >= gMinDepth ) + break ; + if ( e >= s ) + { + island = exonBlocks[tag] ; + island.depthSum = 0 ; + island.leftType = island.rightType = 0 ; + for ( j = s ; j <= e ; ++j ) + island.depthSum += depth[j] ; + island.start = s + exonBlocks[tag].start ; // offset the coordinate. + island.end = e + exonBlocks[tag].start ; + } + } + } + else if ( exonBlocks[tag].leftType != 0 && exonBlocks[tag].rightType == 0 ) + { + for ( i = 0 ; i < len ; ++i ) + if ( depth[i] < gMinDepth ) + break ; + --i ; + if ( exonBlocks[tag].leftType == 1 && i + exonBlocks[tag].start < exonBlocks[tag].leftSplice ) + i = exonBlocks[tag].leftSplice - exonBlocks[tag].start ; + adjustEnd = i + exonBlocks[tag].start ; + + if ( adjustEnd < exonBlocks[tag].end ) + { + int s, e ; + // firstly [s,e] is the range of depth array. + for ( s = i + 1 ; s < len ; ++s ) + if ( depth[s] >= gMinDepth ) + break ; + for ( e = len - 1 ; e >= s ; --e ) + if ( depth[e] >= gMinDepth ) + break ; + if ( e >= s ) + { + island = exonBlocks[tag] ; + island.depthSum = 0 ; + island.leftType = island.rightType = 0 ; + for ( j = s ; j <= e ; ++j ) + island.depthSum += depth[j] ; + island.start = s + exonBlocks[tag].start ; // offset the coordinate. 
+ island.end = e + exonBlocks[tag].start ; + } + } + } + else if ( exonBlocks[tag].leftType == 0 && exonBlocks[tag].rightType == 0 ) + { + for ( i = 0 ; i < len ; ++i ) + if ( depth[i] >= gMinDepth ) + break ; + adjustStart = i + exonBlocks[tag].start ; + + for ( i = len - 1 ; i >= 0 ; --i ) + if ( depth[i] >= gMinDepth ) + break ; + adjustEnd = i + exonBlocks[tag].start ; + } + else if ( exonBlocks[tag].leftType == 1 && exonBlocks[tag].rightType == 2 + && exonBlocks[tag].end - exonBlocks[tag].start + 1 >= 1000 ) + { + // The possible merge of two genes or merge of UTRs, if we don't break the low coverage part. + // If we decide to cut, I'll reuse the variable "island" to represent the subexon on right hand side. + int l ; + int gapSize = 30 ; + for ( i = 0 ; i <= len - ( gapSize + 1 ) ; ++i ) + { + if ( depth[i] < gMinDepth ) + { + for ( l = i ; l < i + ( gapSize + 1 ) ; ++l ) + if ( depth[l] >= gMinDepth ) + break ; + if ( l < i + ( gapSize + 1 ) ) + { + i = l - 1 ; + continue ; + } + else + break ; + } + } + + for ( j = len - 1 ; j >= ( gapSize + 1 ) - 1 ; --j ) + { + if ( depth[j] < gMinDepth ) + { + for ( l = j ; l >= j - ( gapSize + 1 ) + 1 ; --l ) + if ( depth[l] >= gMinDepth ) + break ; + + if ( l >= j - ( gapSize + 1 ) + 1 ) + { + j = l + 1 ; + continue ; + } + else + break ; + } + } + + if ( j - i + 1 > gapSize ) + { + // we break. + --i ; ++j ; + + adjustEnd = i + exonBlocks[tag].start ; + + if ( j < len - 1 ) + { + island = exonBlocks[tag] ; + island.depthSum = 0 ; + island.leftType = 0 ; + for ( l = j ; l <= len - 1 ; ++l ) + island.depthSum += depth[j] ; + island.start = j + exonBlocks[tag].start ; // offset the coordinate. 
+ island.end = exonBlocks[tag].end ; + } + + exonBlocks[tag].rightType = 0 ; // put it here, so the island can get the right info + //fprintf( stderr, "break %d %d %d %d.\n", (int)exonBlocks[tag].start, (int)adjustEnd, i + (int)exonBlocks[tag].start, j + (int)exonBlocks[tag].start ) ; + } + } + + int lostDepthSum = 0 ; + for ( i = exonBlocks[tag].start ; i < adjustStart ; ++i ) + lostDepthSum += depth[i - exonBlocks[tag].start ] ; + for ( i = adjustEnd + 1 ; i < exonBlocks[tag].end ; ++i ) + lostDepthSum += depth[i - exonBlocks[tag].start ] ; + exonBlocks[tag].depthSum -= lostDepthSum ; + + delete[] exonBlocks[tag].depth ; + exonBlocks[tag].depth = NULL ; + + //if ( exonBlocks[tag].start == 1562344 ) + // printf( "%d %d\n", adjustStart, adjustEnd ) ; + if ( ( len > 1 && adjustEnd - adjustStart + 1 <= 1 ) || ( adjustEnd - adjustStart + 1 <= 0 ) ) + return ; + exonBlocks[tag].start = adjustStart ; + exonBlocks[tag].end = adjustEnd ; + + if ( island.start != -1 && island.end < exonBlocks[tag].start ) + newExonBlocks.push_back( island ) ; + + newExonBlocks.push_back( exonBlocks[tag] ) ; + if ( island.start != -1 && island.start > exonBlocks[tag].end ) + newExonBlocks.push_back( island ) ; + } + } + public: + std::vector<struct _block> exonBlocks ; + + Blocks() { } + ~Blocks() + { + int blockCnt = exonBlocks.size() ; + for ( int i = 0 ; i < blockCnt ; ++i ) + { + if ( exonBlocks[i].next != NULL ) + { + delete[] exonBlocks[i].next ; + } + if ( exonBlocks[i].prev != NULL ) + { + delete[] exonBlocks[i].prev ; + } + } + } + + double GetAvgDepth( const struct _block &block ) + { + return block.depthSum / (double)( block.end - block.start + 1 ) ; + } + + int BuildExonBlocks( Alignments &alignments ) + { + int tag = 0 ; + while ( alignments.Next() ) + { + int i, j, k ; + int segCnt = alignments.segCnt ; + struct _pair *segments = alignments.segments ; + int eid = 0 ; // the exonblock id that the segment update + + // locate the first exonblock that beyond the start of the 
read. + while ( tag < (int)exonBlocks.size() && ( exonBlocks[tag].end < segments[0].a - 1 + || exonBlocks[tag].chrId != alignments.GetChromId() ) ) + { + ++tag ; + } + // due to the merge of exon blocks, we might need roll back + --tag ; + while ( tag >= 0 && ( exonBlocks[tag].end >= segments[0].a - 1 + && exonBlocks[tag].chrId == alignments.GetChromId() ) ) + { + --tag ; + } + ++tag ; + + for ( i = 0 ; i < segCnt ; ++i ) + { + //if ( i == 0 ) + // printf( "hi %d %s %d %d\n", i, alignments.GetReadId(), segments[i].a, segments[i].b ) ; + for ( j = tag ; j < (int)exonBlocks.size() ; ++j ) + { + if ( exonBlocks[j].end >= segments[i].a - 1 ) + break ; + } + if ( j >= (int)exonBlocks.size() ) + { + // Append a new block + struct _block newSeg ; + newSeg.chrId = alignments.GetChromId() ; + newSeg.start = segments[i].a ; + newSeg.end = segments[i].b ; + + newSeg.leftSplice = -1 ; + newSeg.rightSplice = -1 ; + if ( i > 0 ) + newSeg.leftSplice = segments[i].a ; + if ( i < segCnt - 1 ) + newSeg.rightSplice = segments[i].b ; + + exonBlocks.push_back( newSeg ) ; + + eid = exonBlocks.size() - 1 ; + } + else if ( exonBlocks[j].end < segments[i].b || + ( exonBlocks[j].start > segments[i].a && exonBlocks[j].start <= segments[i].b + 1 ) ) + { + // If overlaps with a current exon block, so we extend it + if ( exonBlocks[j].end < segments[i].b ) + { + // extends toward right + exonBlocks[j].end = segments[i].b ; + if ( i > 0 && ( exonBlocks[j].leftSplice == -1 || segments[i].a < exonBlocks[j].leftSplice ) ) + exonBlocks[j].leftSplice = segments[i].a ; + if ( i < segCnt - 1 && segments[i].b > exonBlocks[j].rightSplice ) + exonBlocks[j].rightSplice = segments[i].b ; + eid = j ; + + // Merge with next few exon blocks + for ( k = j + 1 ; k < (int)exonBlocks.size() ; ++k ) + { + if ( exonBlocks[k].start <= exonBlocks[j].end + 1 ) + { + if ( exonBlocks[k].end > exonBlocks[j].end ) + exonBlocks[j].end = exonBlocks[k].end ; + + if ( exonBlocks[k].leftSplice != -1 && ( 
exonBlocks[j].leftSplice == -1 || exonBlocks[k].leftSplice < exonBlocks[j].leftSplice ) ) + exonBlocks[j].leftSplice = exonBlocks[k].leftSplice ; + if ( exonBlocks[k].rightSplice != -1 && exonBlocks[k].rightSplice > exonBlocks[j].rightSplice ) + exonBlocks[j].rightSplice = exonBlocks[k].rightSplice ; + + } + else + break ; + } + + if ( k > j + 1 ) + { + // Remove the merged blocks + int a, b ; + for ( a = j + 1, b = k ; b < (int)exonBlocks.size() ; ++a, ++b ) + exonBlocks[a] = exonBlocks[b] ; + for ( a = 0 ; a < k - ( j + 1 ) ; ++a ) + exonBlocks.pop_back() ; + } + } + else if ( exonBlocks[j].start > segments[i].a && exonBlocks[j].start <= segments[i].b + 1 ) + { + // extends toward left + exonBlocks[j].start = segments[i].a ; + if ( i > 0 && ( exonBlocks[j].leftSplice == -1 || segments[i].a < exonBlocks[j].leftSplice ) ) + exonBlocks[j].leftSplice = segments[i].a ; + if ( i < segCnt - 1 && segments[i].b > exonBlocks[j].rightSplice ) + exonBlocks[j].rightSplice = segments[i].b ; + eid = j ; + + // Merge with few previous exon blocks + for ( k = j - 1 ; k >= 0 ; --k ) + { + if ( exonBlocks[k].end >= exonBlocks[k + 1].start - 1 ) + { + if ( exonBlocks[k + 1].start < exonBlocks[k].start ) + { + exonBlocks[k].start = exonBlocks[k + 1].start ; + } + + if ( exonBlocks[k].leftSplice != -1 && ( exonBlocks[j].leftSplice == -1 || exonBlocks[k].leftSplice < exonBlocks[j].leftSplice ) ) + exonBlocks[j].leftSplice = exonBlocks[k].leftSplice ; + if ( exonBlocks[k].rightSplice != -1 && exonBlocks[k].rightSplice > exonBlocks[j].rightSplice ) + exonBlocks[j].rightSplice = exonBlocks[k].rightSplice ; + + } + else + break ; + } + + if ( k < j - 1 ) + { + int a, b ; + eid = k + 1 ; + for ( a = k + 2, b = j + 1 ; b < (int)exonBlocks.size() ; ++a, ++b ) + exonBlocks[a] = exonBlocks[b] ; + for ( a = 0 ; a < ( j - 1 ) - k ; ++a ) + exonBlocks.pop_back() ; + + } + } + } + else if ( exonBlocks[j].start > segments[i].b + 1 ) + { + int size = exonBlocks.size() ; + int a ; + // No overlap, 
insert a new block + struct _block newSeg ; + newSeg.chrId = alignments.GetChromId() ; + newSeg.start = segments[i].a ; + newSeg.end = segments[i].b ; + + newSeg.leftSplice = -1 ; + newSeg.rightSplice = -1 ; + if ( i > 0 ) + newSeg.leftSplice = segments[i].a ; + if ( i < segCnt - 1 ) + newSeg.rightSplice = segments[i].b ; + + // Insert at position j + exonBlocks.push_back( newSeg ) ; + for ( a = size ; a > j ; --a ) + exonBlocks[a] = exonBlocks[a - 1] ; + exonBlocks[a] = newSeg ; + + eid = a ; + } + else + { + // The segment is contained in j + eid = -1 ; + } + + // Merge the block with the mate if the gap is very small + // Note that since reads are already sorted by coordinate, + // the previous exon blocks is built completely. + /*int64_t matePos ; + int mateChrId ; + alignments.GetMatePosition( mateChrId, matePos ) ; + if ( i == 0 && eid > 0 && mateChrId == exonBlocks[ eid ].chrId + && matePos < segments[0].a + && matePos >= exonBlocks[eid - 1].start + && matePos <= exonBlocks[eid - 1].end + && segments[0].a - matePos + 1 <= 500 + && exonBlocks[eid-1].chrId == exonBlocks[eid].chrId + && exonBlocks[eid].start - exonBlocks[eid - 1].end - 1 <= 30 ) + { + printf( "%d: (%d-%d) (%d-%d). 
%d %d\n", ( segments[0].a + alignments.readLen - 1 ) - matePos + 1, exonBlocks[eid - 1].start, exonBlocks[eid - 1].end, + exonBlocks[eid].start, + exonBlocks[eid].end, eid, exonBlocks.size() ) ; + exonBlocks[eid - 1].end = exonBlocks[eid].end ; + + if ( exonBlocks[eid].leftSplice != -1 && ( exonBlocks[eid - 1].leftSplice == -1 || exonBlocks[eid].leftSplice < exonBlocks[eid - 1].leftSplice ) ) + exonBlocks[eid - 1].leftSplice = exonBlocks[k].leftSplice ; + if ( exonBlocks[eid].rightSplice != -1 && exonBlocks[eid].rightSplice > exonBlocks[eid - 1].rightSplice ) + exonBlocks[eid - 1].rightSplice = exonBlocks[eid].rightSplice ; + + int size = exonBlocks.size() ; + for ( j = eid ; j < size - 1 ; ++j ) + exonBlocks[j] = exonBlocks[j + 1] ; + exonBlocks.pop_back() ; + }*/ + } + } + /*for ( int i = 0 ; i < (int)exonBlocks.size() ; ++i ) + { + printf( "%d %d\n", exonBlocks[i].start, exonBlocks[i].end ) ; + }*/ + + if ( exonBlocks.size() > 0 ) + { + BuildBlockChrIdOffset() ; + int cnt = exonBlocks.size() ; + for ( int i = 0 ; i < cnt ; ++i ) + { + exonBlocks[i].contigId = exonBlocks[i].chrId ; + + exonBlocks[i].leftType = 0 ; + exonBlocks[i].rightType = 0 ; + exonBlocks[i].depth = NULL ; + exonBlocks[i].nextCnt = 0 ; + exonBlocks[i].prevCnt = 0 ; + exonBlocks[i].leftStrand = '.' ; + exonBlocks[i].rightStrand = '.' 
; + } + } + return exonBlocks.size() ; + } + + void FilterSplitSitesInRegions( std::vector<struct _splitSite> &sites ) + { + int i, j, k ; + int size = sites.size() ; + int bsize = exonBlocks.size() ; + int tag = 0 ; + + for ( i = 0 ; i < bsize ; ++i ) + { + while ( tag < size && ( sites[tag].chrId < exonBlocks[i].chrId || + ( sites[tag].chrId == exonBlocks[i].chrId && sites[tag].pos < exonBlocks[i].start ) ) ) + { + ++tag ; + } + if ( tag >= size ) + break ; + + if ( sites[tag].chrId != exonBlocks[i].chrId || sites[tag].pos > exonBlocks[i].end ) + continue ; + + for ( j = tag + 1 ; j < size ; ++j ) + if ( sites[j].chrId != exonBlocks[i].chrId || sites[j].pos > exonBlocks[i].end ) + break ; + + // [tag,j) holds the split sites in this region. + int leftStrandSupport[2] = {0, 0} ; + int rightStrandSupport[2] = {0, 0} ; + int strandCnt[2] = { 0, 0 } ; + for ( k = tag ; k < j ; ++k ) + { + if ( sites[k].strand == '-' ) + { + ++strandCnt[0] ; + if ( sites[k].oppositePos < exonBlocks[i].start ) + leftStrandSupport[0] += sites[k].support ; + else if ( sites[k].oppositePos > exonBlocks[i].end ) + rightStrandSupport[0] += sites[k].support ; + } + else if ( sites[k].strand == '+' ) + { + ++strandCnt[1] ; + if ( sites[k].oppositePos < exonBlocks[i].start ) + leftStrandSupport[1] += sites[k].support ; + else if ( sites[k].oppositePos > exonBlocks[i].end ) + rightStrandSupport[1] += sites[k].support ; + } + + } + + + if ( leftStrandSupport[0] == 0 && rightStrandSupport[0] == 0 + && leftStrandSupport[1] != 0 && rightStrandSupport[1] != 0 && strandCnt[0] == 2 ) + { + // check whether a different strand accidentally show up in this region. 
+ bool remove = false ; + for ( k = tag ; k < j ; ++k ) + { + if ( sites[k].strand == '-' && + sites[k].oppositePos >= sites[k].pos && sites[k].oppositePos <= exonBlocks[i].end && + sites[k].support <= 2 ) + { + remove = true ; + break ; + } + } + + if ( remove ) + for ( k = tag ; k < j ; ++k ) + if ( sites[k].strand == '-' ) + sites[k].support = -1 ; + } + else if ( leftStrandSupport[1] == 0 && rightStrandSupport[1] == 0 + && leftStrandSupport[0] != 0 && rightStrandSupport[0] != 0 && strandCnt[1] == 2 ) + { + // check whether a different strand accidentally show up in this region. + bool remove = false ; + for ( k = tag ; k < j ; ++k ) + { + if ( sites[k].strand == '+' && + sites[k].oppositePos >= sites[k].pos && sites[k].oppositePos <= exonBlocks[i].end && + sites[k].support <= 2 ) + { + remove = true ; + break ; + } + } + + if ( remove ) + for ( k = tag ; k < j ; ++k ) + if ( sites[k].strand == '+' ) + sites[k].support = -1 ; + } + + tag = j ; + } + + k = 0 ; + for ( i = 0 ; i < size ; ++i ) + if ( sites[i].support > 0 ) + { + sites[k] = sites[i] ; + ++k ; + } + sites.resize( k ) ; + } +
	// Filter the pairs of split sites that likely merge two genes, and then the
	// introns that sit on the minority strand of a region.
	// The first filter targets the case like
	//   [..]------------------------[..]
	//   [..]--[..]            [..]-[...]
	//
	// sites is modified in place: a filtered site gets support -1 and every site
	// whose support is not positive is removed from the vector.
	// The loops below walk sites and exonBlocks in parallel, comparing chrId and
	// then pos/start, so both are presumably sorted that way -- confirm with the
	// caller.
	void FilterGeneMergeSplitSites( std::vector<struct _splitSite> &sites )
	{
		int i, j, k ;
		int bsize = exonBlocks.size() ;
		int ssize = sites.size() ;

		// adj[0..bsize) are the list headers, one per block; the edges are taken
		// from the rest of the pool. Every site can add at most one edge, so
		// ssize extra entries are always enough (ssize / 2 only suffices when the
		// sites are perfectly paired).
		struct _pair32 *siteToBlock = new struct _pair32[ ssize ] ;
		struct _adj *adj = new struct _adj[ ssize + bsize ] ;
		bool *visited = new bool[bsize] ;

		int adjCnt = 0 ;
		for ( i = 0 ; i < bsize ; ++i )
		{
			adj[i].info = 0 ; // in the header, info means the number of next block
			adj[i].support = 0 ;
			adj[i].next = -1 ;
		}
		adjCnt = i ;
		memset( siteToBlock, -1, sizeof( struct _pair32 ) * ssize ) ; // -1: site not attached to an edge
		memset( visited, false, sizeof( bool ) * bsize ) ;

		// Build the graph: an edge i->j for every site inside block i whose
		// opposite position falls inside a later block j.
		k = 0 ;
		for ( i = 0 ; i < bsize ; ++i )
		{
			// Skip the backward sites and the sites located before block i.
			for ( ; k < ssize ; ++k )
			{
				if ( sites[k].oppositePos < sites[k].pos )
					continue ;
				if ( sites[k].chrId < exonBlocks[i].chrId
					|| ( sites[k].chrId == exonBlocks[i].chrId && sites[k].pos < exonBlocks[i].start ) )
					continue ;
				break ;
			}

			for ( ; k < ssize ; ++k )
			{
				if ( sites[k].chrId > exonBlocks[i].chrId
					|| sites[k].pos > exonBlocks[i].end )
					break ;

				if ( sites[k].oppositePos <= exonBlocks[i].end ) // ignore self-loop
					continue ;

				for ( j = i + 1 ; j < bsize ; ++j )
					if ( sites[k].oppositePos >= exonBlocks[j].start && sites[k].oppositePos <= exonBlocks[j].end )
						break ;
				// j == bsize means no later block contains the opposite position;
				// exonBlocks[j] must not be read in that case.
				if ( j >= bsize )
					continue ;

				int p = adj[i].next ;
				while ( p != -1 )
				{
					if ( adj[p].ind == j )
					{
						// The edge exists already: one more site supports it.
						++adj[p].info ;
						adj[p].support += sites[k].uniqSupport ;
						break ;
					}
					p = adj[p].next ;
				}
				if ( p == -1 )
				{
					// New edge, put at the head of the list of block i.
					adj[ adjCnt ].ind = j ;
					adj[ adjCnt ].info = 1 ;
					adj[ adjCnt ].support = sites[k].uniqSupport ;
					adj[ adjCnt ].next = adj[i].next ;
					adj[i].next = adjCnt ;
					++adj[i].info ;
					++adjCnt ;
				}

				siteToBlock[k].a = i ;
				siteToBlock[k].b = j ;
			}
		}

		for ( k = 0 ; k < ssize ; ++k )
		{
			// Only long introns (>= 20000) with fewer than 30 unique supports are candidates.
			if ( sites[k].oppositePos - sites[k].pos + 1 < 20000 || sites[k].uniqSupport >= 30 )
				continue ;

			int from = siteToBlock[k].a ;
			int to = siteToBlock[k].b ;
			// An unattached site has from == to == -1 and is rejected by the first test,
			// before adj[from] is evaluated.
			if ( to - from - 1 < 2 || adj[from].info <= 1 )
				continue ;

			memset( &visited[from], false, sizeof( bool ) * ( to - from + 1 ) ) ;

			int cnt = 0 ;
			int p = adj[from].next ;
			int bestSupport = -1 ;
			int bestP = 0 ;
			while ( p != -1 )
			{
				if ( adj[p].support >= 10 && adj[p].ind <= to )
					++cnt ;
				if ( adj[p].support > bestSupport || ( adj[p].support == bestSupport && adj[p].ind == to ) )
				{
					bestSupport = adj[p].support ;
					bestP = p ;
				}

				// Keep the site when more than one site supports the edge from->to.
				if ( adj[p].ind == to && adj[p].info > 1 )
				{
					cnt = 0 ;
					break ;
				}
				p = adj[p].next ;
			}
			if ( cnt <= 1 )
				continue ;

			// The edge to "to" is the best supported one leaving "from".
			if ( bestSupport != -1 && adj[bestP].ind == to )
				continue ;

			// No other path can reach "to"
			p = adj[from].next ;
			while ( p != -1 )
			{
				if ( adj[p].ind != to && CanReach( adj[p].ind, to, adj, visited ) )
					break ;
				p = adj[p].next ;
			}
			if ( p != -1 )
				continue ;

			// There are some blocks between that can reach "to"
			for ( i = from + 1 ; i < to ; ++i )
			{
				p = adj[i].next ;
				while ( p != -1 )
				{
					if ( adj[p].ind == to && adj[p].support >= 10 )
						break ;
					p = adj[p].next ;
				}
				if ( p != -1 )
					break ;
			}
			if ( i >= to )
				continue ;

			// Filter the site and its partner at the other end of the intron.
			sites[k].support = -1 ;
			for ( j = k + 1 ; j < ssize ; ++j )
			{
				if ( sites[j].pos == sites[k].oppositePos && sites[j].oppositePos == sites[k].pos )
					sites[j].support = -1 ;
				if ( sites[j].pos > sites[k].oppositePos )
					break ;
			}
		}

		// Drop the filtered sites.
		k = 0 ;
		for ( i = 0 ; i < ssize ; ++i )
			if ( sites[i].support > 0 )
			{
				sites[k] = sites[i] ;
				++k ;
			}
		sites.resize( k ) ;

		// Filter the intron on the different strand of a gene.
		ssize = k ;
		k = 0 ;
		for ( i = 0 ; i < bsize ; )
		{
			int farthest = exonBlocks[i].end ;

			for ( j = i ; j < bsize ; ++j )
			{
				if ( exonBlocks[j].start > farthest || exonBlocks[i].chrId != exonBlocks[j].chrId )
					break ;
				// NOTE(review): this walks the edges of block i on every iteration, so
				// farthest only covers the direct neighbors of i. adj[j] may have been
				// intended; kept as is because changing it alters which sites are filtered.
				int p = adj[i].next ;
				while ( p != -1 )
				{
					if ( exonBlocks[ adj[p].ind ].end > farthest )
						farthest = exonBlocks[ adj[p].ind ].end ;
					p = adj[p].next ;
				}
			}

			// Move to the first site that is not before block i.
			for ( ; k < ssize ; ++k )
			{
				if ( sites[k].chrId < exonBlocks[i].chrId ||
					( sites[k].chrId == exonBlocks[i].chrId && sites[k].pos < exonBlocks[i].start ) )
					continue ;
				break ;
			}
			// No site is left, so no later region can contain one either.
			// This also keeps sites[k] below from being read past the end.
			if ( k >= ssize )
				break ;

			if ( sites[k].chrId > exonBlocks[i].chrId || sites[k].pos > farthest )
			{
				i = j ;
				continue ;
			}

			// [from, to) are the sites within [exonBlocks[i].start, farthest].
			int from = k ;
			int to ;
			for ( to = k ; to < ssize ; ++to )
				if ( sites[to].chrId != exonBlocks[i].chrId || sites[to].pos > farthest )
					break ;

			// Index 0 collects every strand other than '+', index 1 collects '+'.
			// Only the forward site of each intron is counted.
			int strandSupport[2] = {0, 0} ;
			int strandCount[2] = {0, 0} ;
			for ( k = from ; k < to ; ++k )
			{
				if ( sites[k].oppositePos <= sites[k].pos )
					continue ;

				int tag = ( sites[k].strand == '+' ? 1 : 0 ) ;
				strandSupport[tag] += sites[k].support ;
				++strandCount[tag] ;
			}

			// A strand is removed when it has a single intron with support <= 5 while the
			// other one has at least 3 introns and at least 20 times the support.
			// When only one group is present none of the two tests can hold.
			char removeStrand = 0 ;
			if ( strandCount[0] * strandCount[1] != 0 )
			{
				if ( strandCount[0] == 1 && strandCount[1] >= 3
					&& strandSupport[1] >= 20 * strandSupport[0] && strandSupport[0] <= 5 )
				{
					removeStrand = '-' ;
				}
				else if ( strandCount[1] == 1 && strandCount[0] >= 3
					&& strandSupport[0] >= 20 * strandSupport[1] && strandSupport[1] <= 5 )
				{
					removeStrand = '+' ;
				}
			}

			if ( removeStrand != 0 )
			{
				for ( k = from ; k < to ; ++k )
				{
					if ( sites[k].strand == removeStrand && sites[k].oppositePos >= exonBlocks[i].start && sites[k].oppositePos <= farthest )
						sites[k].support = -1 ;
				}
			}

			i = j ;
			k = to ;
		}

		// Drop the sites filtered by the strand test.
		k = 0 ;
		for ( i = 0 ; i < ssize ; ++i )
			if ( sites[i].support > 0 )
			{
				sites[k] = sites[i] ;
				++k ;
			}
		sites.resize( k ) ;

		delete[] visited ;
		delete[] siteToBlock ;
		delete[] adj ;
	}
+ + void SplitBlocks( Alignments &alignments, std::vector< struct _splitSite > &splitSites ) + { + std::vector<struct _block> rawExonBlocks = exonBlocks ; + int i, j ; + int tag = 0 ; + int bsize = rawExonBlocks.size() ; + int ssize = splitSites.size() ; + + // Make sure not overflow + struct _splitSite tmp ; + tmp.pos = -1 ; + splitSites.push_back( tmp ) ; + + exonBlocks.clear() ; + for ( i = 0 ; i < bsize ; ++i ) + { + while ( tag < ssize && ( splitSites[tag].chrId < rawExonBlocks[i].chrId || + ( splitSites[tag].chrId == rawExonBlocks[i].chrId && splitSites[tag].pos < rawExonBlocks[i].start ) ) ) + ++tag ; + + int l ; + for ( l = tag ; l < ssize ; ++l ) + if ( splitSites[l].chrId != rawExonBlocks[i].chrId || splitSites[l].pos > rawExonBlocks[i].end ) + break ; + int64_t start = rawExonBlocks[i].start ; + int64_t end = rawExonBlocks[i].end ; + //printf( "%lld %lld\n", start, end ) ; + //printf( "%s %s: %d %d\n", alignments.GetChromName( splitSites[tag].chrId ), 
alignments.GetChromName( rawExonBlocks[i].chrId ), tag, l ) ; + for ( j = tag ; j <= l ; ++j ) + { + int leftType = 0 ; + int rightType = 0 ; + if ( j > tag ) + { + start = splitSites[j - 1].pos ; // end is from previous stage + leftType = splitSites[j - 1].type ; + } + else + start = rawExonBlocks[i].start ; + + if ( j <= l - 1 ) + { + end = splitSites[j].pos ; + rightType = splitSites[j].type ; + } + else + end = rawExonBlocks[i].end ; + + if ( leftType == 2 ) + ++start ; + if ( rightType == 1 ) + --end ; + + struct _block tmpB ; + tmpB = rawExonBlocks[i] ; + tmpB.start = start ; + tmpB.end = end ; + tmpB.depthSum = 0 ; + tmpB.ratio = 0 ; + //printf( "\t%lld %lld\n", start, end ) ; + tmpB.leftType = leftType ; + tmpB.rightType = rightType ; + tmpB.depth = NULL ; + + exonBlocks.push_back( tmpB ) ; + // handle the soft boundary is the same as the hard boundary case + // or adjacent splice sites + /*if ( j == tag && start == end ) + { + exonBlocks.pop_back() ; + --end ; + } + else if ( j == l && start > end ) + { + exonBlocks.pop_back() ; + }*/ + if ( start > end + //|| ( tmpB.start == rawExonBlocks[i].start && tmpB.end != rawExonBlocks[i].end && tmpB.end - tmpB.start + 1 <= 7 ) + //|| ( tmpB.end == rawExonBlocks[i].end && tmpB.start != rawExonBlocks[i].start && tmpB.end - tmpB.start + 1 <= 7 )) // the case of too short overhang + || ( tmpB.leftType == 0 && tmpB.rightType == 2 && tmpB.end - tmpB.start + 1 <= 9 ) + || ( tmpB.leftType == 1 && tmpB.rightType == 0 && tmpB.end - tmpB.start + 1 <= 9 ) ) + { + exonBlocks.pop_back() ; + } + /*else if ( start == end ) + { + + }*/ + } + } + BuildBlockChrIdOffset() ; + } + + void ComputeDepth( Alignments &alignments ) + { + // Go through the alignment list again to fill in the depthSum; + int i ; + int j ; + int tag = 0 ; + int blockCnt = exonBlocks.size() ; + + std::vector<struct _block> newExonBlocks ; + while ( alignments.Next() ) + { + int segCnt = alignments.segCnt ; + struct _pair *segments = alignments.segments ; + + 
while ( tag < blockCnt && ( exonBlocks[tag].chrId < alignments.GetChromId() || + ( exonBlocks[tag].chrId == alignments.GetChromId() && exonBlocks[tag].end < segments[0].a ) ) ) + { + AdjustAndCreateExonBlocks( tag, newExonBlocks ) ; + ++tag ; + } + for ( i = 0 ; i < segCnt ; ++i ) + { + for ( j = tag ; j < blockCnt && ( exonBlocks[j].chrId == alignments.GetChromId() && exonBlocks[j].start <= segments[i].b ) ; ++j ) + { + int64_t s = -1, e = -1 ; + exonBlocks[j].depthSum += Overlap( segments[i].a, segments[i].b, exonBlocks[j].start, exonBlocks[j].end, s, e ) ; + if ( s == -1 ) + continue ; + + if ( exonBlocks[j].depth == NULL ) + { + int len = exonBlocks[j].end - exonBlocks[j].start + 1 ; + exonBlocks[j].depth = new int[len + 1] ; + memset( exonBlocks[j].depth, 0, sizeof( int ) * ( len + 1 ) ) ; + exonBlocks[j].leftSplice = exonBlocks[j].start ; + exonBlocks[j].rightSplice = exonBlocks[j].end ; + } + int *depth = exonBlocks[j].depth ; + ++depth[s - exonBlocks[j].start] ; + --depth[e - exonBlocks[j].start + 1] ; // notice the +1 here, since the decrease of coverage actually happens at next base. + + // record the longest alignment stretch support the splice site. + if ( exonBlocks[j].leftType == 1 && segments[i].a == exonBlocks[j].start + && e > exonBlocks[j].leftSplice ) + { + exonBlocks[j].leftSplice = e ; + } + if ( exonBlocks[j].rightType == 2 && segments[i].b == exonBlocks[j].end + && s < exonBlocks[j].rightSplice ) + { + exonBlocks[j].rightSplice = s ; + } + } + } + } + + for ( ; tag < blockCnt ; ++tag ) + AdjustAndCreateExonBlocks( tag, newExonBlocks ) ; + exonBlocks.clear() ; + + // Due to multi-alignment, we may skip some alignments that determines leftSplice and + // rightSplice, hence filtered some subexons. As a result, some subexons' anchor subexon will disapper + // and we need to change its boundary type. 
+ blockCnt = newExonBlocks.size() ; + for ( i = 0 ; i < blockCnt ; ++i ) + { + if ( ( i == 0 && newExonBlocks[i].leftType == 2 ) || + ( i > 0 && newExonBlocks[i].leftType == 2 && newExonBlocks[i - 1].end + 1 != newExonBlocks[i].start ) ) + { + newExonBlocks[i].leftType = 0 ; + } + + if ( ( i == blockCnt - 1 && newExonBlocks[i].rightType == 1 ) || + ( i < blockCnt - 1 && newExonBlocks[i].rightType == 1 && + newExonBlocks[i].end + 1 != newExonBlocks[i + 1].start ) ) + newExonBlocks[i].rightType = 0 ; + } + exonBlocks = newExonBlocks ; + } + + // If two blocks whose soft boundary are close to each other, we can merge them. + void MergeNearBlocks() + { + std::vector<struct _block> rawExonBlocks = exonBlocks ; + int i, k ; + int bsize = rawExonBlocks.size() ; + int *newIdx = new int[bsize] ; // used for adjust prev and next. Note that by merging, it won't change the number of prevCnt or nextCnt. + + exonBlocks.clear() ; + exonBlocks.push_back( rawExonBlocks[0] ) ; + k = 0 ; + newIdx[0] = 0 ; + for ( i = 1 ; i < bsize ; ++i ) + { + if ( rawExonBlocks[i].chrId == exonBlocks[k].chrId + && rawExonBlocks[i].leftType == 0 && exonBlocks[k].rightType == 0 + && rawExonBlocks[i].start - exonBlocks[k].end - 1 <= 50 + && rawExonBlocks[i].depthSum / double( rawExonBlocks[i].end - rawExonBlocks[i].start + 1 ) < 3.0 + && exonBlocks[k].depthSum / double( exonBlocks[k].end - exonBlocks[i].start + 1 ) < 3.0 ) + { + exonBlocks[k].end = rawExonBlocks[i].end ; + exonBlocks[k].rightType = rawExonBlocks[i].rightType ; + exonBlocks[k].rightStrand = rawExonBlocks[i].rightStrand ; + exonBlocks[k].depthSum += rawExonBlocks[i].depthSum ; + if ( rawExonBlocks[i].rightSplice != -1 ) + exonBlocks[k].rightSplice = rawExonBlocks[i].rightSplice ; + + if ( exonBlocks[k].nextCnt > 0 ) + delete[] exonBlocks[k].next ; + exonBlocks[k].nextCnt = rawExonBlocks[i].nextCnt ; + exonBlocks[k].next = rawExonBlocks[i].next ; + } + else + { + exonBlocks.push_back( rawExonBlocks[i] ) ; + ++k ; + } + newIdx[i] = k ; + 
} + BuildBlockChrIdOffset() ; + bsize = exonBlocks.size() ; + for ( i = 0 ; i < bsize ; ++i ) + { + int cnt = exonBlocks[i].prevCnt ; + for ( k = 0 ; k < cnt ; ++k ) + exonBlocks[i].prev[k] = newIdx[ exonBlocks[i].prev[k] ] ; + + cnt = exonBlocks[i].nextCnt ; + for ( k = 0 ; k < cnt ; ++k ) + exonBlocks[i].next[k] = newIdx[ exonBlocks[i].next[k] ] ; + } + delete[] newIdx ; + } + + void AddIntronInformation( std::vector<struct _splitSite> &sites, Alignments &alignments ) + { + // Add the connection information, support information and the strand information. + int i, j, k, tag ; + tag = 0 ; + int scnt = sites.size() ; + int exonBlockCnt = exonBlocks.size() ; + bool flag ; + + for ( i = 0 ; i < exonBlockCnt ; ++i ) + { + exonBlocks[i].prevCnt = exonBlocks[i].nextCnt = 0 ; + exonBlocks[i].prev = exonBlocks[i].next = NULL ; + exonBlocks[i].leftStrand = exonBlocks[i].rightStrand = '.' ; + exonBlocks[i].leftSplice = exonBlocks[i].rightSplice = 0 ; + } + + // Now, we add the region id and compute the length of each region, here the region does not contain possible IR event. 
+ int regionId = 0 ; + for ( i = 0 ; i < exonBlockCnt ; ) + { + if ( exonBlocks[i].leftType == 2 && exonBlocks[i].rightType == 1 ) + { + j = i + 1 ; + } + else + for ( j = i + 1 ; j < exonBlockCnt ; ++j ) + if ( exonBlocks[j].start > exonBlocks[j - 1].end + 1 || ( exonBlocks[j].leftType == 2 && exonBlocks[j].rightType == 1 ) ) + break ; + for ( k = i ; k < j ; ++k ) + exonBlocks[k].contigId = regionId ; + ++regionId ; + i = j ; + } + int *regionLength = new int[regionId] ; + memset( regionLength, 0, sizeof( int ) * regionId ) ; + for ( i = 0 ; i < exonBlockCnt ; ++i ) + { + regionLength[ exonBlocks[i].contigId ] += exonBlocks[i].end - exonBlocks[i].start + 1 ; + } + + for ( i = 0 ; i < scnt ; ) + { + for ( j = i ; j < scnt ; ++j ) + { + if ( sites[j].chrId != sites[i].chrId || + sites[j].pos != sites[i].pos || + sites[j].type != sites[i].type ) + break ; + } + // [i,j-1] are the indices of the sites with same coordinate + // It is possible a sites corresponds to 2 strands, then we should only keep one of them. + char strand = '.' 
; + int strandSupport[2] = {0, 0}; + for ( k = i ; k < j ; ++k ) + { + if ( sites[k].strand == '-' ) + strandSupport[0] += sites[k].support ; + else if ( sites[k].strand == '+' ) + strandSupport[1] += sites[k].support ; + } + if ( strandSupport[0] > strandSupport[1] ) + strand = '-' ; + else if ( strandSupport[1] > strandSupport[0] ) + strand = '+' ; + + + int cnt = j - i ; + // Locate the first subexon that can overlap with this site + while ( tag < exonBlockCnt ) + { + if ( ( exonBlocks[tag].chrId == sites[i].chrId && exonBlocks[tag].end >= sites[i].pos ) + || exonBlocks[tag].chrId > sites[i].chrId ) + break ; + ++tag ; + } + flag = false ; + for ( k = tag; k < exonBlockCnt ; ++k ) + { + if ( exonBlocks[tag].start > sites[i].pos || + exonBlocks[tag].chrId > sites[i].chrId ) + break ; + + if ( exonBlocks[tag].start == sites[i].pos || + exonBlocks[tag].end == sites[i].pos ) + { + flag = true ; + break ; + } + } + + if ( !flag ) + { + i = j ; + continue ; + } + + if ( sites[i].type == 1 && exonBlocks[tag].start == sites[i].pos ) + { + exonBlocks[tag].prevCnt = 0 ; + exonBlocks[tag].prev = new int[cnt] ; + exonBlocks[tag].leftStrand = strand ; + + // And we also need to put the "next" here. 
+ // Here we assume the oppositePos sorted in increasing order + k = tag - 1 ; + for ( int l = j - 1 ; l >= i ; --l ) + { + for ( ; k >= 0 ; --k ) + { + if ( exonBlocks[k].end < sites[l].oppositePos || exonBlocks[k].chrId != sites[l].chrId ) + break ; + if ( exonBlocks[k].end == sites[l].oppositePos ) //&& exonBlocks[k].rightType == sites[l].type ) + { + if ( sites[l].strand != strand || + sites[l].strand != exonBlocks[k].rightStrand ) + break ; + exonBlocks[k].next[ exonBlocks[k].nextCnt ] = tag ; + exonBlocks[tag].prev[ exonBlocks[tag].prevCnt] = k ; // Note that the prev are sorted in decreasing order + ++exonBlocks[k].nextCnt ; + ++exonBlocks[tag].prevCnt ; + double factor = 1.0 ; + if ( regionLength[ exonBlocks[k].contigId ] < alignments.readLen ) + { + int left ; + for ( left = k ; left >= 0 ; --left ) + if ( exonBlocks[left].contigId != exonBlocks[k].contigId ) + break ; + ++left ; + + int prevType = 0 ; + // only consider the portion upto k. + for ( int m = left ; m <= k ; ++m ) + if ( exonBlocks[m].leftType == 1 ) + ++prevType ; + + if ( prevType == 0 ) + factor = 2 * (double)alignments.readLen / regionLength[ exonBlocks[k].contigId ] ; + } + if ( regionLength[ exonBlocks[tag].contigId ] < alignments.readLen ) + { + int right ; + for ( right = tag ; right < exonBlockCnt ; ++right ) + if ( exonBlocks[right].contigId != exonBlocks[tag].contigId ) + break ; + --right ; + + int nextType = 0 ; + for ( int m = tag ; m <= right ; ++m ) + if ( exonBlocks[m].rightType == 2 ) + ++nextType ; + + if ( nextType == 0 ) + factor = 2 * (double)alignments.readLen / regionLength[ exonBlocks[tag].contigId ] ; + } + if ( factor > 10 ) + factor = 10 ; + + if ( exonBlocks[k].contigId != exonBlocks[tag].contigId ) // the support of introns between regions. + { + // Adjust the support if one of the anchor exon is too short. + // This avoid the creation of alternative 3'/5' end due to low coverage from too short anchor. 
+ exonBlocks[tag].leftSplice += sites[l].support * factor ; + exonBlocks[k].rightSplice += sites[l].support * factor ; + } + break ; + } + } + } + } + else if ( sites[i].type == 2 && exonBlocks[tag].end == sites[i].pos ) + { + exonBlocks[tag].nextCnt = 0 ; // cnt ; it should reach cnt after putting the ids in + exonBlocks[tag].next = new int[cnt] ; + exonBlocks[tag].rightStrand = strand ; + } + + i = j ; + } + // Reverse the prev list + for ( i = 0 ; i < exonBlockCnt ; ++i ) + { + for ( j = 0, k = exonBlocks[i].prevCnt - 1 ; j < k ; ++j, --k ) + { + int tmp = exonBlocks[i].prev[j] ; + exonBlocks[i].prev[j] = exonBlocks[i].prev[k] ; + exonBlocks[i].prev[k] = tmp ; + } + } + + // If all of the subexon anchored the intron are filtered, we need to change the type of the other anchor. + for ( i = 0 ; i < exonBlockCnt ; ++i ) + { + if ( exonBlocks[i].leftType == 1 && exonBlocks[i].prevCnt == 0 ) + { + exonBlocks[i].leftType = 0 ; + exonBlocks[i].leftStrand = '.' ; + if ( i > 0 && exonBlocks[i - 1].rightType == 1 ) + exonBlocks[i - 1].rightType = 0 ; + } + if ( exonBlocks[i].rightType == 2 && exonBlocks[i].nextCnt == 0 ) + { + exonBlocks[i].rightType = 0 ; + exonBlocks[i].rightStrand = '.' ; + if ( i < exonBlockCnt - 1 && exonBlocks[i + 1].leftType == 2 ) + exonBlocks[i + 1].leftType = 0 ; + } + } + MergeNearBlocks() ; + + delete[] regionLength ; + } + + double PickLeftAndRightRatio( double l, double r ) + { + if ( l >= 0 && r >= 0 ) + return l < r ? 
l : r ; + else if ( l < 0 && r < 0 ) + return -1 ; + else if ( l < 0 ) + return r ; + else + return l ; + } + + double PickLeftAndRightRatio( const struct _block &b ) + { + return PickLeftAndRightRatio( b.leftRatio, b.rightRatio ) ; + } + + void ComputeRatios() + { + int i, j ; + int exonBlockCnt = exonBlocks.size() ; + for ( i = 0 ; i < exonBlockCnt ; ++i ) + { + exonBlocks[i].leftRatio = -1 ; + exonBlocks[i].rightRatio = -1 ; + if ( exonBlocks[i].leftType == 2 && exonBlocks[i].rightType == 1 ) + { + // ]...[ + double anchorAvg = 0 ; + double avgDepth = GetAvgDepth( exonBlocks[i] ) ; + if ( i >= 1 && exonBlocks[i - 1].chrId == exonBlocks[i].chrId ) + { + anchorAvg = GetAvgDepth( exonBlocks[i - 1] ) ; + if ( anchorAvg > 1 && avgDepth > 1 ) + exonBlocks[i].leftRatio = ( avgDepth - 1 ) / ( anchorAvg - 1 ) ; + } + if ( i < exonBlockCnt - 1 && exonBlocks[i + 1].chrId == exonBlocks[i].chrId ) + { + anchorAvg = GetAvgDepth( exonBlocks[i + 1] ) ; + if ( anchorAvg > 1 && avgDepth > 1 ) + exonBlocks[i].rightRatio = ( avgDepth - 1 ) / ( anchorAvg - 1 ) ; + } + } + + if ( exonBlocks[i].leftType == 0 && exonBlocks[i].rightType == 1 ) + { + // (..[ , overhang subexon. + double avgDepth = GetAvgDepth( exonBlocks[i] ) ; + double anchorAvg = GetAvgDepth( exonBlocks[i + 1] ) ; + if ( avgDepth > 1 && anchorAvg > 1 ) + exonBlocks[i].rightRatio = ( avgDepth - 1 ) / ( anchorAvg - 1 ) ; + } + + /*if ( exonBlocks[i].leftType == 1 ) + { + // For the case (...[, the leftRatio is actuall the leftratio of the subexon on its right. 
+ int len = 0 ; + double depthSum = 0 ; + int tag = i ; + + for ( j = 0 ; j < exonBlocks[tag].prevCnt ; ++j ) + { + int k = exonBlocks[tag].prev[j] ; + len += ( exonBlocks[k].end - exonBlocks[k].start + 1 ) ; + depthSum += exonBlocks[k].depthSum ; + } + double otherAvgDepth = depthSum / len ; + double avgDepth = GetAvgDepth( exonBlocks[tag] ) ; + + // adjust avgDepth by looking into the following exon blocks(subexon) in the same region whose left side is not hard end + for ( j = tag + 1 ; j < exonBlockCnt ; ++j ) + { + if ( exonBlocks[j].start != exonBlocks[j - 1].end + 1 || exonBlocks[j].leftType == 1 ) + break ; + double tmp = GetAvgDepth( exonBlocks[j] ) ; + if ( tmp > avgDepth ) + avgDepth = tmp ; + } + + if ( avgDepth < 1 ) + avgDepth = 1 ; + if ( otherAvgDepth < 1 ) + otherAvgDepth = 1 ; + + //exonBlocks[i].leftRatio = pow( log( avgDepth ) / log(2.0), 2.0 / 3.0 ) - pow( log( otherAvgDepth ) / log( 2.0 ), 2.0 / 3.0 ); + exonBlocks[i].leftRatio = sqrt( avgDepth ) - log( avgDepth ) - ( sqrt( otherAvgDepth ) + log( otherAvgDepth ) ) ; + + }*/ + if ( exonBlocks[i].rightType == 0 && exonBlocks[i].leftType == 2 ) + { + // for overhang subexon. 
+ double avgDepth = GetAvgDepth( exonBlocks[i] ) ; + double anchorAvg = GetAvgDepth( exonBlocks[i - 1] ) ; + if ( avgDepth > 1 && anchorAvg > 1 ) + exonBlocks[i].leftRatio = ( avgDepth - 1 ) / ( anchorAvg - 1 ) ; + + } + /*if ( exonBlocks[i].rightType == 2 ) + { + int len = 0 ; + double depthSum = 0 ; + int tag = i ; + + for ( j = 0 ; j < exonBlocks[tag].nextCnt ; ++j ) + { + int k = exonBlocks[tag].next[j] ; + len += ( exonBlocks[k].end - exonBlocks[k].start + 1 ) ; + depthSum += exonBlocks[k].depthSum ; + } + double otherAvgDepth = depthSum / len ; + double avgDepth = GetAvgDepth( exonBlocks[tag] ) ; + + // adjust avgDepth by looking into the earlier exon blocks(subexon) in the same region whose right side is not hard end + for ( j = tag - 1 ; j >= 0 ; --j ) + { + if ( exonBlocks[j].end + 1 != exonBlocks[j + 1].start || exonBlocks[j].rightType == 2 ) + break ; + double tmp = GetAvgDepth( exonBlocks[j] ) ; + if ( tmp > avgDepth ) + avgDepth = tmp ; + } + + if ( avgDepth < 1 ) + avgDepth = 1 ; + if ( otherAvgDepth < 1 ) + otherAvgDepth = 1 ; + + exonBlocks[i].rightRatio = sqrt( avgDepth ) - log( avgDepth ) - ( sqrt( otherAvgDepth ) + log( otherAvgDepth ) ) ; + //pow( log( avgDepth ) / log(2.0), 2.0 / 3.0 ) - pow( log( otherAvgDepth ) / log(2.0), 2.0 / 3.0 ); + }*/ + // The remaining case the islands, (...) + } + + // go through each region of subexons, and only compute the ratio for the first and last hard boundary + for ( i = 0 ; i < exonBlockCnt ; ) + { + // the blocks from [i,j) corresponds to a region, namely consecutive subexons. 
+ for ( j = i + 1 ; j < exonBlockCnt ; ++j ) + if ( exonBlocks[j].contigId != exonBlocks[i].contigId ) + break ; + + int leftSupport = 0 ; + int rightSupport = 0 ; + int leftTag = j, rightTag = i - 1 ; + for ( int k = i ; k < j ; ++k ) + { + if ( exonBlocks[k].leftType == 1 && exonBlocks[k].leftSplice > 0 ) + { + leftSupport += exonBlocks[k].leftSplice ; + if ( k < leftTag ) + leftTag = k ; + } + if ( exonBlocks[k].rightType == 2 && exonBlocks[k].rightSplice > 0 ) + { + rightSupport += exonBlocks[k].rightSplice ; + if ( k > rightTag ) + rightTag = k ; + } + } + + if ( leftSupport > 0 && rightSupport > 0 ) + { + // when the right splice support is much greater than that of the left side, there should be a soft boundary for the left side. + // Wether we want to include this soft boundary or not will be determined in class, when it looked at whether the overhang block + // should be kept or not. + exonBlocks[ leftTag ].leftRatio = sqrt( rightSupport ) - log( ( rightSupport + leftSupport ) * 0.5 ) - ( sqrt( leftSupport ) ) ; //+ log( leftSupport ) ) ; + exonBlocks[ rightTag ].rightRatio = sqrt( leftSupport ) - log( ( rightSupport + leftSupport ) * 0.5 ) - ( sqrt( rightSupport ) ) ; //+ log( rightSupport ) ) ; + } + + i = j ; // don't forget this. + } + } +} ; + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/classes.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,491 @@ +#include <stdio.h> +#include <getopt.h> +#include <vector> +#include <pthread.h> + +#include "alignments.hpp" +#include "SubexonGraph.hpp" +#include "SubexonCorrelation.hpp" +#include "Constraints.hpp" +#include "TranscriptDecider.hpp" + +char usage[] = "./classes [OPTIONS]:\n" + "Required:\n" + "\t-s STRING: path to the subexon file.\n" + "\t-b STRING: path to the BAM file.\n" + "\t\tor\n" + "\t--lb STRING: path to the file of the list of BAM files.\n" + "Optional:\n" + "\t-p INT: number of threads. (default: 1)\n" + "\t-o STRING: the prefix of the output file. (default: not used)\n" + "\t-c FLOAT: only use the subexons with classifier score <= than the given number. (default: 0.05)\n" + "\t-f FLOAT: filter the transcript from the gene if its abundance is lower than the given number percent of the most abundant one. (default: 0.05)\n" + "\t-d FLOAT: filter the transcript whose average read depth is less than the given number. (default: 2.5)\n" + "\t--ls STRING: path to the file of the list of single-sample subexon files. (default: not used)\n" + "\t--hasMateIdSuffix: the read id has suffix such as .1, .2 for a mate pair. (default: false)\n" + "\t--maxDpConstraintSize: the maximum number of subexons a constraint can cover in dynamic programming. (default: 7; -1 for inf)\n" + "\t--primaryParalog: use primary alignment to retain paralog genes instead of unique alignments. 
(default: not used)\n" + ; + +static const char *short_options = "s:b:f:o:d:p:c:h" ; +static struct option long_options[] = + { + { "ls", required_argument, 0, 10000 }, + { "lb", required_argument, 0, 10001 }, + { "hasMateIdSuffix", no_argument, 0, 10002 }, + { "primaryParalog", no_argument, 0, 10003 }, + { "maxDpConstraintSize", required_argument, 0, 10004 }, + { (char *)0, 0, 0, 0} + } ; + +struct _getAlignmentsInfoThreadArg +{ + std::vector<Alignments> *pAlignmentFiles ; + int numThreads ; + int tid ; +} ; + +struct _getConstraintsThreadArg +{ + std::vector<Constraints> *pMultiSampleConstraints ; + int numThreads ; + int tid ; + + struct _subexon *subexons ; + int seCnt ; + int start, end ; +} ; + +void *GetAlignmentsInfo_Thread( void *pArg ) +{ + int i ; + std::vector <Alignments> &alignmentFiles = *( ( (struct _getAlignmentsInfoThreadArg *)pArg )->pAlignmentFiles ) ; + int tid = ( (struct _getAlignmentsInfoThreadArg *)pArg )->tid ; + int numThreads = ( (struct _getAlignmentsInfoThreadArg *)pArg )->numThreads ; + int size = alignmentFiles.size() ; + for ( i = 0 ; i < size ; ++i ) + { + if ( i % numThreads == tid ) + { + alignmentFiles[i].GetGeneralInfo( true ) ; + alignmentFiles[i].Rewind() ; + } + } + + pthread_exit( NULL ) ; +} + + +void *GetConstraints_Thread( void *pArg ) +{ + int i ; + struct _getConstraintsThreadArg &arg = *( (struct _getConstraintsThreadArg *)pArg ) ; + std::vector<Constraints> &multiSampleConstraints = *( arg.pMultiSampleConstraints ) ; + int tid = arg.tid ; + int numThreads = arg.numThreads ; + int size = multiSampleConstraints.size() ; + for ( i = 0 ; i < size ; ++i ) + { + if ( i % numThreads == tid ) + multiSampleConstraints[i].BuildConstraints( arg.subexons, arg.seCnt, arg.start, arg.end ) ; + } + pthread_exit( NULL ) ; +} + +int main( int argc, char *argv[] ) +{ + int i, j ; + int size ; + + if ( argc <= 1 ) + { + printf( "%s", usage ) ; + return 0 ; + } + + int c, option_index ; // For getopt + option_index = 0 ; + FILE 
*fpSubexon = NULL ; + double FPKMFraction = 0.05 ; + double classifierThreshold ; + double txptMinReadDepth = 2.5 ; + char outputPrefix[1024] = "" ; + int numThreads = 1 ; + bool hasMateReadIdSuffix = false ; + bool usePrimaryAsUnique = false ; + int maxDpConstraintSize = 7 ; + + std::vector<Alignments> alignmentFiles ; + SubexonCorrelation subexonCorrelation ; + + classifierThreshold = 0.05 ; + while ( 1 ) + { + c = getopt_long( argc, argv, short_options, long_options, &option_index ) ; + if ( c == -1 ) + break ; + + if ( c == 's' ) + { + fpSubexon = fopen( optarg, "r" ) ; + } + else if ( c == 'b' ) + { + Alignments a ; + a.Open( optarg ) ; + //a.SetAllowClip( false ) ; + alignmentFiles.push_back( a ) ; + } + else if ( c == 'c' ) + { + classifierThreshold = atof( optarg ) ; + } + else if ( c == 'f' ) + { + FPKMFraction = atof( optarg ) ; + } + else if ( c == 'o' ) + { + strcpy( outputPrefix, optarg ) ; + } + else if ( c == 'd' ) + { + txptMinReadDepth = atof( optarg ) ; + } + else if ( c == 'p' ) + { + numThreads = atoi( optarg ) ; + } + else if ( c == 10000 ) // the list of subexon files. + { + subexonCorrelation.Initialize( optarg ) ; + } + else if ( c == 10001 ) // the list of bam files. + { + FILE *fp = fopen( optarg, "r" ) ; + char buffer[1024] ; + while ( fgets( buffer, sizeof( buffer ), fp ) != NULL ) + { + int len = strlen( buffer ) ; + if ( buffer[len - 1] == '\n' ) + { + buffer[len - 1] = '\0' ; + --len ; + + } + Alignments a ; + a.Open( buffer ) ; + alignmentFiles.push_back( a ) ; + } + fclose( fp ) ; + } + else if ( c == 10002 ) // the mate pair read id has suffix. 
+ { + hasMateReadIdSuffix = true ; + } + else if ( c == 10003 ) // do not check unique alignment + { + usePrimaryAsUnique = true ; + } + else if ( c == 10004 ) // maxDpConstraintSize + { + maxDpConstraintSize = atoi(optarg) ; + } + else + { + printf( "%s", usage ) ; + exit( 1 ) ; + } + } + if ( fpSubexon == NULL ) + { + printf( "Cannot find combined subexon file.\n" ) ; + exit( 1 ) ; + } + if ( alignmentFiles.size() < 1 ) + { + printf( "Must use -b option to specify BAM files.\n" ) ; + exit( 1 ) ; + } + + + if ( alignmentFiles.size() < 50 ) + { + size = alignmentFiles.size() ; + for ( i = 0 ; i < size ; ++i ) + { + alignmentFiles[i].GetGeneralInfo( true ) ; + alignmentFiles[i].Rewind() ; + } + } + else + { + struct _getAlignmentsInfoThreadArg *args = new struct _getAlignmentsInfoThreadArg[ numThreads ] ; + pthread_attr_t pthreadAttr ; + pthread_t *threads ; + + pthread_attr_init( &pthreadAttr ) ; + pthread_attr_setdetachstate( &pthreadAttr, PTHREAD_CREATE_JOINABLE ) ; + + threads = new pthread_t[ numThreads ] ; + + for ( i = 0 ; i < numThreads ; ++i ) + { + args[i].pAlignmentFiles = &alignmentFiles ; + args[i].tid = i ; + args[i].numThreads = numThreads ; + + pthread_create( &threads[i], &pthreadAttr, GetAlignmentsInfo_Thread, &args[i] ) ; + } + + for ( i = 0 ; i < numThreads ; ++i ) + { + pthread_join( threads[i], NULL ) ; + } + + pthread_attr_destroy( &pthreadAttr ) ; + delete[] args ; + delete[] threads ; + } + + // Build the subexon graph + SubexonGraph subexonGraph( classifierThreshold, alignmentFiles[0], fpSubexon ) ; + subexonGraph.ComputeGeneIntervals() ; + + // Solve gene by gene + int sampleCnt = alignmentFiles.size() ; + std::vector<Constraints> multiSampleConstraints ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + Constraints constraints( &alignmentFiles[i] ) ; + constraints.SetHasMateReadIdSuffix( hasMateReadIdSuffix ) ; + constraints.SetUsePrimaryAsUnique( usePrimaryAsUnique ) ; + multiSampleConstraints.push_back( constraints ) ; + } + 
MultiThreadOutputTranscript outputHandler( sampleCnt, alignmentFiles[0] ) ; + outputHandler.SetOutputFPs( outputPrefix ) ; + + if ( numThreads <= 1 ) + { + TranscriptDecider transcriptDecider( FPKMFraction, classifierThreshold, txptMinReadDepth, sampleCnt, alignmentFiles[0] ) ; + + transcriptDecider.SetMultiThreadOutputHandler( &outputHandler ) ; + transcriptDecider.SetNumThreads( numThreads ) ; + transcriptDecider.SetMaxDpConstraintSize( maxDpConstraintSize ) ; + + int giCnt = subexonGraph.geneIntervals.size() ; + for ( i = 0 ; i < giCnt ; ++i ) + { + struct _geneInterval gi = subexonGraph.geneIntervals[i] ; + struct _subexon *intervalSubexons = new struct _subexon[ gi.endIdx - gi.startIdx + 1 ] ; + subexonGraph.ExtractSubexons( gi.startIdx, gi.endIdx, intervalSubexons ) ; + printf( "%d: %d %s %d %d\n", i, gi.endIdx - gi.startIdx + 1, + alignmentFiles[0].GetChromName( intervalSubexons[0].chrId ), + gi.start + 1, gi.end + 1 ) ; + fflush( stdout ) ; + + subexonCorrelation.ComputeCorrelation( intervalSubexons, gi.endIdx - gi.startIdx + 1, alignmentFiles[0] ) ; + for ( j = 0 ; j < sampleCnt ; ++j ) + multiSampleConstraints[j].BuildConstraints( intervalSubexons, gi.endIdx - gi.startIdx + 1, gi.start, gi.end ) ; + + transcriptDecider.Solve( intervalSubexons, gi.endIdx - gi.startIdx + 1, multiSampleConstraints, subexonCorrelation ) ; + + for ( j = 0 ; j < gi.endIdx - gi.startIdx + 1 ; ++j ) + { + delete[] intervalSubexons[j].prev ; + delete[] intervalSubexons[j].next ; + } + delete[] intervalSubexons ; + } + } + else // multi-thread case. + { + --numThreads ; // one thread is used for read in the data. 
+ + // Allocate memory + struct _transcriptDeciderThreadArg *pArgs = new struct _transcriptDeciderThreadArg[ numThreads ] ; + pthread_mutex_t ftLock ; + int *freeThreads ; + int ftCnt ; + pthread_cond_t fullWorkCond ; + pthread_attr_t pthreadAttr ; + pthread_t *threads ; + pthread_t *getConstraintsThreads ; + bool *initThreads ; + + pthread_mutex_init( &ftLock, NULL ) ; + pthread_cond_init( &fullWorkCond, NULL ) ; + pthread_attr_init( &pthreadAttr ) ; + pthread_attr_setdetachstate( &pthreadAttr, PTHREAD_CREATE_JOINABLE ) ; + + threads = new pthread_t[ numThreads ] ; + getConstraintsThreads = new pthread_t[ numThreads ] ; + initThreads = new bool[numThreads] ; + freeThreads = new int[ numThreads ] ; + ftCnt = numThreads ; + for ( i = 0 ; i < numThreads ; ++i ) + { + pArgs[i].tid = i ; + pArgs[i].sampleCnt = sampleCnt ; + pArgs[i].numThreads = numThreads ; + pArgs[i].maxDpConstraintSize = maxDpConstraintSize ; + pArgs[i].FPKMFraction = FPKMFraction ; + pArgs[i].classifierThreshold = classifierThreshold ; + pArgs[i].txptMinReadDepth = txptMinReadDepth ; + pArgs[i].alignments = &alignmentFiles[0] ; + //pArgs[i].constraints = new std::vector<Constraints> ; + pArgs[i].outputHandler = &outputHandler ; + + freeThreads[i] = i ; + pArgs[i].freeThreads = freeThreads ; + pArgs[i].ftCnt = &ftCnt ; + pArgs[i].ftLock = &ftLock ; + pArgs[i].fullWorkCond = &fullWorkCond ; + + for ( j = 0 ; j < sampleCnt ; ++j ) + { + Constraints constraints( &alignmentFiles[j] ) ; + pArgs[i].constraints.push_back( constraints ) ; + } + + initThreads[i] = false ; + } + + + // Read in and distribute the work + int giCnt = subexonGraph.geneIntervals.size() ; + for ( i = 0 ; i < giCnt ; ++i ) + { + struct _geneInterval gi = subexonGraph.geneIntervals[i] ; + struct _subexon *intervalSubexons = new struct _subexon[ gi.endIdx - gi.startIdx + 1 ] ; + subexonGraph.ExtractSubexons( gi.startIdx, gi.endIdx, intervalSubexons ) ; + subexonCorrelation.ComputeCorrelation( intervalSubexons, gi.endIdx - gi.startIdx 
+ 1, alignmentFiles[0] ) ; + printf( "%d: %d %s %d %d. Free threads: %d/%d\n", i, gi.endIdx - gi.startIdx + 1, + alignmentFiles[0].GetChromName( intervalSubexons[0].chrId ), + gi.start + 1, gi.end + 1, ftCnt, numThreads + 1 ) ; + fflush( stdout ) ; + + int gctCnt = ftCnt ; + if ( gctCnt > 1 && sampleCnt > 1 ) + { + gctCnt = ( gctCnt < sampleCnt ? gctCnt : sampleCnt ) ; + struct _getConstraintsThreadArg *args = new struct _getConstraintsThreadArg[ gctCnt ] ; + for ( j = 0 ; j < gctCnt ; ++j ) + { + args[j].pMultiSampleConstraints = &multiSampleConstraints ; + args[j].numThreads = gctCnt ; + args[j].tid = j ; + args[j].subexons = intervalSubexons ; + args[j].seCnt = gi.endIdx - gi.startIdx + 1 ; + args[j].start = gi.start ; args[j].end = gi.end ; + pthread_create( &getConstraintsThreads[j], &pthreadAttr, GetConstraints_Thread, &args[j] ) ; + } + + + + for ( j = 0 ; j < gctCnt ; ++j ) + pthread_join( getConstraintsThreads[j], NULL ) ; + + delete[] args ; + } + else + { + for ( j = 0 ; j < sampleCnt ; ++j ) + multiSampleConstraints[j].BuildConstraints( intervalSubexons, gi.endIdx - gi.startIdx + 1, gi.start, gi.end ) ; + } + + // Search for the free queue. + int tag = -1 ; // get the working thread. + pthread_mutex_lock( &ftLock ) ; + if ( ftCnt == 0 ) + { + pthread_cond_wait( &fullWorkCond, &ftLock ) ; + } + tag = freeThreads[ ftCnt - 1 ] ; + --ftCnt ; + pthread_mutex_unlock( &ftLock ) ; + + if ( initThreads[tag] ) + pthread_join( threads[tag], NULL ) ; // Make sure the chosen thread exits. + + // Assign the subexons, the constraints and correlation content. 
+ pArgs[tag].subexons = new struct _subexon[gi.endIdx - gi.startIdx + 1] ; + pArgs[tag].seCnt = gi.endIdx - gi.startIdx + 1 ; + for ( j = 0 ; j < pArgs[tag].seCnt ; ++j ) + { + pArgs[tag].subexons[j] = intervalSubexons[j] ; + int cnt = intervalSubexons[j].prevCnt ; + pArgs[tag].subexons[j].prev = new int[cnt] ; + memcpy( pArgs[tag].subexons[j].prev, intervalSubexons[j].prev, sizeof( int ) * cnt ) ; + cnt = intervalSubexons[j].nextCnt ; + pArgs[tag].subexons[j].next = new int[cnt] ; + memcpy( pArgs[tag].subexons[j].next, intervalSubexons[j].next, sizeof( int ) * cnt ) ; + } + + for ( j = 0 ; j < sampleCnt ; ++j ) + { + pArgs[tag].constraints[j].Assign( multiSampleConstraints[ j ] ) ; + } + pArgs[tag].subexonCorrelation.Assign( subexonCorrelation ) ; + pthread_create( &threads[tag], &pthreadAttr, TranscriptDeciderSolve_Wrapper, &pArgs[tag] ) ; + initThreads[tag] = true ; + for ( j = 0 ; j < gi.endIdx - gi.startIdx + 1 ; ++j ) + { + delete[] intervalSubexons[j].prev ; + delete[] intervalSubexons[j].next ; + } + delete[] intervalSubexons ; + } + + for ( i = 0 ; i < numThreads ; ++i ) + { + if ( initThreads[i] ) + pthread_join( threads[i], NULL ) ; + } + + // Release memory + for ( i = 0 ; i < numThreads ; ++i ) + { + std::vector<Constraints>().swap( pArgs[i].constraints ) ; + } + delete []pArgs ; + pthread_attr_destroy( &pthreadAttr ) ; + pthread_mutex_destroy( &ftLock ) ; + pthread_cond_destroy( &fullWorkCond ) ; + + delete[] threads ; + delete[] getConstraintsThreads ; + delete[] initThreads ; + delete[] freeThreads ; + } // end of else for multi-thread. + + outputHandler.OutputCommandInfo( argc, argv ) ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + char buffer[1024] ; + alignmentFiles[i].GetFileName( buffer ) ; + int separator = -1 ; + + // deprive the directory. 
+ for ( j = 0 ; buffer[j] ; ++j ) + { + if ( buffer[j] == '/' ) + separator = j ; + } + if ( separator != -1 ) + { + ++separator ; + for ( j = separator ; buffer[j] ; ++j ) + buffer[j - separator] = buffer[j] ; + buffer[j - separator] = '\0' ; + } + outputHandler.OutputCommentToSampleGTF( i, buffer ) ; + } + outputHandler.ComputeFPKMTPM( alignmentFiles ) ; + outputHandler.Flush() ; + + for ( i = 0 ; i < sampleCnt ; ++i ) + alignmentFiles[i].Close() ; + fclose( fpSubexon ) ; + return 0 ; +}
// PsiCLASS-1.0.2/defs.h
//
// Small definitions shared across the project: a segment-count limit and two
// plain pair structs (64-bit and int-sized members).
#ifndef _LSONG_CLASSES_DEFS_HEADER
#define _LSONG_CLASSES_DEFS_HEADER

#include <stdint.h>

#define MAX_SEG_COUNT 127

// Pair of 64-bit signed integers.
struct _pair
{
	int64_t a, b ;
} ;

// Pair of plain ints.
struct _pair32
{
	int a, b ;
} ;

/*char nucToNum[26] = { 0, -1, 1, -1, -1, -1, 2,
	-1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, 3,
	-1, -1, -1, -1, -1, -1 } ;

char numToNuc[26] = {'A', 'C', 'G', 'T'} ;*/
#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/example/slist Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,2 @@ +example/s1.bam +example/s2.bam
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/gamma.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,201 @@ +#include "gamma.hpp" + +/** The digamma function in long double precision. +* @param x the real value of the argument +* @return the value of the digamma (psi) function at that point +* @author Richard J. Mathar +* @since 2005-11-24 +*/ +long double digammal(long double x) +{ + /* force into the interval 1..3 */ + if( x < 0.0L ) + return digammal(1.0L-x)+M_PIl/tanl(M_PIl*(1.0L-x)) ; /* reflection formula */ + else if( x < 1.0L ) + return digammal(1.0L+x)-1.0L/x ; + else if ( x == 1.0L) + return -M_GAMMAl ; + else if ( x == 2.0L) + return 1.0L-M_GAMMAl ; + else if ( x == 3.0L) + return 1.5L-M_GAMMAl ; + else if ( x > 3.0L) + /* duplication formula */ + return 0.5L*(digammal(x/2.0L)+digammal((x+1.0L)/2.0L))+M_LN2l ; + else + { + /* Just for your information, the following lines contain + * the Maple source code to re-generate the table that is + * eventually becoming the Kncoe[] array below + * interface(prettyprint=0) : + * Digits := 63 : + * r := 0 : + * + * for l from 1 to 60 do + * d := binomial(-1/2,l) : + * r := r+d*(-1)^l*(Zeta(2*l+1) -1) ; + * evalf(r) ; + * print(%,evalf(1+Psi(1)-r)) ; + *o d : + * + * for N from 1 to 28 do + * r := 0 : + * n := N-1 : + * + * for l from iquo(n+3,2) to 70 do + * d := 0 : + * for s from 0 to n+1 do + * d := d+(-1)^s*binomial(n+1,s)*binomial((s-1)/2,l) : + * od : + * if 2*l-n > 1 then + * r := r+d*(-1)^l*(Zeta(2*l-n) -1) : + * fi : + * od : + * print(evalf((-1)^n*2*r)) ; + *od : + *quit : + */ + static long double Kncoe[] = { .30459198558715155634315638246624251L, + .72037977439182833573548891941219706L, -.12454959243861367729528855995001087L, + .27769457331927827002810119567456810e-1L, -.67762371439822456447373550186163070e-2L, + .17238755142247705209823876688592170e-2L, -.44817699064252933515310345718960928e-3L, + .11793660000155572716272710617753373e-3L, -.31253894280980134452125172274246963e-4L, + 
.83173997012173283398932708991137488e-5L, -.22191427643780045431149221890172210e-5L, + .59302266729329346291029599913617915e-6L, -.15863051191470655433559920279603632e-6L, + .42459203983193603241777510648681429e-7L, -.11369129616951114238848106591780146e-7L, + .304502217295931698401459168423403510e-8L, -.81568455080753152802915013641723686e-9L, + .21852324749975455125936715817306383e-9L, -.58546491441689515680751900276454407e-10L, + .15686348450871204869813586459513648e-10L, -.42029496273143231373796179302482033e-11L, + .11261435719264907097227520956710754e-11L, -.30174353636860279765375177200637590e-12L, + .80850955256389526647406571868193768e-13L, -.21663779809421233144009565199997351e-13L, + .58047634271339391495076374966835526e-14L, -.15553767189204733561108869588173845e-14L, + .41676108598040807753707828039353330e-15L, -.11167065064221317094734023242188463e-15L } ; + + register long double Tn_1 = 1.0L ; /* T_{n-1}(x), started at n=1 */ + register long double Tn = x-2.0L ; /* T_{n}(x) , started at n=1 */ + register long double resul = Kncoe[0] + Kncoe[1]*Tn ; + + x -= 2.0L ; + int n ; + + for( n = 2 ; n < sizeof(Kncoe)/sizeof(long double) ;n++) + { + const long double Tn1 = 2.0L * x * Tn - Tn_1 ; /* Chebyshev recursion, Eq. 22.7.4 Abramowitz-Stegun */ + resul += Kncoe[n]*Tn1 ; + Tn_1 = Tn ; + Tn = Tn1 ; + } + return resul ; + } +} + + + +double trigamma ( double x, int *ifault ) + +//**************************************************************************** +// purpose: +// +// trigamma calculates trigamma(x) = d**2 log(gamma(x)) / dx**2 +// +// licensing: +// +// this code is distributed under the gnu lgpl license. +// +// modified: +// +// 19 january 2008 +// +// author: +// +// original fortran77 version by be schneider. +// c++ version by john burkardt. +// +// reference: +// +// be schneider, +// algorithm as 121: +// trigamma function, +// applied statistics, +// volume 27, number 1, pages 97-99, 1978. 
+// +// parameters: +// +// input, double x, the argument of the trigamma function. +// 0 < x. +// +// output, int *ifault, error flag. +// 0, no error. +// 1, x <= 0. +// +// output, double trigamma, the value of the trigamma function at x. +// +{ + double a = 0.0001; + double b = 5.0; + double b2 = 0.1666666667; + double b4 = -0.03333333333; + double b6 = 0.02380952381; + double b8 = -0.03333333333; + double value; + double y; + double z; + // + // check the input. + // + if ( x <= 0.0 ) + { + *ifault = 1; + value = 0.0; + return value; + } + + *ifault = 0; + z = x; + // + // use small value approximation if x <= a. + // + if ( x <= a ) + { + value = 1.0 / x / x; + return value; + } + // + // increase argument to ( x + i ) >= b. + // + value = 0.0; + + while ( z < b ) + { + value = value + 1.0 / z / z; + z = z + 1.0; + } + // + // apply asymptotic formula if argument is b or greater. + // + y = 1.0 / z / z; + + value = value + 0.5 * + y + ( 1.0 + y * ( b2+ y * ( b4 + y * ( b6+ y * b8 )))) / z; + + return value; +} + + +double LogGammaDensity( double x, double k, double theta ) +{ + return -k * log( theta ) + ( k - 1 ) * log( x ) - x / theta - lgamma( k ) ; +} + +double MixtureGammaAssignment( double x, double pi, double* k, double *theta ) +{ + if ( pi == 1 ) + return 0 ; + else if ( pi == 0 ) + return 1 ; + + double lf0 = LogGammaDensity( x, k[0], theta[0] ) ; + double lf1 = LogGammaDensity( x, k[1], theta[1] ) ; + + return (double)1.0 / ( 1.0 + exp( lf1 + log( 1 - pi ) - lf0 - log( pi ) ) ) ; +}
// PsiCLASS-1.0.2/gamma.hpp
//
// Declarations of the digamma/trigamma functions and the gamma-density
// helpers, plus long double constants for platforms whose <math.h> does not
// provide them.
#ifndef _LSONG_GAMMA
#define _LSONG_GAMMA

#include <stdio.h>
#include <math.h>

#ifndef M_PIl
/** The constant Pi in high precision */
#define M_PIl 3.1415926535897932384626433832795029L
#endif
#ifndef M_GAMMAl
/** Euler's constant in high precision */
#define M_GAMMAl 0.5772156649015328606065120900824024L
#endif
#ifndef M_LN2l
/** the natural logarithm of 2 in high precision */
#define M_LN2l 0.6931471805599453094172321214581766L
#endif

/** The digamma function in long double precision.
* @param x the real value of the argument
* @return the value of the digamma (psi) function at that point
* @author Richard J. Mathar
* @since 2005-11-24
*/
long double digammal(long double x) ;

//https://people.sc.fsu.edu/~jburkardt/cpp_src/asa121/asa121.hpp
/** The trigamma function.
* @param x the argument
* @param ifault output error flag
* @return the value of the trigamma function at x
*/
double trigamma ( double x, int *ifault );


/** Log of a gamma density with parameters k and theta, evaluated at x. */
double LogGammaDensity( double x, double k, double theta ) ;
/** Assignment value of x in a two-component gamma mixture; k and theta are read at indices 0 and 1. */
double MixtureGammaAssignment( double x, double pi, double* k, double *theta ) ;
#endif
// PsiCLASS-1.0.2/grader.cpp
//
// Compare the reference and the predictions
// Format: ./a.out ref.gtf prediction.gtf
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#define MAX_TXPT 2000000

#define DEBUG 0

bool flagMultiExonOnly = false ;
bool flagValidChr = true ;
int relaxWidth = 20 ;

// Closed interval / generic pair of ints; exons are stored as (a=start, b=end).
struct _pair
{
	int a, b ;
} ;

struct _transcript
{
	char tid[50] ;
	char chrom[50] ;
	char strand ;
	struct _pair *exons ;
	int ecnt ; // number of exons; -1 marks a removed duplicate.
} ;

struct _info
{
	double coverage ;
	int byId ; // It gets this coverage by comparing with byId.
} ;

struct _intron
{
	char chrom[50] ;
	int start, end ; // end of the upstream exon, start of the downstream exon.
	int tindex ; // The index to the corresponding transcripts
} ;

struct _exon
{
	char chrom[50] ;
	int start, end ;
	//int tindex ;
	int soft ; // bit 2 set: first exon of its transcript; bit 1 set: last exon of its transcript.
	bool matched ;
} ;

// qsort comparator: order transcripts by chromosome name, then by the start
// of their first exon.
int TranscriptComp( const void *p1, const void *p2 )
{
	const struct _transcript *a = static_cast<const struct _transcript *>( p1 ) ;
	const struct _transcript *b = static_cast<const struct _transcript *>( p2 ) ;
	int tmp = strcmp( a->chrom, b->chrom ) ;
	if ( tmp != 0 )
		return tmp ;
	return a->exons[0].a - b->exons[0].a ;
}

// qsort comparator: order by strand, chromosome, exon count and then by the
// intron chain (exon end / next exon start pairs), so that transcripts with
// the same intron chain become neighbors.
int TranscriptComp_ByIntron( const void *p1, const void *p2 )
{
	const struct _transcript *a = static_cast<const struct _transcript *>( p1 ) ;
	const struct _transcript *b = static_cast<const struct _transcript *>( p2 ) ;
	if ( a->strand != b->strand )
		return a->strand - b->strand ;
	int tmp = strcmp( a->chrom, b->chrom ) ;
	if ( tmp != 0 )
		return tmp ;
	if ( a->ecnt != b->ecnt )
		return a->ecnt - b->ecnt ;
	int i ;
	for ( i = 0 ; i < a->ecnt - 1 ; ++i )
	{
		if ( a->exons[i].b != b->exons[i].b )
			return a->exons[i].b - b->exons[i].b ;
		if ( a->exons[i + 1].a != b->exons[i + 1].a )
			return a->exons[i + 1].a - b->exons[i + 1].a ;
	}

	return 0 ; //strcmp( a->tid, b->tid ) ;
}

// Returns true when the chromosome name should be graded. When flagValidChr
// is false every name is accepted; otherwise only chr1-chr22, chrX, chrY and
// chrM (the letter in either case) are accepted.
bool validChrom( const char *chrom )
{
	if ( !flagValidChr )
		return true ;
	if ( chrom[0] != 'c' || chrom[1] != 'h' || chrom[2] != 'r' )
		return false ;
	// single-character suffixes: x, y, m and 3-9.
	if ( chrom[3]=='x' || chrom[3]=='X'
		|| chrom[3] == 'y' || chrom[3] == 'Y'
		|| chrom[3] == 'm' || chrom[3] == 'M'
		|| ( chrom[3] >= '3' && chrom[3] <= '9' ) )
	{
		if ( chrom[4] == '\0' )
			return true ;
		else
			return false ;
	}
	// chr1, chr10-chr19
	if ( chrom[3] == '1' )
	{
		if ( chrom[4] == '\0' )
			return true ;
		if ( chrom[4] >= '0' && chrom[4] <= '9' && chrom[5] == '\0' )
			return true ;
		return false ;
	}
	// chr2, chr20-chr22
	if ( chrom[3] == '2' )
	{
		if ( chrom[4] == '\0' )
			return true ;
		if ( chrom[4] >= '0' && chrom[4] <= '2' && chrom[5] == '\0' )
			return true ;
		return false ;
	}
	return false ;
}

// Reverses the exon list in place when its first two entries are in
// descending order of start; lists of fewer than two exons are left alone.
void ReverseExonList( struct _pair *exons, int ecnt )
{
	if ( ecnt < 2 || ( exons[0].a < exons[1].a ) )
		return ;
	int i, j ;
	struct _pair tmp ;

	for ( i = 0, j = ecnt - 1 ; i < j ; ++i, --j )
	{
		tmp = exons[i] ;
		exons[i] = exons[j] ;
		exons[j] = tmp ;
	}
}

/**
  Merge exons that are next to each other. Showed up in scripture.
  Two consecutive exons are merged when fewer than 20 bases separate them.
  The list is compacted in place and the new exon count is returned.
*/
int CleanExonList( struct _pair *exons, int ecnt )
{
	int i, k ;
	for ( i = 0 ; i < ecnt - 1 ; ++i )
		if ( exons[i + 1].a - exons[i].b - 1 < 20 )
		{
			// Extend the next exon to the left and mark this one as removed.
			exons[i + 1].a = exons[i].a ;
			exons[i].a = -1 ;
		}
	k = 0 ;
	for ( i = 0 ; i < ecnt ; ++i )
	{
		if ( exons[i].a == -1 )
			continue ;
		exons[k] = exons[i] ;
		++k ;
	}
	return k ;
}

/**
  Remove the duplicated intron chain in the list.
  Sorts t with TranscriptComp_ByIntron, frees the exon array of every
  duplicate, compacts the array in place and returns the new count.
  Single-exon transcripts are duplicates only when both coordinates match.
*/
int RemoveDuplicateIntronChain( struct _transcript *t, int tcnt )
{
	int i, j, k ;

	qsort( t, tcnt, sizeof( *t ), TranscriptComp_ByIntron ) ;
	for ( i = 1 ; i < tcnt ; ++i )
	{
		// Compare against the closest earlier transcript that is still kept.
		k = i - 1 ;
		while ( t[k].ecnt == -1 )
			--k ;

		if ( t[i].ecnt != t[k].ecnt || strcmp( t[i].chrom, t[k].chrom ) || t[i].strand != t[k].strand )
			continue ;
		for ( j = 0 ; j < t[i].ecnt - 1 ; ++j )
			if ( t[i].exons[j].b != t[k].exons[j].b ||
				t[i].exons[j + 1].a != t[k].exons[j + 1].a )
				break ;
		if ( j >= t[i].ecnt - 1 && t[i].ecnt != 1 )
			t[i].ecnt = -1 ;
		if ( t[i].ecnt == 1 && t[i].exons[0].a == t[k].exons[0].a &&
			t[i].exons[0].b == t[k].exons[0].b )
			t[i].ecnt = -1 ;
		/*if ( t[i].ecnt == -1 )
		{
			printf( "%s <- %s\n", t[i].tid, t[k].tid ) ;
		}*/
	}

	k = 0 ;
	for ( i = 0 ; i < tcnt ; ++i )
	{
		if ( t[i].ecnt == -1 )
		{
			free( t[i].exons ) ;
			continue ;
		}
		t[k] = t[i] ;
		++k ;
	}
	tcnt = k ;
	return tcnt ;
}


// Scores how well pred matches ref.
//   -1 : incompatible (pred has more exons than ref, no overlap, strand
//        mismatch, or pred's intron chain is not a contiguous part of ref's).
//    1 : ref is a single exon and pred overlaps it.
//    0 : pred is a single exon overlapping some exon of a multi-exon ref.
//   otherwise ( predECnt - 1 ) / ( refECnt - 1 ), the fraction of ref's
//   introns covered by pred's intron chain.
double CompareTranscripts( const struct _transcript &ref, const struct _transcript &pred )
{
	int i, j ;
	struct _pair *refExons = ref.exons ;
	int refECnt = ref.ecnt ;
	struct _pair *predExons = pred.exons ;
	int predECnt = pred.ecnt ;

	// Prediction must be a "subset" of the reference
	if ( refECnt < predECnt )
		return -1 ;

	// single exon case
	if ( refECnt == 1 )
	{
		if ( refExons[0].b < predExons[0].a || refExons[0].a > predExons[0].b )
			return -1 ;
		else
			return 1 ;
	}

	if ( predECnt == 1 )
	{
		for ( i = 0 ; i < refECnt ; ++i )
		{
			if ( refExons[i].b < predExons[0].a || refExons[i].a > predExons[0].b )
				continue ;
			else
			{
				/*if ( i == 0 && predExons[0].a >= refExons[0].a - relaxWidth )
					return 0 ;
				else if ( i == refECnt - 1 && predExons[0].b <= refExons[i].b + relaxWidth )
					return 0 ;
				else if ( i > 0 && i < refECnt - 1 && predExons[0].a >= refExons[i].a - relaxWidth &&
					predExons[0].b <= refExons[i].b + relaxWidth )
					return 0 ;

				return -1 ;*/
				return 0 ;
			}
		}
		return -1 ;
	}

	if ( predECnt > 0 && ref.strand != pred.strand )
		return -1 ;

	// Test the intron chain: locate pred's first donor site in ref.
	for ( i = 0 ; i < refECnt - 1 ; ++i )
		if ( refExons[i].b == predExons[0].b )
			break ;
	if ( i >= refECnt - 1 )
		return -1 ;
	//if ( i != 0 && predExons[0].a < refExons[i].a - relaxWidth )
	//	return -1 ;

	for ( j = 0 ; i < refECnt - 1 && j < predECnt - 1 ; ++i, ++j )
	{
		if ( refExons[i].b != predExons[j].b || refExons[i + 1].a != predExons[j + 1].a )
			break ;
	}
	if ( j >= predECnt - 1 )
	{
		/*if ( i == refECnt - 1 ||
			predExons[ predECnt - 1 ].b <= refExons[i].b + relaxWidth )
			return (double)( predECnt - 1 ) / (double)( refECnt - 1 ) ;
		else
			return -1 ;*/
		return (double)( predECnt - 1 ) / (double)( refECnt - 1 ) ;
	}
	else
		return -1 ;
}

// True when a and b differ by at most w.
bool WithinRange( int a, int b, int w = 10 )
{
	if ( a - b >= -w && a - b <= w )
		return true ;
	return false ;
}

// Collects the distinct introns of the transcripts into introns[], keeping
// each chromosome's run ordered by intron start, and returns their count.
// Each intron records the index of the first transcript that contributed it.
// The backward scan stops at the first entry from another chromosome.
// NOTE(review): this presumes the transcripts arrive grouped by chromosome;
// confirm at the call site.
int GetIntrons( struct _transcript *transcripts, int tcnt, struct _intron *introns )
{
	int i, j, k, tag ;
	int cnt = 0 ;
	for ( i = 0 ; i < tcnt ; ++i )
	{
		for ( j = 0 ; j < transcripts[i].ecnt - 1 ; ++j )
		{
			bool flag = false ; // true when the intron is already recorded.

			tag = 0 ; // insertion position.
			for ( k = cnt - 1 ; k >= 0 ; --k )
			{
				if ( strcmp( introns[k].chrom, transcripts[i].chrom ) )
				{
					//printf( "hi\n" ) ;
					tag = k + 1 ;
					break ;
				}
				if ( introns[k].start < transcripts[i].exons[j].b )
				{
					tag = k + 1 ;
					break ;
				}
				else if ( introns[k].start == transcripts[i].exons[j].b &&
					introns[k].end == transcripts[i].exons[j + 1].a )
				{
					flag = true ;
					break ;
				}
			}

			if ( !flag )
			{
				// Shift the tail right by one; k ends at the insertion position.
				for ( k = cnt ; k > tag ; --k )
					introns[k] = introns[k - 1] ;
				strcpy( introns[k].chrom, transcripts[i].chrom ) ;
				introns[k].start = transcripts[i].exons[j].b ;
				introns[k].end = transcripts[i].exons[j + 1].a ;
				introns[k].tindex = i ;
				++cnt ;
			}
		}
	}
	return cnt ;
}

// Collects the distinct exons of the transcripts into exons[], keeping each
// chromosome's run ordered by exon start, and returns their count. Two exons
// are the same only if start, end and the "soft" flags all agree; every new
// exon starts with matched == false.
int GetExons( struct _transcript *transcripts, int tcnt, struct _exon *exons )
{
	int i, j, k, tag ;
	int cnt = 0 ;
	for ( i = 0 ; i < tcnt ; ++i )
	{
		for ( j = 0 ; j < transcripts[i].ecnt ; ++j )
		{
			bool flag = false ;
			int soft = 0 ;
			if ( j == 0 )
				soft = soft | 2 ;
			if ( j == transcripts[i].ecnt - 1 )
				soft = soft | 1 ;

			tag = 0 ;
			for ( k = cnt - 1 ; k >= 0 ; --k )
			{
				if ( strcmp( exons[k].chrom, transcripts[i].chrom ) )
				{
					//printf( "hi\n" ) ;
					tag = k + 1 ;
					break ;
				}

				if ( exons[k].start < transcripts[i].exons[j].a )
				{
					tag = k + 1 ;
					break ;
				}
				else if ( exons[k].start == transcripts[i].exons[j].a &&
					exons[k].end == transcripts[i].exons[j].b &&
					exons[k].soft == soft )
				{
					flag = true ;
					break ;
				}
			}

			if ( !flag )
			{
				for ( k = cnt ; k > tag ; --k )
					exons[k] = exons[k - 1] ;
				strcpy( exons[k].chrom, transcripts[i].chrom ) ;
				exons[k].start = transcripts[i].exons[j].a ;
				exons[k].end = transcripts[i].exons[j].b ;
				//exons[k].tindex = i ;
				exons[k].soft = soft ;
				exons[k].matched = false ;
				++cnt ;
			}
		}
	}
	return cnt ;
}

//chr1 HAVANA transcript 320162 324461 . + . 
gene_id "ENSG00000237094.6"; transcript_id "ENST00000423728.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP4-669L17.10"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP4-669L17.10-002"; level 2; havana_gene "OTTHUMG00000156968.4"; havana_transcript "OTTHUMT00000346879.1"; +int main( int argc, char *argv[] ) +{ + FILE *fp ; + struct _transcript *ref, *pred ; + struct _info *refInfo, *predInfo ; + + int refCnt, predCnt ; + char line[10000], filename[10000] ; + struct _pair tmpExons[10000] ; + int tmpECnt = 0 ; + + char chrom[50], tool[20], type[40], strand[3] ; + char tid[50] ; + char buffer[50] ; + int start, end ; + + int i, j, k, tag ; + + if ( argc == 1 ) + { + printf( "Compare the reference transcripts and predicted transcripts.\n" + "Format: ./grader ref.gtf prediction.gtf\n" ) ; + exit( 1 ) ; + } + // Parse the arguments + for ( i = 3 ; i < argc ; ++i ) + { + if ( !strcmp( argv[i], "-M" ) ) + flagMultiExonOnly = true ; + else if ( !strcmp( argv[i], "-ac" ) ) + flagValidChr = false ; + else + { + printf( "Unknown argument: %s\n", argv[i] ) ; + exit( 1 ) ; + } + } + + /*======================================================================================== + Stage 1: Read the transcripts from the reference and prediction's GTF files. 
+ ========================================================================================*/ + fp = NULL ; + fp = fopen( argv[1], "r" ) ; + if ( fp == NULL ) + { + printf( "Could not open file %s.\n", argv[1] ) ; + exit( 1 ) ; + } + refCnt = 0 ; + ref = ( struct _transcript *)malloc( MAX_TXPT * sizeof( struct _transcript ) ) ; + + strcpy( ref[0].tid, "tmp_-1" ) ; + while ( fgets( line, sizeof( line ), fp ) != NULL ) + { + if ( line[0] == '#' ) + continue ; + + sscanf( line, "%s %s %s %d %d %s %s", chrom, tool, type, &start, &end, buffer, strand ) ; + + if ( strcmp( type, "exon" ) || !validChrom( chrom ) ) + continue ; + + char *p = strstr( line, "transcript_id" ) ; + for ( ; *p != ' ' ; ++p ) + ; + p += 2 ; + sscanf( p, "%s", tid ) ; + //printf( "+%s %d\n", tid, strlen( tid ) ) ; + p = tid + strlen( tid ) ; + while ( *p != '\"' ) + --p ; + *p = '\0' ; + //printf( "%s\n", tid ) ; + if ( strcmp( tid, ref[ refCnt ].tid ) && tmpECnt ) + { + ReverseExonList( tmpExons, tmpECnt ) ; + tmpECnt = CleanExonList( tmpExons, tmpECnt ) ; + if ( tmpECnt > 1 || !flagMultiExonOnly ) + { + ref[ refCnt ].ecnt = tmpECnt ; + ref[ refCnt ].exons = ( struct _pair * )malloc( sizeof( struct _pair ) * tmpECnt ) ; + memcpy( ref[ refCnt ].exons, tmpExons, sizeof( tmpExons[0] ) * tmpECnt ) ; + ++refCnt ; + } + tmpECnt = 0 ; + } + + tmpExons[ tmpECnt ].a = start ; + tmpExons[ tmpECnt ].b = end ; + ++tmpECnt ; + + ref[ refCnt ].strand = strand[0] ; + strcpy( ref[ refCnt ].chrom, chrom ) ; + strcpy( ref[ refCnt ].tid, tid ) ; + } + if ( tmpECnt != 0 ) + { + ReverseExonList( tmpExons, tmpECnt ) ; + tmpECnt = CleanExonList( tmpExons, tmpECnt ) ; + if ( tmpECnt > 1 || !flagMultiExonOnly ) + { + ref[ refCnt ].ecnt = tmpECnt ; + ref[ refCnt ].exons = ( struct _pair * )malloc( sizeof( struct _pair ) * tmpECnt ) ; + memcpy( ref[ refCnt ].exons, tmpExons, sizeof( tmpExons[0] ) * tmpECnt ) ; + ++refCnt ; + } + tmpECnt = 0 ; + } + fclose( fp ) ; + + fp = NULL ; + fp = fopen( argv[2], "r" ) ; + if ( fp == 
NULL ) + { + printf( "Could not open file %s.\n", argv[2] ) ; + exit( 1 ) ; + } + predCnt = 0 ; + pred = ( struct _transcript *)malloc( MAX_TXPT * sizeof( struct _transcript ) ) ; + + strcpy( pred[0].tid, "tmp_-1" ) ; + tmpECnt = 0 ; + while ( fgets( line, sizeof( line ), fp ) != NULL ) + { + if ( line[0] == '#' ) + continue ; + sscanf( line, "%s %s %s %d %d %s %s", chrom, tool, type, &start, &end, buffer, strand ) ; + + if ( strcmp( type, "exon" ) || !validChrom( chrom ) ) + continue ; + + char *p = strstr( line, "transcript_id" ) ; + //char *p = strstr( line, "gene_name" ) ; + for ( ; *p != ' ' ; ++p ) + ; + p += 2 ; + sscanf( p, "%s", tid ) ; + //tid[ strlen( tid ) - 2 ] = '\0' ; + p = tid + strlen( tid ) ; + while ( *p != '\"' ) + --p ; + *p = '\0' ; + + if ( strcmp( tid, pred[ predCnt ].tid ) && tmpECnt ) + { + ReverseExonList( tmpExons, tmpECnt ) ; + tmpECnt = CleanExonList( tmpExons, tmpECnt ) ; + if ( tmpECnt > 1 || !flagMultiExonOnly ) + { + pred[ predCnt ].ecnt = tmpECnt ; + pred[ predCnt ].exons = ( struct _pair * )malloc( sizeof( struct _pair ) * tmpECnt ) ; + memcpy( pred[ predCnt ].exons, tmpExons, sizeof( tmpExons[0] ) * tmpECnt ) ; + ++predCnt ; + } + tmpECnt = 0 ; + } + + tmpExons[ tmpECnt ].a = start ; + tmpExons[ tmpECnt ].b = end ; + ++tmpECnt ; + + pred[ predCnt ].strand = strand[0] ; + strcpy( pred[ predCnt ].chrom, chrom ) ; + strcpy( pred[ predCnt ].tid, tid ) ; + } + if ( tmpECnt > 0 ) + { + ReverseExonList( tmpExons, tmpECnt ) ; + tmpECnt = CleanExonList( tmpExons, tmpECnt ) ; + if ( tmpECnt > 1 || !flagMultiExonOnly ) + { + pred[ predCnt ].ecnt = tmpECnt ; + pred[ predCnt ].exons = ( struct _pair * )malloc( sizeof( struct _pair ) * tmpECnt ) ; + memcpy( pred[ predCnt ].exons, tmpExons, sizeof( tmpExons[0] ) * tmpECnt ) ; + ++predCnt ; + } + tmpECnt = 0 ; + } + + refCnt = RemoveDuplicateIntronChain( ref, refCnt ) ; + //printf( "predCnt = %d\n", predCnt ) ; + predCnt = RemoveDuplicateIntronChain( pred, predCnt ) ; + //printf( "predCnt = 
%d\n", predCnt ) ; + qsort( ref, refCnt, sizeof( ref[0] ), TranscriptComp ) ; + qsort( pred, predCnt, sizeof( pred[0] ), TranscriptComp ) ; + + + /*======================================================================================== + Stage 2: Compute the recall and precision. + ========================================================================================*/ + refInfo = ( struct _info * )malloc( refCnt * sizeof( *refInfo ) ) ; + predInfo = ( struct _info * )malloc( predCnt * sizeof( *predInfo ) ) ; + for ( i = 0 ; i < refCnt ; ++i ) + { + refInfo[i].coverage = -1 ; + refInfo[i].byId = -1 ; +#if DEBUG + printf( "=%d %s %s %d\n", i, ref[i].chrom, ref[i].tid, ref[i].ecnt ) ; + for ( j = 0 ; j < ref[i].ecnt ; ++j ) + printf( "%d %d\n", ref[i].exons[j].a, ref[i].exons[j].b ) ; +#endif + + } + for ( i = 0 ; i < predCnt ; ++i ) + { + predInfo[i].coverage = -1 ; + predInfo[i].byId = -1 ; +#if DEBUG + printf( "#%d %s %s %d\n", i, pred[i].chrom, pred[i].tid, pred[i].ecnt ) ; + for ( j = 0 ; j < pred[i].ecnt ; ++j ) + printf( "%d %d\n", pred[i].exons[j].a, pred[i].exons[j].b ) ; +#endif + } + tag = 0 ; + for ( i = 0 ; i < predCnt ; ++i ) + { + while ( tag < refCnt && + ( ( strcmp( ref[tag].chrom, pred[i].chrom ) < 0 ) || + ( !strcmp( ref[tag].chrom, pred[i].chrom ) && + ref[tag].exons[ ref[tag].ecnt - 1 ].b < pred[i].exons[0].a - relaxWidth ) ) ) + ++tag ; + + // if ( i == 16474 ) + // printf( "%d %d %d", i, tag, refCnt ) ; + for ( j = tag ; j < refCnt && ref[j].exons[0].a <= pred[i].exons[ pred[i].ecnt - 1 ].b + relaxWidth && !strcmp( ref[j].chrom, pred[i].chrom ); ++j ) + { + /*if ( !strcmp( pred[i].tid, "CUFF.4256.1" ) ) + { + printf( "%s ? 
%s\n", pred[i].tid, ref[j].tid ) ; + }*/ + // if ( i == 16474 ) + // { + // printf( "%d %d\n", i, j ) ; + // } + double coverage = CompareTranscripts( ref[j], pred[i] ) ; + if ( coverage > refInfo[j].coverage ) + { + refInfo[j].coverage = coverage ; + refInfo[j].byId = i ; + } + if ( coverage > predInfo[i].coverage ) + { + predInfo[i].coverage = coverage ; + predInfo[i].byId = j ; + } + } + } + + /*======================================================================================== + Stage 3: Dump the information into output files. + ========================================================================================*/ + //sprintf( filename, "grader.%s.recall", argv[1] ) ; + sprintf( filename, "grader.recall" ) ; + fp = fopen( filename, "w" ) ; + for ( i = 0 ; i < refCnt ; ++i ) + { + fprintf( fp, "%s\t%d\t%lf\t%s\n", ref[i].tid, ref[i].ecnt, refInfo[i].coverage, + refInfo[i].byId == -1 ? "-" : pred[ refInfo[i].byId ].tid ) ; + } + fclose( fp ) ; + + //sprintf( filename, "grader.%s.precision", argv[2] ) ; + sprintf( filename, "grader.precision" ) ; + fp = fopen( filename, "w" ) ; + for ( i = 0 ; i < predCnt ; ++i ) + { + fprintf( fp, "%s\t%d\t%lf\t%s\n", pred[i].tid, pred[i].ecnt, predInfo[i].coverage, + predInfo[i].byId == -1 ? 
"-" : ref[predInfo[i].byId].tid ) ; + } + fclose( fp ) ; + + // print the summary to the stdout + const int binCnt = 20 ; + int bins[binCnt + 1] ; + memset( bins, 0, sizeof( bins ) ) ; + k = 0 ; + for ( i = 0 ; i < refCnt ; ++i ) + { + if ( refInfo[i].coverage == -1 ) + continue ; + ++bins[ (int)( refInfo[i].coverage * binCnt ) ] ; + ++k ; + } + for ( i = 0 ; i <= binCnt ; ++i ) + { + printf( "recall %lf %lf %d/%d\n", (double)i / binCnt, (double)k / refCnt, k, refCnt ) ; + k -= bins[i] ; + } + + memset( bins, 0, sizeof( bins ) ) ; + k = 0 ; + for ( i = 0 ; i < predCnt ; ++i ) + { + if ( predInfo[i].coverage == -1 ) + continue ; + ++bins[ (int)( predInfo[i].coverage * binCnt ) ] ; + ++k ; + } + for ( i = 0 ; i <= binCnt ; ++i ) + { + printf( "precision %lf %lf %d/%d\n", (double)i / binCnt, (double)k / predCnt, k, predCnt ) ; + k -= bins[i] ; + } + + /*======================================================================================== + Stage 4: Evaluations for introns. + ========================================================================================*/ + struct _intron *refIntrons = ( struct _intron * )malloc( sizeof( struct _intron ) * MAX_TXPT ) ; + int riCnt = GetIntrons( ref, refCnt, refIntrons ) ; + struct _intron *predIntrons = ( struct _intron * )malloc( sizeof( struct _intron ) * MAX_TXPT ) ; + int piCnt = GetIntrons( pred, predCnt, predIntrons ) ; + int matchedIntron = 0 ; + /*for ( i = 0 ; i < riCnt ; ++i ) + { + printf( "%d %s %d %d\n", i, refIntrons[i].chrom, refIntrons[i].start, refIntrons[i].end ) ; + }*/ + + fp = fopen( "grader.intron", "w" ) ; + tag = 0 ; + for ( i = 0 ; i < piCnt ; ++i ) + { + bool flag = false ; + while ( 1 ) + { + if ( tag >= riCnt ) + break ; + int tmp = strcmp( refIntrons[tag].chrom, predIntrons[i].chrom ) ; + if ( tmp < 0 || ( tmp == 0 && refIntrons[tag].start < predIntrons[i].start ) ) + ++tag ; + else + break ; + } + //printf( "%d (%s %d %d) %d=>", i, predIntrons[i].chrom, predIntrons[i].start, 
predIntrons[i].end, tag ) ; + /*if ( predIntrons[i].start == 1613457 ) + printf( "%d %d\n", tag, riCnt ) ;*/ + for ( j = tag ; j < riCnt ; ++j ) + { + if ( strcmp( refIntrons[j].chrom, predIntrons[i].chrom ) > 0 || + refIntrons[j].start > predIntrons[i].start /*+ 10*/ ) + break ; + if ( refIntrons[j].start == predIntrons[i].start && + refIntrons[j].end == predIntrons[i].end ) + //if ( WithinRange( refIntrons[j].start, predIntrons[i].start ) && + // WithinRange( refIntrons[j].end, predIntrons[i].end ) ) + { + ++matchedIntron ; + flag = true ; + break ; + } + } + //printf( "%d\n", j ) ; + if ( flag ) + fprintf( fp, "Y\t%s\t%d\t%d\t%s\n", predIntrons[i].chrom, predIntrons[i].start, predIntrons[i].end, pred[ predIntrons[i].tindex ].tid ) ; + else + fprintf( fp, "N\t%s\t%d\t%d\t%s\n", predIntrons[i].chrom, predIntrons[i].start, predIntrons[i].end, pred[ predIntrons[i].tindex ].tid ) ; + } + fclose( fp ) ; + printf( "\n" ) ; + printf( "Intron evaluation\n") ; + //printf( "\tmatched\t%d\n", matchedIntron ) ; + printf( "\trecall\t%lf\t%d/%d\n", matchedIntron / (double)riCnt, matchedIntron, riCnt ) ; + printf( "\tprecision\t%lf\t%d/%d\n", matchedIntron / (double)piCnt, matchedIntron, piCnt ) ; + + /*======================================================================================== + Stage 4: Evaluations for exons. 
+ ========================================================================================*/ + struct _exon *refExons = ( struct _exon * )malloc( sizeof( struct _exon ) * MAX_TXPT ) ; + int reCnt = GetExons( ref, refCnt, refExons ) ; + struct _exon *predExons = ( struct _exon * )malloc( sizeof( struct _exon ) * MAX_TXPT ) ; + int peCnt = GetExons( pred, predCnt, predExons ) ; + int matchedExons = 0, matchedInternalExons = 0 ; + int refInternalExons = 0, predInternalExons = 0 ; + + for ( i = 0 ; i < reCnt ; ++i ) + { + //printf( "%d %d\n", refExons[i].start, refExons[i].end ) ; + if ( refExons[i].soft == 0 ) + ++refInternalExons ; + //if ( refExons[i].soft == 3 ) + // printf( "hi\n" ) ; + } + for ( i = 0 ; i < peCnt ; ++i ) + if ( predExons[i].soft == 0 ) + ++predInternalExons ; + tag = 0 ; + for ( i = 0 ; i < peCnt ; ++i ) + { + bool flag = false ; + while ( 1 ) + { + if ( tag >= reCnt ) + break ; + int tmp = strcmp( refExons[tag].chrom, predExons[i].chrom ) ; + if ( tmp < 0 || ( tmp == 0 && refExons[tag].end < predExons[i].start ) ) + ++tag ; + else + break ; + } + for ( j = tag ; j < reCnt ; ++j ) + { + if ( strcmp( refExons[j].chrom, predExons[i].chrom ) > 0 || + refExons[j].start > predExons[i].end /*+ 10*/ ) + break ; + if ( refExons[j].soft != predExons[i].soft + || refExons[j].end < predExons[i].start ) + continue ; + + if ( refExons[j].start == predExons[i].start && + refExons[j].end == predExons[i].end && predExons[i].soft == 0 ) + //if ( WithinRange( refIntrons[j].start, predIntrons[i].start ) && + // WithinRange( refIntrons[j].end, predIntrons[i].end ) ) + { + refExons[j].matched = true ; + predExons[i].matched = true ; + ++matchedInternalExons ; + flag = true ; + break ; + } + else if ( ( refExons[j].start == predExons[i].start || ( predExons[i].soft & 2 ) ) + && ( refExons[j].end == predExons[i].end || (predExons[i].soft & 1 ) ) ) + { + refExons[j].matched = true ; + predExons[i].matched = true ; + flag = true ; + //break ; + } + } + //printf( "%d\n", 
j ) ; + } + + printf( "\n" ) ; + printf( "Exon evaluation\n" ) ; + //printf( "\tmatched\t%d\n", matchedExons ) ; + matchedExons = 0 ; + for ( i = 0 ; i < reCnt ; ++i ) + if ( refExons[i].matched ) + ++matchedExons ; + printf( "\trecall\t%lf\t%d/%d\n", (double)matchedExons / reCnt, matchedExons, reCnt ) ; + matchedExons = 0 ; + for ( i = 0 ; i < peCnt ; ++i ) + if ( predExons[i].matched ) + ++matchedExons ; + printf( "\tprecision\t%lf\t%d/%d\n", (double)matchedExons / peCnt, matchedExons, peCnt ) ; + + printf( "\n" ) ; + printf( "Internal exon evaluation\n" ) ; + //printf( "\tmatched\t%d\n", matchedInternalExons ) ; + printf( "\trecall\t%lf\t%d/%d\n", (double)matchedInternalExons / refInternalExons, matchedInternalExons, refInternalExons ) ; + printf( "\tprecision\t%lf\t%d/%d\n", (double)matchedInternalExons / predInternalExons, matchedInternalExons, predInternalExons ) ; + return 0 ; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/psiclass Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,411 @@
#!/usr/bin/env perl

# Driver script of the PsiCLASS pipeline. It runs, in order:
#   stage 0: junc / trust-splice / FilterSplice.pl  (splice sites per sample)
#   stage 1: subexon-info                           (subexons per sample)
#   stage 2: combine-subexons                       (merge across samples)
#   stage 3: classes                                (assemble transcripts)
#   stage 4: vote-transcripts                       (consensus transcripts)
# --stage N resumes the pipeline from stage N.

use strict ;
use warnings ;

use Cwd 'cwd' ;
use Cwd 'abs_path' ;
use File::Basename;
use threads ;
use threads::shared ;

die "Usage: ./psiclass [OPTIONS]\n".
	"Required:\n".
	"\t-b STRING: paths to the alignment BAM files. Use comma to separate multiple BAM files\n".
	"\t\tor\n".
	"\t--lb STRING: path to the file listing the alignments BAM files\n".
	"Optional:\n".
	"\t-s STRING: path to the trusted splice file (default: not used)\n".
	"\t-o STRING: prefix of output files (default: ./psiclass)\n".
	"\t-p INT: number of processes/threads (default: 1)\n".
	"\t-c FLOAT: only use the subexons with classifier score <= than the given number (default: 0.05)\n".
	"\t--sa FLOAT: the minimum average number of supported read for retained introns (default: 0.5)\n".
	"\t--vd FLOAT : the minimum average coverage depth of a transcript to be reported (defaults: 1.0)\n".
	"\t--maxDpConstraintSize: the number of subexons a constraint can cover in DP. (default: 7. -1 for inf)\n".
	"\t--bamGroup STRING: path to the file listing the group id of BAMs in the --lb file (default: not used)\n".
	"\t--primaryParalog: use primary alignment to retain paralog genes (default: use unique alignments)\n".
	#"\t--mateIdx INT: the read id has suffix such as .1, .2 for a mate pair. (default: auto)\n".
	"\t--version: print version and exit\n".
	"\t--stage INT: (default: 0)\n".
	"\t\t0-start from beginning - building splice sites for each sample\n".
	"\t\t1-start from building subexon files for each sample\n".
	"\t\t2-start from combining subexon files across samples\n".
	"\t\t3-start from assembling the transcripts for each sample\n".
	"\t\t4-start from voting the consensus transcripts across samples\n"
	if ( @ARGV == 0 ) ;

# Directory holding this script and the helper binaries it launches.
my $WD = dirname( abs_path( $0 ) ) ;

my $i ;
my $cmd ;
my $prefix = "psiclass_" ;
my $numThreads = 1 ;

# Echo a shell command, run it, and abort when it exits with non-zero status.
sub system_call
{
	print $_[0], "\n" ;
	system( $_[0] ) == 0 or die "Terminated\n" ;
}

# Return the value following option $ARGV[$idx]; abort if the option is the
# last word on the command line.
sub option_value
{
	my ( $idx ) = @_ ;
	die "Option ", $ARGV[$idx], " requires a value.\n" if ( $idx + 1 >= @ARGV ) ;
	return $ARGV[$idx + 1] ;
}

# Open a file for writing or abort with the reason.
sub open_for_write
{
	my ( $path ) = @_ ;
	open( my $fh, ">", $path ) or die "Could not open file $path for writing: $!\n" ;
	return $fh ;
}

# Create a directory unless it already exists; abort when that fails.
sub make_dir
{
	my ( $dir ) = @_ ;
	return if ( -d $dir ) ;
	mkdir( $dir ) or die "Could not create directory $dir: $!\n" ;
}

# Process the arguments
my @bamFiles : shared ;
my $spliceFile = "" ;
my $bamFileList = "" ;
my $stage = 0 ;
my $classesOpt = "" ;
my $juncOpt = "" ;
my $trustSpliceOpt = "" ;
my $voteOpt = "" ;
my $outdir = "." ;
my $mateIdx = -1 ;
my $spliceAvgSupport = 0.5 ;
my $bamGroup = "" ;
for ( $i = 0 ; $i < @ARGV ; ++$i )
{
	if ( $ARGV[$i] eq "--lb" )
	{
		$bamFileList = option_value( $i ) ;
		open( my $fpList, "<", $bamFileList ) or die "Could not open file $bamFileList: $!\n" ;
		while ( my $path = <$fpList> )
		{
			chomp $path ;
			push @bamFiles, $path ;
		}
		close $fpList ;
		++$i ;
	}
	elsif ( $ARGV[$i] eq "-b" )
	{
		push @bamFiles, ( split /,/, option_value( $i ) ) ;
		++$i ;
	}
	elsif ( $ARGV[$i] eq "-s" )
	{
		$spliceFile = option_value( $i ) ;
		++$i ;
	}
	elsif ( $ARGV[$i] eq "-o" )
	{
		my $o = option_value( $i ) ;
		# Parse the -o option to get the folder and prefix
		if ( $o =~ /\// )
		{
			if ( substr( $o, -1 ) ne "/" ) # file prefix is in -o
			{
				my @cols = split /\/+/, $o ;
				$prefix = $cols[-1] ;
				$outdir = join( "/", @cols[0..($#cols-1)] ) ;
			}
			else # -o only specifies path
			{
				$outdir = substr( $o, 0, length( $o ) - 1 ) ;
			}
		}
		else
		{
			$prefix = $o ;
		}

		if ( substr( $prefix, -1 ) ne "_" )
		{
			$prefix .= "_" ;
		}

		++$i ;
	}
	elsif ( $ARGV[$i] eq "--stage" )
	{
		$stage = option_value( $i ) ;
		++$i ;
	}
	elsif ( $ARGV[ $i ] eq "-p" )
	{
		$numThreads = option_value( $i ) ;
		# The worker loops compute "$i % $numThreads", so zero or a
		# non-number must be rejected here.
		die "Option -p requires a positive integer.\n"
			if ( $numThreads !~ /^\d+$/ || $numThreads < 1 ) ;
		$classesOpt .= " -p $numThreads" ;
		++$i ;
	}
	elsif ( $ARGV[ $i ] eq "-c" )
	{
		$classesOpt .= " -c ".option_value( $i ) ;
		++$i ;
	}
	elsif ( $ARGV[ $i ] eq "--mateIdx" )
	{
		$mateIdx = option_value( $i ) ;
		if ( $mateIdx == 1 )
		{
			$classesOpt .= " --hasMateIdSuffix" ;
			$juncOpt .= " --hasMateIdSuffix" ;
		}
		++$i ;
	}
	elsif ( $ARGV[$i] eq "--bamGroup" )
	{
		$bamGroup = option_value( $i ) ;
		++$i ;
	}
	elsif ( $ARGV[ $i ] eq "--sa" )
	{
		$trustSpliceOpt .= " -a ".option_value( $i ) ;
		++$i ;
	}
	elsif ( $ARGV[ $i ] eq "--vd" )
	{
		$voteOpt .= " -d ".option_value( $i ) ;
		++$i ;
	}
	elsif ( $ARGV[$i] eq "--primaryParalog" )
	{
		$classesOpt .= " --primaryParalog" ;
	}
	elsif ( $ARGV[$i] eq "--maxDpConstraintSize" )
	{
		$classesOpt .= " --maxDpConstraintSize ".option_value( $i ) ;
		++$i ;
	}
	elsif ( $ARGV[$i] eq "--version" )
	{
		die "PsiCLASS v1.0.1\n" ;
	}
	else
	{
		die "Unknown argument: ", $ARGV[$i], "\n" ;
	}
}
if ( scalar( @bamFiles ) == 0 )
{
	die "Must use option -b or --lb to specify the bam files.\n" ;
}

# Without --mateIdx, look at the first read ids of the first BAM file: when
# every one of them ends in .1, .2, /1 or /2 the mate suffix option is enabled.
if ( $mateIdx == -1 )
{
	open( my $fpSam, "-|", "$WD/samtools-0.1.19/samtools view ".$bamFiles[0]." | head -1000" )
		or die "Could not run samtools on ".$bamFiles[0].": $!\n" ;
	my $readCnt = 0 ;
	my $flag = 0 ;
	while ( my $record = <$fpSam> )
	{
		my @cols = split /\t/, $record ;
		my $id = $cols[0] ;
		++$readCnt ;

		if ( !( $id =~ /[.\/][12]$/ ) )
		{
			$flag = 1 ;
			last ;
		}
	}
	close $fpSam ;

	# An empty sample (e.g. samtools could not read the BAM) is no evidence
	# for the suffix, so at least one read id is required.
	if ( $flag == 0 && $readCnt > 0 )
	{
		print "Found mate read id index suffix(.1 or /1). Calling \"--mateIdx 1\" option. If this is a false calling, please use \"--mateIdx 0\".\n" ;
		$classesOpt .= " --hasMateIdSuffix" ;
		$juncOpt .= " --hasMateIdSuffix" ;
	}
}

make_dir( $outdir ) ;
make_dir( "$outdir/splice" ) ;
make_dir( "$outdir/subexon" ) ;

# Run $worker->( $index ) for index 0 .. $numThreads-1, one thread each when
# more than one thread is requested. A worker that dies makes join() return
# nothing; that is turned into a failure of the whole pipeline instead of
# being silently ignored.
sub run_workers
{
	my ( $worker ) = @_ ;
	if ( $numThreads == 1 )
	{
		$worker->( 0 ) ;
		return ;
	}

	my @workers ;
	for ( my $t = 0 ; $t < $numThreads ; ++$t )
	{
		my $thr = threads->create( $worker, $t ) ;
		push @workers, $thr ;
	}
	my $failed = 0 ;
	foreach my $thr ( @workers )
	{
		my $ok = $thr->join() ;
		++$failed if ( !$ok ) ;
	}
	die "Terminated\n" if ( $failed > 0 ) ;
}

# Worker $tid of $numThreads: extract the raw splice junctions of every BAM
# file whose index is congruent to $tid.
sub threadRunSplice
{
	my ( $tid ) = @_ ;
	my $i ;
	for ( $i = 0 ; $i < scalar( @bamFiles ) ; ++$i )
	{
		next if ( ( $i % $numThreads ) != $tid ) ;
		system_call( "$WD/junc ".$bamFiles[$i]." -a $juncOpt > $outdir/splice/${prefix}bam_$i.raw_splice" ) ;
	}
	return 1 ;
}

# Generate the splice file for each bam file.
if ( $stage <= 0 )
{
	run_workers( \&threadRunSplice ) ;

	if ( $bamGroup eq "" || $spliceFile ne "" )
	{
		my $fpList = open_for_write( "$outdir/splice/${prefix}splice.list" ) ;
		for ( $i = 0 ; $i < @bamFiles ; ++$i )
		{
			print $fpList "$outdir/splice/${prefix}bam_$i.raw_splice\n" ;
		}
		close $fpList ;

		if ( $spliceFile ne "" )
		{
			for ( $i = 0 ; $i < @bamFiles ; ++$i )
			{
				system_call( "perl $WD/FilterSplice.pl $outdir/splice/${prefix}bam_$i.raw_splice $spliceFile > $outdir/splice/${prefix}bam_$i.splice" ) ;
			}
		}
		else
		{
			system_call( "$WD/trust-splice $outdir/splice/${prefix}splice.list ". $bamFiles[0] ." $trustSpliceOpt > $outdir/splice/${prefix}bam.trusted_splice" ) ;
			for ( $i = 0 ; $i < @bamFiles ; ++$i )
			{
				system_call( "perl $WD/FilterSplice.pl $outdir/splice/${prefix}bam_$i.raw_splice $outdir/splice/${prefix}bam.trusted_splice > $outdir/splice/${prefix}bam_$i.splice" ) ;
			}
		}
	}
	else
	{
		# Line k of the group file names the group of BAM file k.
		my @bamToGroupId ;
		my %groupNameToId ;
		open( my $fpGroup, "<", $bamGroup ) or die "Could not open file $bamGroup: $!\n" ;
		my $groupUsed = 0 ;
		while ( my $name = <$fpGroup> )
		{
			chomp $name ;
			if ( !defined $groupNameToId{$name} )
			{
				$groupNameToId{$name} = $groupUsed ;
				++$groupUsed ;
			}
			push @bamToGroupId, $groupNameToId{$name} ;
		}
		close $fpGroup ;
		die "The --bamGroup file must assign a group to every BAM file.\n"
			if ( scalar( @bamToGroupId ) < scalar( @bamFiles ) ) ;

		# Process each group one by one
		my $group ;
		for ( $group = 0 ; $group < $groupUsed ; ++$group )
		{
			my $fpList = open_for_write( "$outdir/splice/${prefix}splice_$group.list" ) ;
			for ( $i = 0 ; $i < @bamFiles ; ++$i )
			{
				next if ( $bamToGroupId[$i] != $group ) ;
				print $fpList "$outdir/splice/${prefix}bam_$i.raw_splice\n" ;
			}
			close $fpList ;

			system_call( "$WD/trust-splice $outdir/splice/${prefix}splice_$group.list ". $bamFiles[0] ." $trustSpliceOpt > $outdir/splice/${prefix}bam_$group.trusted_splice" ) ;
			for ( $i = 0 ; $i < @bamFiles ; ++$i )
			{
				next if ( $bamToGroupId[$i] != $group ) ;
				system_call( "perl $WD/FilterSplice.pl $outdir/splice/${prefix}bam_$i.raw_splice $outdir/splice/${prefix}bam_$group.trusted_splice > $outdir/splice/${prefix}bam_$i.splice" ) ;
			}
		}
	}
}


# Get subexons from each bam file
# Worker $tid of $numThreads, same work split as threadRunSplice.
sub threadRunSubexonInfo
{
	my ( $tid ) = @_ ;
	my $i ;
	for ( $i = 0 ; $i < scalar( @bamFiles ) ; ++$i )
	{
		next if ( ( $i % $numThreads ) != $tid ) ;
		system_call( "$WD/subexon-info ".$bamFiles[$i]." $outdir/splice/${prefix}bam_$i.splice > $outdir/subexon/${prefix}subexon_$i.out" ) ;
	}
	return 1 ;
}


if ( $stage <= 1 )
{
	run_workers( \&threadRunSubexonInfo ) ;

	my $fpList = open_for_write( "$outdir/subexon/${prefix}subexon.list" ) ;
	for ( $i = 0 ; $i < @bamFiles ; ++$i )
	{
		print $fpList "$outdir/subexon/${prefix}subexon_$i.out\n" ;
	}
	close $fpList ;
}

# combine the subexons.
if ( $stage <= 2 )
{
	$cmd = "$WD/combine-subexons --ls $outdir/subexon/${prefix}subexon.list > $outdir/subexon/${prefix}subexon_combined.out" ;
	system_call( "$cmd" ) ;
}

# Run classes
if ( $stage <= 3 )
{
	my $trimPrefix = substr( $prefix, 0, -1 ) ;
	my $bamPath = "" ;
	if ( $bamFileList ne "" )
	{
		$bamPath = " --lb $bamFileList " ;
	}
	else
	{
		foreach my $b (@bamFiles)
		{
			$bamPath .= " -b $b " ;
		}
	}
	$cmd = "$WD/classes $classesOpt $bamPath -s $outdir/subexon/${prefix}subexon_combined.out -o $outdir/${trimPrefix} > $outdir/${prefix}classes.log" ;
	system_call( "$cmd" ) ;
}

# Run voting
if ( $stage <= 4 )
{
	my $fpList = open_for_write( "$outdir/${prefix}gtf.list" ) ;
	for ( $i = 0 ; $i < @bamFiles ; ++$i )
	{
		print $fpList "$outdir/${prefix}sample_${i}.gtf\n" ;
	}
	close $fpList ;
	$cmd = "$WD/vote-transcripts --lg $outdir/${prefix}gtf.list $voteOpt > $outdir/${prefix}vote.gtf" ;
	system_call( "$cmd" ) ;
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/.gitignore Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,4 @@ +*.o +.*.swp +*.a +*.dSYM
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/AUTHORS Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,20 @@ +Heng Li from the Sanger Institute wrote most of the initial source code +of SAMtools and various converters. + +Bob Handsaker from the Broad Institute is a major contributor to the +SAM/BAM specification. He designed and implemented the BGZF format, the +underlying indexable compression format for the BAM format. BGZF does +not support arithmetic between file offsets. + +Jue Ruan from the Beijing Genome Institute designed and implemented the +RAZF format, an alternative indexable compression format. RAZF supports +arithmetic between file offsets, at the cost of increased index file +size and the full compatibility with gzip. RAZF is optional and only +used in `faidx' for indexing RAZF compressed fasta files. + +Colin Hercus updated novo2sam.pl to support gapped alignment by +novoalign. + +Petr Danecek contributed the header parsing library sam_header.c and +sam2vcf.pl script and added knet support to the RAZF library. +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/COPYING Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,21 @@ +The MIT License + +Copyright (c) 2008-2009 Genome Research Ltd. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/ChangeLog.old Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,3875 @@ +commit db2ad3e19068cbafde72ecde75d0638bbb3598ba +Author: Heng Li <lh3@live.co.uk> +Date: Thu Feb 16 14:45:17 2012 -0500 + + removed downsample.c + +commit 6c55c576903992c6fef148fe3b606fbc8bd10655 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Feb 16 14:45:06 2012 -0500 + + print to output + +commit db1044a34e6049c87eaa63c39ed6e56f03e7d4c1 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Feb 16 14:39:34 2012 -0500 + + removed sample + + Downsampling already exists in "view". View also keeps pairing while "sample" does not. + +commit ffdeed3e5d4a530bfdf6f9ba97fff0ba7add6cba +Merge: 2daad7b accf026 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Feb 16 14:22:15 2012 -0500 + + Merge branch 'master' of github.com:lh3/samtools + +commit accf0260fd1117e10047344345d40b31a9ec31bb +Merge: 9134e0d c554160 +Author: Heng Li <lh3@me.com> +Date: Thu Feb 16 11:21:14 2012 -0800 + + Merge pull request #8 from nh13/master + + Patches + +commit c554160df16ec7748cfdda4c7b54c641be7b809f +Author: Nils Homer <nils.homer@lifetech.com> +Date: Thu Feb 16 14:06:52 2012 -0500 + + * more README.md work + +commit 2a81ffe349208d917666808fbc9f3041e0cb57de +Author: Nils Homer <nils.homer@lifetech.com> +Date: Thu Feb 16 14:06:10 2012 -0500 + + * more README work + +commit fb3125f732715f62cded8685a23a002a96ce009b +Author: Nils Homer <nils.homer@lifetech.com> +Date: Thu Feb 16 14:05:19 2012 -0500 + + * more README work + +commit 444d41002c37e1c3d0f9208b4a88126c47276386 +Author: Nils Homer <nils.homer@lifetech.com> +Date: Thu Feb 16 14:02:13 2012 -0500 + + * updating README + +commit dec53cb1043fe7efadfde75fa2fd39b76de22e54 +Author: Nils Homer <nils.homer@lifetech.com> +Date: Thu Feb 16 13:55:01 2012 -0500 + + updating the README for markdown syntax + +commit 798da18c346dca8ec6005582a0ddb1d5420b04ca +Author: Nils Homer <nils.homer@lifetech.com> +Date: Thu Feb 16 
13:48:35 2012 -0500 + + adding a README with the current differences between this repository and + the official one + +commit 4d22d86c0f28636662f2144a88cd168e104c4275 +Author: Nils Homer <nils.homer@lifetech.com> +Date: Thu Feb 16 13:35:03 2012 -0500 + + adding "samtools sample" to the main + +commit 893c25a37c21005dc42f45d45e9ad78ddc5f29bb +Author: Nils Homer <nils.homer@lifetech.com> +Date: Thu Feb 16 13:33:51 2012 -0500 + + * removing some compile flags to work with OS X + +commit 7ac22f72fdc32edd5c24af6baebfa7db5faf8e7b +Author: Jonathan Manning <jonathan.manning@lifetech.com> +Date: Thu Feb 16 10:47:14 2012 -0500 + + Check write filehandle after opening for write. tamw/tamr is a union type, so change is only semantic. + + Signed-off-by: Nils Homer <nils.homer@lifetech.com> + +commit fef53330416631690f60fdff42b6e43d764170dc +Author: Jonathan Manning <jonathan.manning@lifetech.com> +Date: Thu Feb 16 10:44:59 2012 -0500 + + Catch and report invalid BAM header, instead of segfaulting later on. + + Signed-off-by: Nils Homer <nils.homer@lifetech.com> + +commit 5cc013fe4930bf9b6e7963aab1cd4a3c94f695bc +Author: Jonathan Manning <jonathan.manning@lifetech.com> +Date: Thu Feb 16 10:44:16 2012 -0500 + + Add downsample to examples. + + Signed-off-by: Nils Homer <nils.homer@lifetech.com> + +commit b3fa9e7071532905a81dc7aa48eadc24b8c8846b +Author: Jonathan Manning <jonathan.manning@lifetech.com> +Date: Thu Feb 16 10:43:48 2012 -0500 + + Adjust for leading hard clip on colorspace reads. + + Signed-off-by: Nils Homer <nils.homer@lifetech.com> + +commit 1a9296c1389469d1c1db5b8069f0e11ffcc8abb2 +Author: Jonathan Manning <jonathan.manning@lifetech.com> +Date: Thu Feb 16 10:42:52 2012 -0500 + + Add samtools sample command, contributed by Davide Cittaro <davide.cittaro@ifom-ieo-campus.it>. 
+ + Signed-off-by: Nils Homer <nils.homer@lifetech.com> + +commit 2a804f3379748aeba944f1dec306dd726ff3235e +Author: Jonathan Manning <jonathan.manning@lifetech.com> +Date: Thu Feb 16 10:42:07 2012 -0500 + + Add samtools qa command, contributed by Roman Valls Guimera <roman.valls.guimera@scilifelab.se>. + + Signed-off-by: Nils Homer <nils.homer@lifetech.com> + +commit 0f3207fe8fd93e44d40fcf57204079c8c06d24a6 +Author: Jonathan Manning <jonathan.manning@lifetech.com> +Date: Thu Feb 16 10:39:08 2012 -0500 + + Makefile cleanup - allow CC, CFLAGS, LDFLAGS to be passed on make command line. Use LDFLAGS in samtools compile. + + Signed-off-by: Nils Homer <nils.homer@lifetech.com> + +commit 6e7df604025f6a86881bf7f4a16f30e15d31538a +Author: Jonathan Manning <jonathan.manning@lifetech.com> +Date: Thu Feb 16 10:31:15 2012 -0500 + + Allow max_mem for sort to be specified with units. + + Signed-off-by: Nils Homer <nils.homer@lifetech.com> + +commit f12ebcaf6e60d34180a27d70e09b743cef140b98 +Author: Jonathan Manning <jonathan.manning@lifetech.com> +Date: Thu Feb 16 10:29:11 2012 -0500 + + Allow user defined [lowercase] tags in header elements. + + Signed-off-by: Nils Homer <nils.homer@lifetech.com> + +commit 50b931fa3312dc109537a4260698ddecd0f06a05 +Author: Jonathan Manning <jonathan.manning@lifetech.com> +Date: Thu Feb 16 10:27:11 2012 -0500 + + Check lowerbound in text entry box to avoid segfault in tview. Remove redundant call to bam_aux_get. 
+ + Signed-off-by: Nils Homer <nils.homer@lifetech.com> + +commit 5e729da5190949a813d20d329eab7ddb661816bd +Author: Nils Homer <nils.homer@lifetech.com> +Date: Thu Feb 16 10:31:48 2012 -0500 + + * fixing overflow/underflow in integer parsing + +commit fa50a4330b9abedaf07c26e13d31f05e57f1d319 +Author: Nils Homer <nils.homer@lifetech.com> +Date: Thu Feb 16 10:30:40 2012 -0500 + + * updating help message for samtools depth + +commit 79e52c9624b6dd3bdfdf439f4b4bc6f774c230a4 +Author: Nils Homer <nils.homer@lifetech.com> +Date: Thu Feb 16 10:29:32 2012 -0500 + + * adding support for outputting a circos histogram file in "samtools depth". Use + the "-c/-B" options. + +commit 2daad7b52daa86561c0fb65fe366691fad9f5ed3 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Feb 16 09:31:57 2012 -0500 + + bugfix: wrong SP; missing DV in the VCF hdr + +commit 9134e0d5047c281ef3bd53da91771d4814a5131c +Author: Heng Li <lh3@live.co.uk> +Date: Wed Feb 8 11:19:12 2012 -0500 + + missing support of DV + +commit 34ebf12078c1d1015a0b8b9a9221243a60b22893 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Feb 8 11:08:56 2012 -0500 + + new BCF DV format: number of variant reads + +commit 9589d3312fa2d076f48bdd68e2a5edd419c8070c +Author: Heng Li <lh3@live.co.uk> +Date: Tue Jan 10 10:30:27 2012 -0500 + + scale depth to quality (hidden option) + +commit 704473e14668333ecaca5fb7b238af405c43e3b1 +Author: Heng Li <lh3@live.co.uk> +Date: Tue Jan 10 10:18:17 2012 -0500 + + really nothing + +commit 01b307fd287962372bbf07461c88b54f41636817 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Dec 7 13:07:42 2011 -0500 + + added an example containing 'B' + +commit c678791f0451ceb9205c1ab5c52c84641863c99a +Author: Heng Li <lh3@live.co.uk> +Date: Sat Dec 3 12:10:30 2011 -0500 + + 'B' now moves backward w.r.t. 
the query + +commit 152119bc06a073933ca830e8e1407538e44626cc +Author: Heng Li <lh3@live.co.uk> +Date: Fri Dec 2 10:50:12 2011 -0500 + + better consensus; a little more robust + +commit 454da4754ac503edda5b1329b67757d797e46e07 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Dec 2 00:20:22 2011 -0500 + + in pileup call remove_B() + +commit ff2bcac1cc078ba1879f18c89cfae314439d7086 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Dec 2 00:17:32 2011 -0500 + + working on a few toy examples + +commit 745ca7260158d6df7897b52598033ffb055a9e4f +Author: Heng Li <lh3@live.co.uk> +Date: Thu Dec 1 22:55:39 2011 -0500 + + bam_remove_B(); not tested + +commit 07e4cdc7300abfcc82e03105b4689f95cab551cd +Author: Heng Li <lh3@live.co.uk> +Date: Thu Nov 10 12:58:55 2011 -0500 + + baseQ threshold on plain pipleup; removed -E + +commit 322ebf2082dfa91df44b3a996d26c85357e5d5a2 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Oct 19 09:28:04 2011 -0400 + + fixed two gcc warnings + +commit a632457b4c4adc50d833b56b5a5231feafaf8193 +Author: Heng Li <lh3@live.co.uk> +Date: Tue Oct 4 10:13:23 2011 -0400 + + change size_t to uint32_t in bam_header_t + + This may cause issues on 64-bit big-endian machines. Reported and fixed by Paolo Emilio Mazzon. 
+ +commit af31bf5a78aea03baf6eb90fe50076549d499f6e +Author: Heng Li <lh3@live.co.uk> +Date: Mon Sep 26 20:17:57 2011 -0400 + + rename pad2unpad to depad + +commit 77b198b73dfad1048e5d1c5a64aa75ee7b90f596 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Sep 23 01:22:40 2011 -0400 + + convert padded BAM to unpadded BAM + +commit adb9e2342b7b7501d9527d3c23afab10469ae2c6 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Sep 7 11:40:50 2011 -0400 + + generate template cigar with "fixmate" + +commit 46e5ab445a0fe880216cbc0daf1225725b569d7a +Author: Heng Li <lh3@live.co.uk> +Date: Fri Sep 2 12:50:18 2011 -0400 + + update kseq.h to the latest version + +commit 68e9e4a73eb91405bb3e56bf0cdaf12d1b487abb +Author: Heng Li <lh3@live.co.uk> +Date: Fri Sep 2 12:44:45 2011 -0400 + + Release samtools-0.1.18 + +commit aa06bdadb2d109a79f927f478102f96a1f5fd258 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Sep 2 12:14:17 2011 -0400 + + updated the revision number + +commit 267e1e1b6e54c0ab24f94cd9aee9cbd2d1923f9f +Merge: 19ff1d3 aebab30 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Sep 2 12:13:08 2011 -0400 + + Merge https://github.com/lh3/samtools into reduce + + Conflicts: + bam_md.c + + Fixed a few typos in the merge + +commit aebab302399c24eaa6c5ab79d13d6bd5e2e9ea9a +Merge: c2c63d0 da62663 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Sep 2 09:03:49 2011 -0700 + + Merge pull request #4 from peterjc/x_equals2 + + Implement basic support for =/X CIGAR operations + +commit 19ff1d3d7f47d7e61b121292aefe5a74bb8a18d2 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Aug 25 16:38:12 2011 -0400 + + reduce BAM size (experimental) + +commit da626630fd98fd4e07ceb4d58c5c9a42d312a85d +Author: peterjc <p.j.a.cock@googlemail.com> +Date: Mon Aug 22 06:58:08 2011 +0100 + + Support =/X CIGAR operations (treated like M) + +commit 461d8003529db77a4d5ecbd108312e868b051a3d +Author: peterjc <p.j.a.cock@googlemail.com> +Date: Mon Aug 22 05:52:56 2011 +0100 + + Define CIGAR equals and X operationss (7 and 8) + +commit 
c2c63d067113baab41f3bc35fb28f4f00578accb +Merge: 7ab3ef3 9a0ed9a +Author: Heng Li <lh3@live.co.uk> +Date: Thu Aug 18 17:21:54 2011 -0700 + + Merge pull request #3 from peterjc/x_equals + + Accept SAM files using = in CIGAR (treats X and = as M) + +commit 9a0ed9a6b85c7981465f459300208dbd93e3c6f5 +Author: peterjc <p.j.a.cock@googlemail.com> +Date: Thu Aug 18 19:28:52 2011 +0100 + + Accept SAM files using = in CIGAR (treats X and = as M) + +commit 7ab3ef388c1eb34d7912fd70cc5656c955240263 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Aug 8 10:22:22 2011 -0400 + + bugfix: indexing takes huge memory + + This happens when an unmapped mate has coordinate 1. Thank Joel Martin for the fix. + +commit a3f6738593e944354a8f75306687d8b3acf08bf1 +Merge: a8bdca9 bc67ea2 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Aug 8 09:52:26 2011 -0400 + + Merge branch 'master' of github.com:lh3/samtools + +commit bc67ea225da653f36a70b38382d6111dd494f659 +Author: Petr Danecek <pd3@sanger.ac.uk> +Date: Thu Jul 28 20:03:16 2011 +0100 + + Variant Distance Bias + +commit deb578f0c49d0b7d8c3bc6be220b4d67e2e7dfdf +Author: Petr Danecek <pd3@sanger.ac.uk> +Date: Tue Jul 26 09:57:37 2011 +0100 + + If there is only one RG tag present in the header and reads are not annotated, don't refuse to work but use the tag instead. 
+ +commit a8bdca9cf482a637b89ee4f98469a93e0ab5e69b +Author: Heng Li <lh3@live.co.uk> +Date: Mon Jul 25 10:10:55 2011 -0400 + + bugfix: LRT2=nan + +commit 0afe33137d046a3e849eeb4a54590f27cbad4228 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Jul 22 21:55:38 2011 -0400 + + fixed a bug/typo + +commit 62d5849658c10222d40308c6b53ab4f99a448494 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Jul 15 16:04:19 2011 -0400 + + allow to set see in subsampling + +commit 5f46243824cc9435b167973e1d51e13128794ea1 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Jul 15 15:54:47 2011 -0400 + + support subsampling + +commit 5e55b6f34fc86cba7cf98d52ccaed405c3ffabbc +Author: Heng Li <lh3@live.co.uk> +Date: Fri Jul 15 15:53:38 2011 -0400 + + support indels + +commit f31c162926d6f43e8b60171789a258d02e1f9be5 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Jul 7 17:02:33 2011 -0400 + + do not count indel with "view -Y" + +commit e412dae587883b4c17e5fbf4b7c33f38bfa8458a +Author: Heng Li <lh3@live.co.uk> +Date: Thu Jul 7 00:35:25 2011 -0400 + + for WIN32 compatibility + +commit 70a52501bcfa63824749893a5ab8ed3c38e34958 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Jul 7 00:32:46 2011 -0400 + + for WIN32 compatibility + +commit 00438f14ed5984f08e8f7645a9b95644a812f969 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Jul 6 23:41:45 2011 -0400 + + fixed an uninitialized variable + +commit 7609c4a01059c326544b3d0142dfe9c4229d68c6 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Jul 6 23:39:31 2011 -0400 + + fixed an uninitialized variable + +commit cec7189a412f80ccb068a73bd28528915c16b0bf +Author: Heng Li <lh3@live.co.uk> +Date: Wed Jul 6 22:53:19 2011 -0400 + + Release samtools-0.1.17 + +commit 93c06a249de3bb666029bf07b66de5e8e5e314fa +Author: Heng Li <lh3@live.co.uk> +Date: Wed Jul 6 09:46:09 2011 -0400 + + bugfix: incorrect idxstats for the last seq + + Again, this bug is caused by 3rd-party code for the sorting order checking. 
+ +commit 84f6ca62db6e27b8c4c711e7b5f3ca704bf27b4f +Author: Heng Li <lh3@live.co.uk> +Date: Tue Jul 5 23:30:23 2011 -0400 + + output mapping quality in the old pileup format + +commit 362e05fd670886acaede69b864903d730b9db3ca +Author: Heng Li <lh3@live.co.uk> +Date: Tue Jul 5 21:59:22 2011 -0400 + + added a brief description of the VCF format + +commit e690a696468205e0cc4560016361c997660dd496 +Author: Heng Li <lh3@live.co.uk> +Date: Tue Jul 5 16:23:10 2011 -0400 + + improved samtools manual page + +commit 362b4a1408ef3c32311d638aa8d85ce39c1c7b2d +Author: Heng Li <lh3@live.co.uk> +Date: Tue Jul 5 15:58:29 2011 -0400 + + merge bcftools.1 to samtools.1 + +commit 643e0e61ba7266efbc9e5bfcb8e41f369ba2ce0a +Author: Heng Li <lh3@live.co.uk> +Date: Tue Jul 5 13:39:02 2011 -0400 + + mpileup: when region set, set reference properly + +commit 613e4d67624a94f62563935fbd5cc294df69605a +Author: Heng Li <lh3@live.co.uk> +Date: Mon Jul 4 23:29:02 2011 -0400 + + compute the min PL diff + +commit 5b7d5d3f52b97ca42c8500eede808dab88a46a53 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Jul 4 22:57:48 2011 -0400 + + rename trio.c to mut.c + +commit 84fe96ad64b0365ead93a4115d1684b9bebb98fc +Author: Heng Li <lh3@live.co.uk> +Date: Sun Jul 3 15:38:51 2011 -0400 + + added pair caller interface; not tested + +commit 2f2867b87b84c35319cc416d6173819d5c8a4e8c +Author: Heng Li <lh3@live.co.uk> +Date: Sun Jul 3 15:24:23 2011 -0400 + + inital implementation of a pair caller + +commit e97653cf2ad653c95886933c42a2b5492ccab5ff +Author: Heng Li <lh3@live.co.uk> +Date: Sun Jul 3 00:06:28 2011 -0400 + + convert bam to single-end fastq + +commit e8013e11f7a8db0a8d18c60d130169cca39bf2bd +Author: Heng Li <lh3@live.co.uk> +Date: Sat Jul 2 14:39:18 2011 -0400 + + improve BED parsing + +commit 1025714325fdc636aeee47a76db8dafbbbfde64b +Author: Heng Li <lh3@live.co.uk> +Date: Fri Jul 1 14:19:54 2011 -0400 + + update the manual page + +commit 8022d0039dff47b1c11b2421357d510c1f28ae15 +Author: Heng Li <lh3@live.co.uk> 
+Date: Fri Jul 1 14:17:03 2011 -0400 + + output the best constrained genotypes in trio + +commit 18c87295e12f5bebafdcae00d52000fb94c8a566 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Jul 1 11:18:14 2011 -0400 + + added documentations for view -T + +commit daf7a8d96bd495296bf7c7d99cddb808a3ced7d5 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Jun 30 22:45:20 2011 -0400 + + fixed a bug in writing SP + +commit e5c32bf9b28c6e3e861db88de56b5dbe11058b61 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Jun 30 22:35:25 2011 -0400 + + optionally output read positions in mpileup + +commit 1008051155ec994c1901e18f3eb03ea32a62e5d7 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Jun 30 22:17:25 2011 -0400 + + make faidx works with <2GB lines + +commit 2daebb63762425dd3074ddf71582ad189001e394 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Jun 30 17:28:58 2011 -0400 + + fixed an issue in the trio caller and the indel caller + +commit 9fdd52cf0716fb342a94946433d564b28b230835 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Jun 30 13:34:01 2011 -0400 + + Added trio caller; NOT tested yet + +commit ea22a8ed83625e9c82382b56acc42a2d9cfd17e5 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Jun 30 11:42:29 2011 -0400 + + convert PL to 10-likelihood GL + +commit 10d7065267b0d12c2bfcb6c70204fb6944cd395d +Author: Heng Li <lh3@live.co.uk> +Date: Thu Jun 30 10:49:05 2011 -0400 + + fix a compatibility issue with the new bcftools + +commit d340f01f609c61b719d38a6a55629a3fc899e1cd +Author: Heng Li <lh3@live.co.uk> +Date: Sun Jun 26 23:41:20 2011 -0400 + + allow to ignore RG + +commit d6321faf98ebfe899b9409fb23c90a4aa8c6b542 +Author: Heng Li <lh3@live.co.uk> +Date: Sun Jun 5 23:05:21 2011 -0400 + + fixed a bug in SO checking due to a recent change + +commit bc995abf666d0c9ab4258f6c1b3518a45a89209f +Author: Heng Li <lh3@live.co.uk> +Date: Fri Jun 3 14:45:36 2011 -0400 + + update the version number + +commit 9e7cd83a08383858d008e0ccb2238a2b93831d6c +Author: Heng Li <lh3@live.co.uk> +Date: Fri Jun 3 14:43:12 2011 
-0400 + + smarter way to parse a region string + +commit e58a90a0fde54053dac65352b34c13c3fea815fc +Author: Heng Li <lh3@live.co.uk> +Date: Wed Jun 1 14:36:22 2011 -0400 + + output LRT2 instead of LRT1 + +commit 08f78c9af3e5661f04f80bef424232de721dba03 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Jun 1 14:02:28 2011 -0400 + + genotype test, but assuming 1-degree + +commit 587b852340d7e60f6f7cf474a92ef77aeab46018 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Jun 1 12:55:19 2011 -0400 + + perform 2-degree test by default + +commit 3d38e403c5c830478b7eb157a484776997440501 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Jun 1 12:44:34 2011 -0400 + + fixed a typo; but the result is still not good + +commit 06291624f7dcc57445676f3be25d0bc355dd7110 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Jun 1 12:24:18 2011 -0400 + + fixed a typo + +commit 63b98aa33636b0d82a435bf49153c8c1502e7d42 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Jun 1 12:23:37 2011 -0400 + + added HWE+F<0 filter + +commit 37d926e8999999b593d0637ab7dc379dbd3d6006 +Author: Heng Li <lh3@live.co.uk> +Date: Wed May 4 10:11:59 2011 -0400 + + improved sorting order checking in index + + Patches from Jonathan Manning + +commit 1c2dc6762c5f7cd946046b53346513f2f9761dbf +Author: Heng Li <lh3@live.co.uk> +Date: Tue May 3 23:09:05 2011 -0400 + + added r^2 estimate; added Brent's method + +commit c2d3bcd8f98e31668b5f1321222fbc6fd6336e75 +Author: Heng Li <lh3@live.co.uk> +Date: Sun May 1 23:45:23 2011 -0400 + + combine several utilites into vcfutils.lua + +commit be2e7362d7593ea4d03fb33cdb6af2aa096ca6c4 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Apr 27 21:09:22 2011 -0400 + + minor warning + +commit 683ef0443860813d743cf84fa86dda9bfaf5445a +Author: Heng Li <lh3@live.co.uk> +Date: Wed Apr 27 10:10:38 2011 -0400 + + added versioning + +commit ed72f25ec85671f7646dbc92fa7b5b1dda427f7d +Author: Heng Li <lh3@live.co.uk> +Date: Wed Apr 27 10:04:02 2011 -0400 + + Output ML allele count + +commit 
2a9e36d2d6c405b2411ca47458f028ada8fe1000 +Author: Heng Li <lh3@live.co.uk> +Date: Tue Apr 26 16:14:20 2011 -0400 + + use ar -s + +commit 7a4f54e6dbcd7c94acbb3f1050a93f94b8a07949 +Author: Heng Li <lh3@live.co.uk> +Date: Sat Apr 23 01:22:31 2011 -0400 + + added another type of LRT + +commit b9c5e84762a4aacce3a3771b51ea80967c79a2e5 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Apr 22 16:00:31 2011 -0400 + + added version + +commit 8fad6677c5952efd67391581d64e67e02e7f6e68 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Apr 22 00:30:19 2011 -0400 + + remove the pileup command + +commit 3a962fb6ebf779de70f9e6effb2d8701a9aa3dd9 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Apr 21 23:10:45 2011 -0400 + + Release 0.1.16 (r963:234) + +commit b4d683cffbd98c43f05aff8610b37d63dd7e54aa +Author: Heng Li <lh3@live.co.uk> +Date: Thu Apr 21 12:44:44 2011 -0400 + + fixed a bug when coordinate-less reads are on the reverse strand + +commit c5ec45a128f409debc6a56a798024f53004037dc +Author: Heng Li <lh3@live.co.uk> +Date: Wed Apr 20 11:36:52 2011 -0400 + + added option '-f' to merge to avoid overwritting + +commit 68d431531370d24907c01a27f166f2341d7c4d35 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Apr 20 10:26:58 2011 -0400 + + do not print a warning + +commit 32922607e51ad2260c337eb022b9e4aedacb049f +Author: Heng Li <lh3@live.co.uk> +Date: Wed Apr 20 10:21:06 2011 -0400 + + Added ldpair to compute LD between requested pairs + +commit b8d6fa71b91678fa02338257e0707d1e5ca098dd +Author: Heng Li <lh3@live.co.uk> +Date: Sun Apr 17 21:51:43 2011 -0400 + + On a toy sample, type "B" seems to be accepted + +commit 0e7ee9a6bb4029184202aa6e6738105ba0c0510b +Author: Heng Li <lh3@live.co.uk> +Date: Sun Apr 17 21:21:20 2011 -0400 + + added type "B"; not tested yet + +commit a513dfad0ac0062b03871eb6ecf26cb8d18dc895 +Author: Heng Li <lh3@live.co.uk> +Date: Sun Apr 17 19:25:54 2011 -0400 + + fixed a bug in bedidx.c: input BED not sorted + +commit de1e192bb0a8a762a54a6eee81d882fab01c3d32 +Author: Heng Li 
<lh3@live.co.uk> +Date: Sun Apr 17 18:51:08 2011 -0400 + + by default, always perform posterior chi^2 + +commit df6e0d1099895fc6cd7a19dc89fba95ed6654d35 +Author: Heng Li <lh3@live.co.uk> +Date: Sat Apr 16 12:33:28 2011 -0400 + + added debugging + +commit 8ce52e024dc2ef361dbd5399c232163055057e70 +Author: Heng Li <lh3@live.co.uk> +Date: Sat Apr 16 00:59:05 2011 -0400 + + avoid a segfault given wrong input + +commit e66b6684fc9a397f91ec29fdeecae9f8eb986a55 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Apr 15 19:55:39 2011 -0400 + + do not segfault when there is no PL + +commit 9ce3c584ec0cebfa45576f2ef538df4dad2b7e55 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Apr 15 11:59:55 2011 -0400 + + remove another unused part + +commit f53a051d68bf312ac8d5865210fae7a9808c0fb9 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Apr 15 10:41:25 2011 -0400 + + print G3 if HWE is small + +commit 4b2c08bb86ca4ed4959e4cb77a28f7d6fc19f5c9 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Apr 15 10:04:34 2011 -0400 + + fixed a bug + + actually not fix, but hide it + +commit 088e13c32453fb533b7bb1c65a573f9b90a23625 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Apr 15 09:48:47 2011 -0400 + + added LRT based permutation; not used though + +commit 1e3c2001afcb80b5eaa4c3f88df9da7b01b62524 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Apr 15 09:28:55 2011 -0400 + + Perform posterior contrast for small LRT + + Posterior contrast is much slower than LRT. Nonetheless, posterior P-value is + more robust to sequencing artifacts. Thus we may combine the two to achieve a + balance between speed and low FPR. 
+ +commit 6f1b066270902198a7175ff6c1b05ebc8d1919be +Author: Heng Li <lh3@live.co.uk> +Date: Fri Apr 15 01:36:06 2011 -0400 + + Added Brent's method + +commit 3d061e5db25b67b25f6ff87afe4162e121354232 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Apr 14 23:30:10 2011 -0400 + + fixed a typo in printing + +commit 7fd14ceb5990bb350b8e97346ef3537d80058def +Author: Heng Li <lh3@live.co.uk> +Date: Thu Apr 14 23:14:23 2011 -0400 + + fixed a stupid bug + +commit f5b2c3459ec098b3cafd9619b9077132516baf58 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Apr 14 22:42:35 2011 -0400 + + separate EM and posterior + + Now, constrast is not performed unless -C is in use. EM can be invoked + independently with -e without computing the posterior. + +commit 9eefcac963697fae554789b11ae3cb2c23f224d0 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Apr 14 22:00:19 2011 -0400 + + further code cleanup; prepare to add EM interface + +commit c2cce52355262743711e4742b0c8542bfcab1cdd +Author: Heng Li <lh3@live.co.uk> +Date: Thu Apr 14 21:44:03 2011 -0400 + + drop EM from prob1 + +commit 24016f04bd3bdffb7eeb50cb25854f5007feb70f +Author: Heng Li <lh3@live.co.uk> +Date: Thu Apr 14 21:08:33 2011 -0400 + + drop posterior LRT; prepare for clean up + +commit 3670d8bd88c3eb22873f0a80e2a5913f64ca8c9a +Author: Heng Li <lh3@live.co.uk> +Date: Thu Apr 14 20:57:43 2011 -0400 + + better initial values for LD + +commit d48a8873c060b18b57799cfe3a0e5496ba069457 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Apr 14 20:36:25 2011 -0400 + + finished EM + +commit b101f2db476188a950c23f5c1b6185fdb7f8f40b +Author: Heng Li <lh3@live.co.uk> +Date: Wed Apr 13 01:19:04 2011 -0400 + + genotype frequency estimate + +commit d79bdcbf6242ecfb8accba9ac9a22fbcbd543cf2 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Apr 13 00:37:22 2011 -0400 + + prepare for code clean up + +commit e0ce416abfc094f0c090957080b1404fd0edf752 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Apr 13 00:34:15 2011 -0400 + + rename ld.c to em.c + +commit 
45ede3ad181f35c1be24bed5d75841e472357ab7 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Apr 13 00:22:10 2011 -0400 + + implemeted EM likelihood ratio test + + The idea is learned from a brief chat with Rasmus Nielsen. + +commit 0454a346b60e42b75a2f742272089810279c7131 +Author: Heng Li <lh3@live.co.uk> +Date: Tue Apr 12 15:45:52 2011 -0400 + + added likelihood-ratio test (idea from Nick) + +commit f6287c8646c690440a1554c8958e7268f4134dc2 +Author: Heng Li <lh3@live.co.uk> +Date: Sun Apr 10 18:24:37 2011 -0400 + + Release samtools-0.1.15 (r949:203) + +commit de6023f38f4d652438557cf7a0ac6eec324e7416 +Author: Heng Li <lh3@live.co.uk> +Date: Sun Apr 10 15:54:58 2011 -0400 + + improved help information + +commit d3b337f2b7eda1e6f8f5575a19d1b5ed55cae279 +Author: Heng Li <lh3@live.co.uk> +Date: Sat Apr 9 16:28:01 2011 -0400 + + fixed a minor issue + +commit 82f6e4f49247e75fbd8ec08c285b8d3047b3d235 +Author: Heng Li <lh3@live.co.uk> +Date: Sat Apr 9 15:49:04 2011 -0400 + + separate QC-pass and QC-fail reads + +commit 8362b4a255081ee7ca0a4ca2eabc8c76758b6863 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Apr 8 17:45:19 2011 -0400 + + added verbose level + +commit f7bf419c290462be7d289249a4a6d28f825b4c93 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Apr 8 16:08:14 2011 -0400 + + fixed a bug + +commit 890cbb1ac93b3004fb6cf42ff47195077dcfc8ad +Author: Heng Li <lh3@live.co.uk> +Date: Fri Apr 8 16:00:37 2011 -0400 + + drop unrelated @RG when "-R" is in use + +commit a62dc929c950fb51311b705f5b5bfba8e3f704d7 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Apr 8 16:00:14 2011 -0400 + + skip header validation + +commit 39da810e2c56c8f0eff1ab726600b41f26d3d8e9 +Author: Heng Li <lh3@live.co.uk> +Date: Tue Apr 5 23:52:22 2011 -0400 + + change error message + +commit c0c50a34df250ef8a7a29b172058cd229be582b5 +Author: Heng Li <lh3@live.co.uk> +Date: Tue Apr 5 23:50:46 2011 -0400 + + fixed a bug caused by recent modifications + +commit 25226e8c468404cb5e1b5272efcea57e4193c762 +Author: Heng Li 
<lh3@live.co.uk> +Date: Tue Apr 5 13:31:19 2011 -0400 + + reduce the indel filtering window + +commit 5e18d7014437734f9dac9ab45a95e43ec2526101 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Apr 4 13:56:20 2011 -0400 + + only output hwe if it is small enough + +commit 614941fb7dd276de662e7820eb8c7bae871a18cc +Author: Heng Li <lh3@live.co.uk> +Date: Mon Apr 4 13:34:02 2011 -0400 + + added HWE back + +commit 7abe8825aa0bacccdeb38125934ae94d18f0ad4d +Author: Heng Li <lh3@live.co.uk> +Date: Mon Apr 4 12:46:24 2011 -0400 + + EM estimate of genotype frequency + +commit 2bfeff9c645d177416664f1cb811e85cac3ff9e3 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Apr 4 11:29:12 2011 -0400 + + minor + +commit 401e40647e7e3abbac6e4ec3d8bb68eb6f2d401b +Author: Heng Li <lh3@live.co.uk> +Date: Mon Apr 4 11:24:04 2011 -0400 + + Added genotype freq estimate and association test + +commit 6cc226df6e3b480f1bd6e763ce8ef47f785bbb74 +Author: Heng Li <lh3@live.co.uk> +Date: Sun Apr 3 20:57:23 2011 -0400 + + minor changes + +commit 7e47a39630e812f09b80369f14606245976f687e +Author: Heng Li <lh3@live.co.uk> +Date: Fri Apr 1 15:21:59 2011 -0400 + + print the grayscale + +commit 2f675d9c0dde3c166c99e335fa17c7873a5ae8d5 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Apr 1 08:55:16 2011 -0400 + + change to comment + +commit 0592bb514994544ed84f51e509b233cf8821e0cf +Author: Heng Li <lh3@live.co.uk> +Date: Fri Apr 1 08:54:35 2011 -0400 + + added base quality filtering + +commit fc1b47e04a7b94f6362c45856cbeb89d9d0b5ca5 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Mar 31 23:31:14 2011 -0400 + + fixed a few typos in comments + +commit 60be79bc8f0d24656e5e8a329af7e9b5b91d4c8b +Author: Heng Li <lh3@live.co.uk> +Date: Thu Mar 31 23:13:23 2011 -0400 + + comments + +commit 2432864acc25ebe5cee4217dbb0120439077a7f8 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Mar 31 22:42:46 2011 -0400 + + added bam2depth.c, a demo program + +commit 39625f7c6bea9ccbfd9af0feb22348d52079f012 +Author: Heng Li <lh3@live.co.uk> +Date: Thu 
Mar 31 16:37:22 2011 -0400 + + added bgzf_check_bgzf() (used by tabix) + +commit 6de6bd3fb67fd22753a5f07d4cc25bf94e1b5a8c +Author: Heng Li <lh3@live.co.uk> +Date: Thu Mar 31 16:37:08 2011 -0400 + + fixed a bug in bedidx.c + +commit 3b9e257d25b2e81eed1625bc5d2882ed486ef20e +Author: Heng Li <lh3@live.co.uk> +Date: Wed Mar 30 13:27:15 2011 -0400 + + added bed support to bcftools + +commit 47bcce3d14ec4d205283b61e5e653803996c42e0 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Mar 30 12:56:40 2011 -0400 + + Added BED support to "samtools view" + +commit a812386017faedfc86c0e6562adbb2138329cfeb +Author: Heng Li <lh3@live.co.uk> +Date: Wed Mar 30 12:47:04 2011 -0400 + + support BED file + +commit 3052dddc929f1825e6e7f7f6f6724d9465d6cf9a +Author: Heng Li <lh3@live.co.uk> +Date: Mon Mar 28 15:51:55 2011 -0400 + + relax RG matching; proper mismatching message + +commit f86d60c8fe25785523f01fae1486d2a6df4ee6ef +Author: Heng Li <lh3@live.co.uk> +Date: Sat Mar 26 10:38:23 2011 -0400 + + Avoid reporting association when something unexpected, which I do not understand, happens. 
+ +commit dd41e6b26fd9fe30218748b9a0a1f49bdb1862b9 +Author: Heng Li <lh3@live.co.uk> +Date: Sat Mar 26 10:38:01 2011 -0400 + + Added -1 to merge + +commit 4a0364b0d7f87f1c88d71ec5857a1f1d40710681 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Mar 23 16:56:55 2011 -0400 + + plot pairwise r^2 + +commit 452629a711582e612bec22b3b082e234bd37039b +Author: Heng Li <lh3@live.co.uk> +Date: Wed Mar 23 14:31:01 2011 -0400 + + pairwise LD; case-control AF2 + +commit 52862951adcaecde26ba8f0d9c1897944640a674 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Mar 21 23:03:14 2011 -0400 + + Release samtools-0.1.14 (r933:170) + +commit 59a5a8ba8e2940f0e38238f9339f02c91a8a0ce4 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Mar 21 13:52:55 2011 -0400 + + optionally skip loci with too low sample coverage + +commit 6434264b5c69514d4fafe62cbd30b3bbaddc1d41 +Author: Heng Li <lh3@live.co.uk> +Date: Sat Mar 19 14:38:25 2011 -0400 + + mpileup support Illumina1.3+ quality; skip non-variant sites when "view -v" is in use + +commit 5f59e01987e1d5eca7d6359cae64a9734b18beea +Author: Heng Li <lh3@live.co.uk> +Date: Fri Mar 18 17:19:18 2011 -0400 + + update version to r933:167 + +commit 4d2c3c950910aa3d2c87760c3532e458fe01c0fa +Author: Heng Li <lh3@live.co.uk> +Date: Fri Mar 18 16:25:01 2011 -0400 + + added "-1" to the command-line help + +commit 55313a015a7bd6369cf5a66fed7fab2333201dc9 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Mar 18 16:22:12 2011 -0400 + + added the "cat" command (by Chris Saunders) + +commit b670272cadf3efa4dc456ac4c76104f73477d60d +Author: Heng Li <lh3@live.co.uk> +Date: Fri Mar 18 15:59:46 2011 -0400 + + support varying the compression level + +commit c5dd3c9ca5f75f880e52c8cd2beae983bcb8d3b1 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Mar 16 14:33:45 2011 -0400 + + update the manual pages + +commit 12fb4b596dc51bccd154fc4bd0593442f7937a46 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Mar 16 12:49:26 2011 -0400 + + update changelog + +commit e7fe4fd66e02d60a1ca7952ad1938809e77729a9 
+Author: Heng Li <lh3@live.co.uk> +Date: Wed Mar 16 12:10:05 2011 -0400 + + do not call indels when the depth is very high + +commit 7455eeaa32b949bb3856f75810890aabf7cacb18 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Mar 16 11:56:56 2011 -0400 + + code clean up + +commit 5f16679e54ced8e67a75d949f9175c50480b914e +Author: Heng Li <lh3@live.co.uk> +Date: Tue Mar 15 14:45:24 2011 -0400 + + when -s is specified, change the sample order + +commit 7ba95adee09d3b06a7eaf797d25efef837e592f5 +Author: Heng Li <lh3@live.co.uk> +Date: Tue Mar 15 14:11:42 2011 -0400 + + compute the rank in permutation + +commit d219783cea7643fc7e10e1bd3a98e9b3165b4506 +Author: Heng Li <lh3@live.co.uk> +Date: Sun Mar 13 21:35:13 2011 -0400 + + I have found a SERIOUS BUG!!! + +commit 8e20d04ecdac1a7788eef71c4bb91b8479cf7150 +Author: Heng Li <lh3@live.co.uk> +Date: Sun Mar 13 17:04:04 2011 -0400 + + optionally shuffle samples in a BCF (debugging) + +commit fc7b261f181f2a411427bc9ee5d586c883ca9cdc +Author: Heng Li <lh3@live.co.uk> +Date: Fri Mar 11 09:34:20 2011 -0500 + + fixed a bug + +commit b3bbcc3d40994ae85705ab6fef9866ec8c142201 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Mar 10 20:25:59 2011 -0500 + + use mode instead of mean + +commit f1161262d137098a19143b5cb0de810e5db3243e +Author: Heng Li <lh3@live.co.uk> +Date: Thu Mar 10 20:09:16 2011 -0500 + + start from the mean instead of the mode + +commit 2ba56f5e99e90674855c4ffc8bf583340b932e1e +Author: Heng Li <lh3@live.co.uk> +Date: Thu Mar 10 17:13:34 2011 -0500 + + fixed an error in Chi^2 test + +commit b4ce7ae400290bc43dd287240479667f99b3b11e +Author: Heng Li <lh3@live.co.uk> +Date: Thu Mar 10 00:23:39 2011 -0500 + + minor + +commit 8487fa5d3a73a43443964e731ea2a4c873c9d4e5 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Mar 9 21:33:19 2011 -0500 + + added -F to accept BCFs generated by old samtools + +commit fd51d2093f7fd775a7eaaeea57fa34716ab59ac2 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Mar 9 17:39:09 2011 -0500 + + update version + 
+commit b6da54335df943015a998a934075331b467abb5b +Author: Heng Li <lh3@live.co.uk> +Date: Wed Mar 9 17:37:14 2011 -0500 + + compute pseudo-chi2 probability + +commit 9f73cefdb8935421d872b989dd98fbc8e1295029 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Mar 9 15:54:04 2011 -0500 + + remove a comment which is wrong + +commit b10b1e47ece522e97ab8ef23417bcb6454f8b9db +Author: Heng Li <lh3@live.co.uk> +Date: Wed Mar 9 15:51:12 2011 -0500 + + clean up + +commit 353bfae2c6ff59205bd9223db04084cf7f507f01 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Mar 9 15:45:29 2011 -0500 + + for backup + +commit 53915d1c6410c2537d18bfa8eb8c657a2233c35e +Author: Heng Li <lh3@live.co.uk> +Date: Wed Mar 9 15:27:56 2011 -0500 + + having debugging code + +commit 0d0dbf66995b1511390d593981eae7b5d36fe17b +Author: Heng Li <lh3@live.co.uk> +Date: Wed Mar 9 14:58:23 2011 -0500 + + temporary backup + +commit 5b74a174a8b637dee43b7f30250df6fb96580e12 +Author: Heng Li <lh3@live.co.uk> +Date: Tue Mar 8 15:46:11 2011 -0500 + + the output makes sense, but there may be a typo... + +commit d81ec654b6c0c1eef6b0625d96f14b3155cee7c6 +Author: Heng Li <lh3@live.co.uk> +Date: Tue Mar 8 15:19:09 2011 -0500 + + added contrast2(); fixed a bug in haploid mode + +commit 0cfd896fad5f7737cca49efa94a11892dafcd812 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Mar 7 21:40:17 2011 -0500 + + fixed a bug in haploid genotyping + +commit ccd52155ef61273f2b42ad9c7b31ff1915f81b24 +Author: Heng Li <lh3@live.co.uk> +Date: Sat Mar 5 18:10:35 2011 -0500 + + fixed a few bugs; still not fully working + +commit edc3af753f96f831968ae32f2e0f915b74f31e6e +Author: Heng Li <lh3@live.co.uk> +Date: Fri Mar 4 17:31:33 2011 -0500 + + drop HWE calculation + +commit 92dac194debb66ca0718c21c871822dda2dd5bc1 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Mar 4 17:28:35 2011 -0500 + + implemented hap/dipoind mode; probably BUGGY! 
+ +commit 7f26804bc27937e36fdc967e5c76514653ea40f5 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Mar 4 16:01:27 2011 -0500 + + read ploidy + +commit e7b7213475b5e61a69aab77ffb02b4983c8e7678 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Mar 4 14:12:14 2011 -0500 + + added math notes + +commit 46023e2f21321da83fc8e83e9229757a4e821acb +Author: Heng Li <lh3@live.co.uk> +Date: Fri Mar 4 13:34:10 2011 -0500 + + update BCF spec + +commit 13190c49eeb006ad7013b7f1e9fc1b3beca3ae78 +Author: Heng Li <lh3@live.co.uk> +Date: Tue Mar 1 14:45:19 2011 -0500 + + Release samtools-0.1.13 (r926:134) + +commit be8fabbb6001d9fd5263a70a3e21ed6dfe5a9837 +Author: Heng Li <lh3@live.co.uk> +Date: Tue Mar 1 14:07:15 2011 -0500 + + prepare to finalize 0.1.13 + +commit 1e8c753660978bed7e9289fe50becd596d9314bb +Author: Heng Li <lh3@live.co.uk> +Date: Tue Mar 1 09:40:17 2011 -0500 + + allow to change whether to drop ambiguous reads + +commit 412210bfdb46606023f2e4b9086f2787f0cf1c62 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 28 22:01:29 2011 -0500 + + revert to the old behavior of phase + +commit 46035589518cf84738de8666b866e2619457c1fb +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 28 16:46:23 2011 -0500 + + change version number + +commit 7f40c33e37fc16fcb0a375ce46ae1d09cafb6d50 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 28 16:37:42 2011 -0500 + + bugfix in indel calling: interger overflow + +commit 75849470efbe30042e5ddd516f9bcbe3b9bf6062 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 28 15:35:47 2011 -0500 + + fixed a typo + +commit 9e6fb569885f906fabaab7fc2f02eae82f4bd602 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 28 15:34:09 2011 -0500 + + minor changes to heuristic rules + +commit 30a799a91f5e2c10b761aa5437f902c6649fceb3 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 28 15:20:26 2011 -0500 + + fixed a bug in the latest change + +commit e21ba9df950ea37f5c1b35c2af9ba9a4e0bba02a +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 28 12:47:06 2011 -0500 + + put version in 
bam.h + +commit 918b14780c1dceb39c7010638ecd61c626e17166 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 28 12:00:38 2011 -0500 + + frag_t::phased==0 reads are dumped to chimera.bam + +commit 657293c7bdba3ac69f53cd1ffa2874ed8756475e +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 28 11:05:29 2011 -0500 + + change default -q to 37 (previously 40) + +commit 33d8d3bea76e466798ea322d68d34deb8d2dff06 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 28 10:39:57 2011 -0500 + + fixed a minor bug in BAM reading + +commit daa25d426d42465d76c7317c95772bbb36bb3f47 +Author: Heng Li <lh3@live.co.uk> +Date: Sat Feb 26 21:07:24 2011 -0500 + + suppress gzopen64() warning + +commit 9cec4256eb9e7848d4711adb67b540659c141e32 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Feb 25 22:14:52 2011 -0500 + + fixed a long existing bug in vcf2fq + +commit 304487c83067a733add71cbc3886fa8c49f7ef2a +Author: Heng Li <lh3@live.co.uk> +Date: Fri Feb 25 16:37:40 2011 -0500 + + change version number + +commit 10ba6bf4f16692760f696f7b17f3719065786f77 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Feb 25 16:34:08 2011 -0500 + + Change the order of PL; change SP to int32_t + +commit c5cc2a8036a9c3579fbfde651efec4f6763b0228 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Feb 25 14:52:03 2011 -0500 + + claim X defined in the header + +commit 4ee8cb29f6092fd14a89f0cc5d3575112a204f39 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Feb 25 14:40:24 2011 -0500 + + minor changes + +commit 00065e9336a2831dc53bee7da2f4719845be1a2a +Author: Heng Li <lh3@live.co.uk> +Date: Fri Feb 25 11:39:06 2011 -0500 + + fixed an error in the BCF spec + +commit 1e2a73afcb72a02aa448718cb017c0438de89f90 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Feb 25 11:36:40 2011 -0500 + + update BCF spec + +commit dbf8eedaa38a405cb2fba5b3952b85776f51d035 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Feb 25 11:28:43 2011 -0500 + + update BCF spec + +commit eed1d91af9fad3c9d965333a55e623757f9c4e9d +Author: Heng Li <lh3@live.co.uk> +Date: Fri Feb 25 
09:51:39 2011 -0500 + + fixed a flaw in targetcut + +commit 59bc980bb832b92a8b0cc244cf106e6150e4db6f +Author: Heng Li <lh3@live.co.uk> +Date: Fri Feb 25 00:54:35 2011 -0500 + + update manual page + +commit fcc4738c4abdca79e3de159e21208df1b98ac76c +Author: Heng Li <lh3@live.co.uk> +Date: Fri Feb 25 00:45:39 2011 -0500 + + update version format + +commit 5748639ae542b7f6b853562edc2bb3faf43030e4 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Feb 25 00:45:12 2011 -0500 + + update version number + +commit 06b44cc366cf27ce8976ee6a05810a0b3c48b56d +Author: Heng Li <lh3@live.co.uk> +Date: Fri Feb 25 00:44:21 2011 -0500 + + update version number + +commit ab7f4529d12739ff66fd4c09af9d992ab59c53ef +Author: Heng Li <lh3@live.co.uk> +Date: Fri Feb 25 00:42:55 2011 -0500 + + various help message + +commit a092e1f6f963272f8bb23616986ddaf604fd0f82 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Feb 24 23:43:13 2011 -0500 + + disable unfinished functionality + +commit f00a78db72b14ee4c6689fc13f20ed31aeaecd40 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Feb 24 10:04:56 2011 -0500 + + added "const" to bcf_p1_cal() + +commit 91049c4a8db3bf50dcc9d07506f22fa4ca5b5a96 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Feb 23 11:53:47 2011 -0500 + + randomly allocate unphased reads + +commit f4405354a8d4cb3441141fa734573031059d7f57 +Author: Heng Li <lh3@live.co.uk> +Date: Tue Feb 22 15:36:07 2011 -0500 + + fixed a typo + +commit 3075e4dc5c7c9d954426aabda6a73fa788357100 +Author: Heng Li <lh3@live.co.uk> +Date: Tue Feb 22 15:33:40 2011 -0500 + + make output more informative + +commit 628cf3235e2815a40acf089fb1d3357be6437787 +Author: Heng Li <lh3@live.co.uk> +Date: Tue Feb 22 14:50:06 2011 -0500 + + change the scoring rule; change default k to 13 + +commit f22fd99831e4b5c74f898719216f359dbe987bbf +Author: Heng Li <lh3@live.co.uk> +Date: Tue Feb 22 14:45:15 2011 -0500 + + update scoring in masking + +commit 2f23547b81984555032aa0eefd064b8e07986fdc +Author: Heng Li <lh3@live.co.uk> +Date: Tue Feb 22 
14:37:17 2011 -0500 + + remove dropreg() + +commit 4d8b6b1f1f331ca9041983c66e34a857c3b8f1bb +Author: Heng Li <lh3@live.co.uk> +Date: Tue Feb 22 13:10:16 2011 -0500 + + accept files from stdin + +commit 9b50c5038e6fc0185e29ca5b50fe0806a9a939b9 +Author: Heng Li <lh3@live.co.uk> +Date: Tue Feb 22 11:16:57 2011 -0500 + + fixed a bug in consensus generation + +commit 1332ab32fb788fdc81b2ba8653b905d106238fad +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 21 22:53:23 2011 -0500 + + print dropped fragments + +commit a288761b4ca1584e51076a71cbc4d72fe923dda1 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 21 22:37:04 2011 -0500 + + bugfix: singletons are not phased + +commit 683365f534c0223dea7d72532015ac16a45ba22b +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 21 17:27:10 2011 -0500 + + output singleton blocks + +commit 841a4609084d81f1bc81e0b00dd806002461e7d9 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 21 15:58:55 2011 -0500 + + fixed a bug; not working with -l right now + +commit fdd57ea31732b5516dc212d72174b60206952636 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 21 15:17:00 2011 -0500 + + skip mapQ==0 reads + +commit 4eb6ba75c23c1c9be5f76814fa1b93a2e304b2af +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 21 14:03:03 2011 -0500 + + print the "targetcut" command + +commit 0123d9559ba58b026c0dfd15bc26019a193cd21a +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 21 11:22:13 2011 -0500 + + allow to set the maximum depth + +commit 0f92eb248a4d06645b2c3d736a0faea8a7a9f731 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 21 09:56:41 2011 -0500 + + use a proper error model to call hets + +commit 587a01504af5aea6288740d121dccf48fb8a75f4 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 21 09:16:38 2011 -0500 + + phase is UNFINISHED; strip RG when merging + +commit 723bf3cd79e4f4a558373d4c707fa6b3db0fb357 +Author: Heng Li <lh3@live.co.uk> +Date: Sat Feb 19 23:38:11 2011 -0500 + + use a proper model to compute consensus + +commit 
891a6b02d4a9af2ed98fbaac4915bf1f0da4f6c8 +Author: Heng Li <lh3@live.co.uk> +Date: Sat Feb 19 22:14:19 2011 -0500 + + added comment + +commit 8b55e0a581ecc9e4ba754d1f3c8784f3038b6e48 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Feb 18 17:23:39 2011 -0500 + + change the output format + +commit 75c36e8c563eddd0a362ba3b38cf0aea21aafb1f +Author: Heng Li <lh3@live.co.uk> +Date: Tue Feb 15 20:31:00 2011 -0500 + + fixed a bug in writing BAM + +commit bb0ce52f066cfebaa35a125d57b353bb717a5165 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 14 23:39:09 2011 -0500 + + skip uncovered; unknown alleles taken as X + +commit ba67f4d119c7d06907db3015d337d9a01a3fc9fe +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 14 23:21:19 2011 -0500 + + fixed a bug + +commit e4448d49e6129a5e1ee9c7f04f43612f12d6aad6 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 14 22:43:09 2011 -0500 + + prepare to read hets from a list; unfinished + +commit 129ea29c1f12177c0a7c3e21676f6210370fc59b +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 14 16:32:22 2011 -0500 + + updated khash.h to 0.2.5 + +commit 15b44ed93bd949dffcf79ac8dbea6d9b7dfcb58c +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 14 16:15:04 2011 -0500 + + use the latest version of khash + +commit 486c05f06f44d981dfb2069bcb43e4b35fd8389c +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 14 15:04:40 2011 -0500 + + change the default -k to 11 + +commit 07cf9d1e443d73cf053de38dd01671e3781f6e29 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 14 14:50:51 2011 -0500 + + sort fragments by vpos instead of by beg + +commit d0d3e7faabf5cbb7e5ff7b294f7e220da807c4c0 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 14 14:45:41 2011 -0500 + + shuffling the two haplotypes for better randomness + +commit 3be28eaf5f6033229aedf12ddb11a0084ba01cd8 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 14 14:09:17 2011 -0500 + + write chimeras to a separate BAM + +commit 80ccbc26f43918fe42be123cc1da9d3d7ce30816 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 14 
13:54:13 2011 -0500 + + no mem leak/violation on small files; correctness is not checked + +commit 5c923867432fa14c26a19e3782e7f48d4080f6ac +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 14 13:50:25 2011 -0500 + + bam separation; at least not immediate segfault + +commit cea2643ec30a59735bf89b2f562b563bf7263e79 +Author: Heng Li <lh3@live.co.uk> +Date: Sun Feb 13 23:24:11 2011 -0500 + + on the way to implement BAM separation; unfinished + +commit 964269cd15036a470ca89e43d0952201a0825671 +Author: Heng Li <lh3@live.co.uk> +Date: Sun Feb 13 18:07:56 2011 -0500 + + keep singletons in the hash table + +commit 2d4aa649bd670d5e038a1acaefd33c5fe24ae0e8 +Author: Heng Li <lh3@live.co.uk> +Date: Sun Feb 13 17:42:24 2011 -0500 + + Revert "prepare to add bam separation" + + This reverts commit ed6957e5211c2c4cf684dcb8bbb661052c74df6f. + +commit ed6957e5211c2c4cf684dcb8bbb661052c74df6f +Author: Heng Li <lh3@live.co.uk> +Date: Sun Feb 13 00:24:28 2011 -0500 + + prepare to add bam separation + +commit d211e652d93791d2e112d334added243ffe5fc3e +Author: Heng Li <lh3@live.co.uk> +Date: Sat Feb 12 18:50:20 2011 -0500 + + accelerate kstrtok + +commit 2d6af49d331ff5afe7b9e9b102e79d7d4512fdbe +Author: Heng Li <lh3@live.co.uk> +Date: Fri Feb 11 21:08:21 2011 -0500 + + split unlinked blocks + +commit 68e4cd1b560b0a6fd4c77e5e51eadde9fda26ea4 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Feb 11 10:47:58 2011 -0500 + + remove heading and tailing ambiguous positions + +commit d2b685141426a902ae76660c1fbe8020da150cf8 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Feb 11 10:02:21 2011 -0500 + + code clean up for further features + +commit c6980e062d55928b59f287c03e599dd5a37ed509 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Feb 11 08:00:08 2011 -0500 + + change /64 to >>6; the latter is faster + +commit 91635b9c2687f24d72ee6a8aad2050a79bb8400f +Merge: 41d4df2 9a7e155 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Feb 11 01:22:55 2011 -0500 + + Merge branch 'master' into devel + +commit 
9a7e155cc591c1b6c9f7f9cb939364a6becb65b2 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Feb 11 01:21:07 2011 -0500 + + output an unrecognized field as '.'; autofix GL/PL + +commit 41d4df2e9545e9abe97151cfe5d6c763f3d00db1 +Merge: c00c41c aacce0c +Author: Heng Li <lh3@live.co.uk> +Date: Thu Feb 10 23:00:14 2011 -0500 + + Merge branch 'master' into devel + +commit aacce0ce7276f451e4fddf81832f9e5f7f65198b +Author: Heng Li <lh3@live.co.uk> +Date: Thu Feb 10 22:57:53 2011 -0500 + + finished VCF->BCF conversion + +commit 0e875df643e41d848b709e2fa877de8ae53cdd4c +Author: Heng Li <lh3@live.co.uk> +Date: Thu Feb 10 21:57:28 2011 -0500 + + fixed a bug in reading VCF files + +commit c00c41c2a5da69cccea64adb542a0b365e56b4fc +Author: Heng Li <lh3@live.co.uk> +Date: Thu Feb 10 16:28:37 2011 -0500 + + suppres one-allele blocks + +commit 2e2354b673722e2f00d72970a043f80a66270da1 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Feb 10 16:06:56 2011 -0500 + + fixed the bug in filtering + +commit d971e1fe24de4ecaf94055efffc5f641e2bdb563 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Feb 10 12:24:23 2011 -0500 + + prepare to add filtering; buggy right now + +commit a0a5a3fbf504c3b02f7b9212e72315c1047cc249 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Feb 10 11:55:02 2011 -0500 + + make masking optional + +commit 28db71ccd95054a5f8a47c2332794f8968f6a822 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Feb 10 11:40:47 2011 -0500 + + routine to mask poorly called regions + +commit a3f6c439262bc10a4067860440f4d4dde9e0c515 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Feb 9 17:18:33 2011 -0500 + + code clean up: remove globals + +commit 0b711978492f6ad39d459d78723c299468906818 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Feb 9 16:52:54 2011 -0500 + + output more information + +commit f69d217ae5b691bf42ad07a97f29a7cc6456046f +Author: Heng Li <lh3@live.co.uk> +Date: Wed Feb 9 16:11:54 2011 -0500 + + fixed another bug in flipping + +commit d47882d549337fbcc251597508a2c7faf1bb92e2 +Author: Heng Li 
<lh3@live.co.uk> +Date: Wed Feb 9 16:01:35 2011 -0500 + + fixed a stupid bug in flipping + +commit e33f89de499496537f5fbde396a66557f0353f1b +Author: Heng Li <lh3@live.co.uk> +Date: Wed Feb 9 15:54:42 2011 -0500 + + fix chimeras; a little weird... + +commit 03d3c1d0b945245108ce0942d4772536a32212c7 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Feb 9 13:27:35 2011 -0500 + + no effective change; prepare to fix chimera + +commit 6bc0a4676dd2252085a6e67bb06daa5ae05a554f +Author: Heng Li <lh3@live.co.uk> +Date: Wed Feb 9 11:52:58 2011 -0500 + + better count output + +commit dcac515439d25f71125d6de8111da417776ab9ce +Author: Heng Li <lh3@live.co.uk> +Date: Wed Feb 9 10:31:07 2011 -0500 + + prepare for another way of filtering + +commit ca7e4f1899b86d2e077994c789e8f69d699b3cd9 +Author: Heng Li <lh3@live.co.uk> +Date: Tue Feb 8 16:10:08 2011 -0500 + + fixed the bug; I can do better. + +commit 0733f77b98af121bdcb198cea6151d159831bb9c +Author: Heng Li <lh3@live.co.uk> +Date: Tue Feb 8 15:55:38 2011 -0500 + + fixed two bugs; still not working... 
+ +commit 80f18cba9ba73c9592380fc1ecd53c351d294782 +Author: Heng Li <lh3@live.co.uk> +Date: Tue Feb 8 15:42:58 2011 -0500 + + filter false SNPs; NOT working right now + +commit 69a66e2f96d5b102cd712ff1527a3802fa84c590 +Author: Heng Li <lh3@live.co.uk> +Date: Tue Feb 8 14:39:09 2011 -0500 + + write sequence in the SAM format for debugging + +commit b6f1c9d160822af2b713be206f37bd6dde00546a +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 7 11:51:21 2011 -0500 + + fixed two bugs + +commit 400aa5c06100af9c47cd5e4ce8b95b7deb84f54b +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 7 11:22:38 2011 -0500 + + Optionally apply BAQ + +commit 4c82e0e19682e424f5cdb8381364114c307b329e +Author: Heng Li <lh3@live.co.uk> +Date: Mon Feb 7 01:23:31 2011 -0500 + + improved output; the result makes sense at a glance + +commit dc7853a581ab24bcc496e96b123ccf637e32ed1d +Author: Heng Li <lh3@live.co.uk> +Date: Sun Feb 6 14:12:43 2011 -0500 + + process per linked block instead of per chr + +commit e867d9c6c2e61d9e748e78163e5481dca5697a36 +Author: Heng Li <lh3@live.co.uk> +Date: Sun Feb 6 00:45:46 2011 -0500 + + DP seems to work on toy examples + +commit 445ad72fc43d4354d56f5f759790e8ae0be73d02 +Author: Heng Li <lh3@live.co.uk> +Date: Sat Feb 5 01:24:42 2011 -0500 + + implemented backtrack; not tested + +commit ba38e180b9cd545956583b22e97e09b4bb12073e +Author: Heng Li <lh3@live.co.uk> +Date: Fri Feb 4 23:55:23 2011 -0500 + + More "correct" DP; backtrack not implemented + +commit d69761fd9351273ccd37ea431b10509add91e7cf +Author: Heng Li <lh3@live.co.uk> +Date: Fri Feb 4 17:22:31 2011 -0500 + + scratch of dynamic programming; unfinished... + +commit 769ffcb44e26e59300791658801d321559b33858 +Author: Heng Li <lh3@live.co.uk> +Date: Fri Feb 4 16:29:55 2011 -0500 + + UNFINISHED commit. 
+ +commit 9adab9591317c3467f3d8cdf2d19ec1f65d1b5b7 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Feb 3 16:20:59 2011 -0500 + + another way of counting; can be even faster + +commit bbafbdc01ed1ceaab44927def1ad47c4c78aeb9c +Author: Heng Li <lh3@live.co.uk> +Date: Thu Feb 3 14:48:20 2011 -0500 + + for backup + +commit eba7446389cad62a19133bced1386a4334dcab79 +Merge: a44a98e f01a593 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Feb 2 14:06:07 2011 -0500 + + Merge branch 'master' into devel + +commit f01a5930445b5fda7e6b5b813ed63c652160ada2 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Feb 2 11:31:54 2011 -0500 + + Better truncation warning when EOF is absent + +commit dd3ee5ed26c8bbef4a62fa5b2bfb0a75833f2c31 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Feb 2 10:38:28 2011 -0500 + + fixed a typo in BCF/VCF headers + +commit b9d1137c55f401387113d1ad8a387489afe741db +Author: Heng Li <lh3@live.co.uk> +Date: Wed Feb 2 09:13:44 2011 -0500 + + fixed an out-of-boundary bug (fixed by Roel Kluin) + +commit a44a98e16559b9672e8a3492c8f8c640074b7ee2 +Merge: ef68a14 d0443d5 +Author: Heng Li <lh3@live.co.uk> +Date: Tue Feb 1 21:54:48 2011 -0500 + + Merge branch 'master' into devel + +commit d0443d5c2f648e0f69bd4c56eaac7868e501c18b +Author: Heng Li <lh3@live.co.uk> +Date: Tue Feb 1 17:31:52 2011 -0500 + + improved sorting order checking + +commit ef68a14fab91399b2ecd38345936c3d6e7391cf3 +Merge: 1e597b3 1a39a2e +Author: Heng Li <lh3@live.co.uk> +Date: Tue Feb 1 15:12:37 2011 -0500 + + Merge branch 'master' into devel + +commit 1a39a2eb08a270e20a34a0983e8bed6ffb3e2008 +Author: Heng Li <lh3@live.co.uk> +Date: Tue Feb 1 15:12:14 2011 -0500 + + more precise error message + +commit e028e7a47c02232e06a9dd3009262c00dede1060 +Author: Heng Li <lh3@live.co.uk> +Date: Tue Feb 1 14:48:01 2011 -0500 + + improved sorting order validation in index + +commit 1e597b3356744e2b791b12c9187f91c8054511d5 +Author: Heng Li <lh3@live.co.uk> +Date: Tue Feb 1 14:44:27 2011 -0500 + + testing only; not working + 
+commit 5753ace1e54228822d8ee95f69943f586e42f6e8 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Jan 31 17:37:08 2011 -0500 + + reduce the effect of seq errors at the cost of SN + +commit 6f239ce5e0abd47babee33174476d48b723260d8 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Jan 31 17:29:34 2011 -0500 + + added testing code + +commit 3db42fe22d27d61ab5735cd2308f73d93def8ebe +Author: Heng Li <lh3@live.co.uk> +Date: Mon Jan 31 14:33:21 2011 -0500 + + routine for phasing fosmid resequencing (incomplete) + +commit ed88f2797323229ae8f38fbcd107b231007956a8 +Author: Heng Li <lh3@live.co.uk> +Date: Mon Jan 31 10:12:53 2011 -0500 + + SAM output + +commit abc6acae28dc4794f6422255f077cf370d34e414 +Merge: f1985a9 b133dbf +Author: Heng Li <lh3@live.co.uk> +Date: Sat Jan 29 22:56:10 2011 -0500 + + Merge branch 'master' into devel + +commit b133dbf82de4e8cea5eb56e5bbf0c4b3e9368fd5 +Author: Heng Li <lh3@live.co.uk> +Date: Sat Jan 29 22:37:11 2011 -0500 + + fixed a bug in tview on big-endian by Nathan Weeks + +commit 9d3fdaef29f91e21dbfcb9ff0165b9573e7c1042 +Author: Heng Li <lh3@live.co.uk> +Date: Sat Jan 29 22:24:00 2011 -0500 + + update INSTALL + +commit 9d074a38bde53961f96157b6fb3683b6dded38d7 +Author: Heng Li <lh3@live.co.uk> +Date: Sat Jan 29 21:56:25 2011 -0500 + + avoid a segfault when network connect fails + +commit f1985a93f7455b3ea1b0ef9b959d50b896ccd620 +Author: Heng Li <lh3@live.co.uk> +Date: Sat Jan 29 21:53:18 2011 -0500 + + fixed a bug about bit ordering + +commit d09797db6fef648a6823cbe718d67664660c6ebe +Author: Heng Li <lh3@live.co.uk> +Date: Thu Jan 27 16:53:19 2011 -0500 + + point out there are 4 or fewer free parameters + +commit 5fd1717650ed68ab6c55d094d1648c16a054891a +Author: Heng Li <lh3@live.co.uk> +Date: Thu Jan 27 16:09:18 2011 -0500 + + updated .gitignore + +commit fccb19fbe8f9de91f59d85bb49a248683dc6266c +Author: Heng Li <lh3@live.co.uk> +Date: Thu Jan 27 16:08:14 2011 -0500 + + fixed a bug; better scoring + +commit b4dcb844bde3d09eedcd9f6832186ece60ae5afd 
+Merge: ffc3e89 6f502de +Author: Heng Li <lh3@live.co.uk> +Date: Thu Jan 27 14:50:30 2011 -0500 + + Merge branch 'master' into devel + +commit 6f502dec46b18dae4bb5b2319715d028b5e193d0 +Author: Heng Li <lh3@live.co.uk> +Date: Thu Jan 27 14:47:31 2011 -0500 + + skip unmapped and ref-skip reads in indel calling + +commit 3639f37dd8257b24560c35effcc3b6c16c3c1bcb +Author: Heng Li <lh3@live.co.uk> +Date: Thu Jan 27 14:19:15 2011 -0500 + + fixed an out-of-boundary bug in rare cases + +commit ffc3e89678ab9052b84f403da1e43044b045e73f +Author: Heng Li <lh3@live.co.uk> +Date: Thu Jan 27 14:00:17 2011 -0500 + + targetcut can be compiled, though probably buggy + +commit f452b3ac51306865ddde31a8d715b155d4d3e6e6 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Jan 26 18:58:43 2011 -0500 + + this is for a very special application... + +commit ca1451c6406c7ee757cb31349ea0b8de70db0656 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Jan 26 18:48:09 2011 -0500 + + fixed compiling errors + +commit 085b87a7642865f17239fb6a436e626e25417838 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Jan 26 18:45:09 2011 -0500 + + This script was put in a wrong place... 
+ +commit 090d360828622520de60385af4928ce1aebe0e48 +Author: Heng Li <lh3@live.co.uk> +Date: Wed Jan 26 18:33:58 2011 -0500 + + Imported from samtools-r902 +------------------------------------------------------------------------ +r108 | lh3lh3 | 2009-01-20 11:56:45 +0000 (Tue, 20 Jan 2009) | 2 lines +Changed paths: + M /branches/dev/samtools/examples/Makefile + +made it a little more convenient + +------------------------------------------------------------------------ +r107 | lh3lh3 | 2009-01-20 11:53:30 +0000 (Tue, 20 Jan 2009) | 2 lines +Changed paths: + A /branches/dev/samtools/examples/Makefile + +added a Makefile + +------------------------------------------------------------------------ +r106 | lh3lh3 | 2009-01-20 11:25:05 +0000 (Tue, 20 Jan 2009) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/maq2sam.c + +support RG tag + +------------------------------------------------------------------------ +r105 | lh3lh3 | 2009-01-18 17:37:20 +0000 (Sun, 18 Jan 2009) | 2 lines +Changed paths: + M /branches/dev/samtools/ChangeLog + +update changelog + +------------------------------------------------------------------------ +r104 | lh3lh3 | 2009-01-18 17:31:21 +0000 (Sun, 18 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_lpileup.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-18 + * fixed a bug in bam_lpileup.c: segment start and end are not correctly recognized + +------------------------------------------------------------------------ +r103 | lh3lh3 | 2009-01-18 16:34:03 +0000 (Sun, 18 Jan 2009) | 5 lines +Changed paths: + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-17 + * fixed a bug when there are reads without coordinates + * also recognize type 'c' as 'A' + * found a bug in bam_lpileup.c; NOT fixed yet + +------------------------------------------------------------------------ +r102 | lh3lh3 | 2009-01-17 
19:46:49 +0000 (Sat, 17 Jan 2009) | 2 lines +Changed paths: + A /branches/dev/samtools/INSTALL + +Instruction for compilation + +------------------------------------------------------------------------ +r101 | lh3lh3 | 2009-01-17 19:31:36 +0000 (Sat, 17 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/Makefile + A /branches/dev/samtools/Makefile.lite + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/faidx.c + M /branches/dev/samtools/misc/Makefile + M /branches/dev/samtools/razf.c + + * replaced HAVE_RAZF with _NO_RAZF + * added Makefile.lite for people who have trouble with razf.c + +------------------------------------------------------------------------ +r100 | lh3lh3 | 2009-01-16 10:03:37 +0000 (Fri, 16 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_mate.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/misc/wgsim.c + + * samtools-0.1.1-15 + * fixed another bug in fixmate: unmapped pair has non-zero isize + +------------------------------------------------------------------------ +r99 | lh3lh3 | 2009-01-16 09:13:36 +0000 (Fri, 16 Jan 2009) | 4 lines +Changed paths: + M /branches/dev/samtools/ChangeLog + M /branches/dev/samtools/bam_mate.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-14 + * fixed a bug in fixmate: isize not equal to zero if two ends mapped to + different chr + +------------------------------------------------------------------------ +r98 | lh3lh3 | 2009-01-15 16:47:41 +0000 (Thu, 15 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-13 + * fixed the prior for hom indels (Richard pointed this out) + +------------------------------------------------------------------------ +r97 | lh3lh3 | 2009-01-15 16:38:47 +0000 (Thu, 15 Jan 2009) | 4 lines +Changed paths: + M /branches/dev/samtools/COPYING + M /branches/dev/samtools/bam_sort.c + M /branches/dev/samtools/bamtk.c + M 
/branches/dev/samtools/source.dot + + * samtools-0.1.1-12 + * fixed a bug in sort + * update source file graph and copyright information + +------------------------------------------------------------------------ +r96 | lh3lh3 | 2009-01-14 21:46:14 +0000 (Wed, 14 Jan 2009) | 2 lines +Changed paths: + M /branches/dev/samtools/glf.c + +fixed a typo + +------------------------------------------------------------------------ +r95 | lh3lh3 | 2009-01-14 21:44:53 +0000 (Wed, 14 Jan 2009) | 2 lines +Changed paths: + M /branches/dev/samtools/glf.c + +added a main function for glf.c + +------------------------------------------------------------------------ +r94 | lh3lh3 | 2009-01-14 17:14:59 +0000 (Wed, 14 Jan 2009) | 4 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/bgzf.h + A /branches/dev/samtools/glf.c + M /branches/dev/samtools/glf.h + + * samtools-0.1.1-11 + * generate binary GLFv2 + * added glfview command to dump GLFv2 binary file + +------------------------------------------------------------------------ +r93 | lh3lh3 | 2009-01-14 15:07:44 +0000 (Wed, 14 Jan 2009) | 4 lines +Changed paths: + M /branches/dev/samtools/bam_rmdup.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/glf.h + + * samtools-0.1.1-10 + * fixed several bugs in rmdup + * prepare to generate GLF2 + +------------------------------------------------------------------------ +r92 | lh3lh3 | 2009-01-14 13:27:44 +0000 (Wed, 14 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_import.c + A /branches/dev/samtools/bam_rmdup.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-9 + * implemented rmdup; NOT tested yet + +------------------------------------------------------------------------ +r91 | lh3lh3 | 2009-01-13 20:15:43 +0000 (Tue, 13 Jan 2009) | 2 lines +Changed paths: + M 
/branches/dev/samtools/examples/00README.txt + +update README for typos + +------------------------------------------------------------------------ +r90 | lh3lh3 | 2009-01-13 19:57:50 +0000 (Tue, 13 Jan 2009) | 2 lines +Changed paths: + M /branches/dev/samtools/examples/ex1.sam.gz + +update example + +------------------------------------------------------------------------ +r89 | lh3lh3 | 2009-01-13 17:21:38 +0000 (Tue, 13 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam.c + A /branches/dev/samtools/bam_mate.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-8 + * added fixmate command + +------------------------------------------------------------------------ +r88 | lh3lh3 | 2009-01-13 10:48:23 +0000 (Tue, 13 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-7 + * change the reported indel position to the previous way + +------------------------------------------------------------------------ +r87 | lh3lh3 | 2009-01-12 22:12:12 +0000 (Mon, 12 Jan 2009) | 4 lines +Changed paths: + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-6 + * addd glt output + * allow to change indel calling parameters at the command line + +------------------------------------------------------------------------ +r86 | lh3lh3 | 2009-01-12 21:16:48 +0000 (Mon, 12 Jan 2009) | 4 lines +Changed paths: + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_pileup.c + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-5 + * added two more flags + * allowed to select reads shown in pileup with a mask + +------------------------------------------------------------------------ +r85 | lh3lh3 | 2009-01-12 20:47:51 +0000 (Mon, 12 Jan 2009) | 4 lines +Changed paths: + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bam_plcmd.c + M 
/branches/dev/samtools/bamtk.c + + * samtools-0.1.1-4 + * fixed a bug in indexing (linear index) + * prepare to add glt output from pileup + +------------------------------------------------------------------------ +r84 | lh3lh3 | 2009-01-12 09:22:35 +0000 (Mon, 12 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-3 + * fixed a bug in outputing the coordinate of an indel + +------------------------------------------------------------------------ +r83 | lh3lh3 | 2009-01-11 15:18:01 +0000 (Sun, 11 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-2 + * pileup: allows to output indel sites only + +------------------------------------------------------------------------ +r82 | lh3lh3 | 2009-01-10 23:34:31 +0000 (Sat, 10 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bam_maqcns.h + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-1 + * implemented a Bayesian indel caller + +------------------------------------------------------------------------ +r81 | lh3lh3 | 2009-01-09 09:54:28 +0000 (Fri, 09 Jan 2009) | 2 lines +Changed paths: + M /branches/dev/samtools/examples/00README.txt + D /branches/dev/samtools/examples/ex1.fa.fai + +Let users generate ex1.fa.fai. 
+ +------------------------------------------------------------------------ +r80 | lh3lh3 | 2009-01-08 16:10:08 +0000 (Thu, 08 Jan 2009) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/bowtie2sam.pl + +make the bowtie converter works for "-k 2" + +------------------------------------------------------------------------ +r78 | lh3lh3 | 2009-01-03 17:25:24 +0000 (Sat, 03 Jan 2009) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/export2sam.pl + +fixed a bug for "QC" reads + +------------------------------------------------------------------------ +r77 | lh3lh3 | 2009-01-01 18:32:06 +0000 (Thu, 01 Jan 2009) | 3 lines +Changed paths: + A /branches/dev/samtools/misc/bowtie2sam.pl + M /branches/dev/samtools/misc/soap2sam.pl + + * soap2sam.pl: added NM tag + * bowtie2sam.pl: converter for bowtie + +------------------------------------------------------------------------ +r76 | lh3lh3 | 2008-12-31 23:24:24 +0000 (Wed, 31 Dec 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/misc/soap2sam.pl + +soap2sam.pl: convert soap output to SAM + +------------------------------------------------------------------------ +r75 | lh3lh3 | 2008-12-31 17:54:32 +0000 (Wed, 31 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/misc/wgsim_eval.pl + + * wgsim_eval.pl-0.1.1 + * fixed a bug for a contig name like "NT_012345" + +------------------------------------------------------------------------ +r74 | lh3lh3 | 2008-12-31 16:38:21 +0000 (Wed, 31 Dec 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/misc/wgsim_eval.pl + + * evaluate alignment for reads generated by wgsim + +------------------------------------------------------------------------ +r73 | lh3lh3 | 2008-12-31 15:11:22 +0000 (Wed, 31 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/Makefile + M /branches/dev/samtools/misc/wgsim.c + +fixed compiling warnings for wgsim + +------------------------------------------------------------------------ +r72 | lh3lh3 | 
2008-12-31 13:40:51 +0000 (Wed, 31 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/bam_tview.c + +remove an unused variable (a compiler warning only) + +------------------------------------------------------------------------ +r71 | lh3lh3 | 2008-12-31 13:37:16 +0000 (Wed, 31 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/Makefile + A /branches/dev/samtools/misc/wgsim.c + +wgsim: Paired-end reads simulator + +------------------------------------------------------------------------ +r70 | bhandsaker | 2008-12-29 20:27:16 +0000 (Mon, 29 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bam_tview.c + +Move definition of bam_nt16_nt4_table so we can build without curses. + +------------------------------------------------------------------------ +r62 | lh3lh3 | 2008-12-22 15:55:13 +0000 (Mon, 22 Dec 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/NEWS + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/samtools.1 + +Release samtools-0.1.1 + +------------------------------------------------------------------------ +r61 | lh3lh3 | 2008-12-22 15:46:08 +0000 (Mon, 22 Dec 2008) | 10 lines +Changed paths: + M /branches/dev/samtools/bam_aux.c + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bam_tview.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/razf.c + M /branches/dev/samtools/samtools.1 + + * samtools-0.1.0-66 + * fixed a bug in razf.c: reset z_eof when razf_seek() is called + * fixed a memory leak in parsing a region + * changed pileup a little bit when -s is in use: output ^ and $ + * when a bam is not indexed, output more meaningful error message + * fixed a bug in indexing for small alignment + * fixed a bug in the viewer when we come to the end of a reference file + * updated documentation + * prepare to release 0.1.1 + 
+------------------------------------------------------------------------ +r60 | lh3lh3 | 2008-12-22 15:10:16 +0000 (Mon, 22 Dec 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/examples + A /branches/dev/samtools/examples/00README.txt + A /branches/dev/samtools/examples/ex1.fa + A /branches/dev/samtools/examples/ex1.fa.fai + A /branches/dev/samtools/examples/ex1.sam.gz + +example + +------------------------------------------------------------------------ +r59 | lh3lh3 | 2008-12-22 09:38:15 +0000 (Mon, 22 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/ChangeLog + +update ChangeLog + +------------------------------------------------------------------------ +r58 | lh3lh3 | 2008-12-20 23:06:00 +0000 (Sat, 20 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/misc/export2sam.pl + + * added comments + * fixed several bugs + +------------------------------------------------------------------------ +r57 | lh3lh3 | 2008-12-20 15:44:20 +0000 (Sat, 20 Dec 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/misc/export2sam.pl + +convert Export format to SAM; not thoroughly tested + +------------------------------------------------------------------------ +r56 | lh3lh3 | 2008-12-19 22:13:28 +0000 (Fri, 19 Dec 2008) | 6 lines +Changed paths: + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bam_tview.c + M /branches/dev/samtools/bamtk.c + A /branches/dev/samtools/source.dot + + * samtools-0.1.0-65 + * pileup: generate maq-like simple output + * pileup: allow to output pileup at required sites + * source.dot: source file relationship graph + * tview: fixed a minor bug + +------------------------------------------------------------------------ +r55 | lh3lh3 | 2008-12-19 20:10:26 +0000 (Fri, 19 Dec 2008) | 2 lines +Changed paths: + D /branches/dev/samtools/misc/all2sam.pl + +remove all2sam.pl + +------------------------------------------------------------------------ +r54 | lh3lh3 
| 2008-12-16 22:34:25 +0000 (Tue, 16 Dec 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/COPYING + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/faidx.h + M /branches/dev/samtools/khash.h + M /branches/dev/samtools/kseq.h + M /branches/dev/samtools/ksort.h + M /branches/dev/samtools/samtools.1 + +Added copyright information and a bit more documentation. No code change. + +------------------------------------------------------------------------ +r53 | lh3lh3 | 2008-12-16 13:40:18 +0000 (Tue, 16 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam.c + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-64 + * improved efficiency of the indel caller for spliced alignments + +------------------------------------------------------------------------ +r52 | lh3lh3 | 2008-12-16 10:28:20 +0000 (Tue, 16 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam.c + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_aux.c + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-63 + * a bit code cleanup: reduce the dependency between source files + +------------------------------------------------------------------------ +r51 | lh3lh3 | 2008-12-15 14:29:32 +0000 (Mon, 15 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-62 + * fixed a memory leak + +------------------------------------------------------------------------ +r50 | lh3lh3 | 2008-12-15 14:00:13 +0000 (Mon, 15 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/ChangeLog + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/samtools.1 + +update documentation, ChangeLog and a comment + +------------------------------------------------------------------------ +r49 | lh3lh3 | 
2008-12-15 13:36:43 +0000 (Mon, 15 Dec 2008) | 6 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bam_maqcns.h + M /branches/dev/samtools/bam_pileup.c + A /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/samtools.1 + + * samtools-0.1.0-61 + * moved pileup command to a separate source file + * added indel caller + * added bam_cal_segend(). (NOT WORKING for spliced alignment!!!) + * updated documentation + +------------------------------------------------------------------------ +r48 | lh3lh3 | 2008-12-12 13:55:36 +0000 (Fri, 12 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-60 + * fixed another bug in maqcns when there is a nearby deletion + +------------------------------------------------------------------------ +r47 | lh3lh3 | 2008-12-12 13:42:16 +0000 (Fri, 12 Dec 2008) | 5 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bam_pileup.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-59 + * pileup: outputing consensus is now optional + * fixed a bug in glfgen. This bug also exists in maq's glfgen. However, + I am not quite sure why the previous version may have problem. + +------------------------------------------------------------------------ +r46 | lh3lh3 | 2008-12-12 11:44:56 +0000 (Fri, 12 Dec 2008) | 6 lines +Changed paths: + M /branches/dev/samtools/bam_pileup.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-58 + * add maq consensus to pileup. However, I will move this part to a new + command as strictly speaking, consensus callin is not part of pileup, + and imposing it would make it harder to generate for other language + bindings. 
+ +------------------------------------------------------------------------ +r45 | bhandsaker | 2008-12-11 20:43:56 +0000 (Thu, 11 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/bgzf.c + +Fix bug in tell() after reads that consume to the exact end of a block. + +------------------------------------------------------------------------ +r44 | lh3lh3 | 2008-12-11 09:36:53 +0000 (Thu, 11 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/samtools.1 + +update manual + +------------------------------------------------------------------------ +r43 | lh3lh3 | 2008-12-11 09:25:36 +0000 (Thu, 11 Dec 2008) | 4 lines +Changed paths: + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-57 + * fixed a bug in parser when there is auxiliary fields + * made the parser a bit more robust + +------------------------------------------------------------------------ +r42 | lh3lh3 | 2008-12-10 14:57:29 +0000 (Wed, 10 Dec 2008) | 5 lines +Changed paths: + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/bgzf.c + + * samtools-0.1.0-56 + * fixed a bug in bgzf (only reading is affected) + * fixed a typo in bam_index.c + * in bam_index.c, check potential bugs in the underlying I/O library + +------------------------------------------------------------------------ +r41 | lh3lh3 | 2008-12-10 12:53:08 +0000 (Wed, 10 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/samtools.1 + +update manual + +------------------------------------------------------------------------ +r40 | lh3lh3 | 2008-12-10 11:52:10 +0000 (Wed, 10 Dec 2008) | 5 lines +Changed paths: + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_pileup.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-55 + * tried to make pileup work with clipping (previously not), though NOT tested + * removed -v from pileup + * made pileup take the reference sequence + 
+------------------------------------------------------------------------ +r39 | lh3lh3 | 2008-12-09 11:59:28 +0000 (Tue, 09 Dec 2008) | 4 lines +Changed paths: + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/samtools.1 + + * samtools-0.1.0-54 + * in parser, recognize "=", rather than ",", as a match + * in parser, correctl parse "=" at the MRNM field. + +------------------------------------------------------------------------ +r38 | lh3lh3 | 2008-12-09 11:39:07 +0000 (Tue, 09 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/maq2sam.c + +fixed a bug in handling maq flag 64 and 192 + +------------------------------------------------------------------------ +r37 | lh3lh3 | 2008-12-09 09:53:46 +0000 (Tue, 09 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/md5fa.c + +also calculate unordered md5sum check + +------------------------------------------------------------------------ +r36 | lh3lh3 | 2008-12-09 09:46:21 +0000 (Tue, 09 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/md5fa.c + +fixed a minor bug when there are space in the sequence + +------------------------------------------------------------------------ +r35 | lh3lh3 | 2008-12-09 09:40:45 +0000 (Tue, 09 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/md5fa.c + +fixed a potential memory leak + +------------------------------------------------------------------------ +r34 | lh3lh3 | 2008-12-08 14:52:17 +0000 (Mon, 08 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bamtk.c + + * fixed a bug in import: bin is wrongly calculated + +------------------------------------------------------------------------ +r33 | lh3lh3 | 2008-12-08 14:08:01 +0000 (Mon, 08 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/all2sam.pl + +nothing, really + 
+------------------------------------------------------------------------ +r32 | lh3lh3 | 2008-12-08 12:56:02 +0000 (Mon, 08 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/kseq.h + M /branches/dev/samtools/misc/Makefile + A /branches/dev/samtools/misc/md5.c + A /branches/dev/samtools/misc/md5.h + A /branches/dev/samtools/misc/md5fa.c + + * fixed two warnings in kseq.h + * added md5sum utilities + +------------------------------------------------------------------------ +r31 | lh3lh3 | 2008-12-08 11:35:29 +0000 (Mon, 08 Dec 2008) | 5 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bamtk.c + A /branches/dev/samtools/kseq.h + D /branches/dev/samtools/kstream.h + + * samtools-0.1.0-52 + * replace kstream with kseq. kseq is a superset of kstream. I need the + extra functions in kseq.h. + * also compile stand-alone faidx + +------------------------------------------------------------------------ +r30 | lh3lh3 | 2008-12-08 11:17:04 +0000 (Mon, 08 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_sort.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-51 + * sorting by read names is available + +------------------------------------------------------------------------ +r29 | lh3lh3 | 2008-12-08 10:29:02 +0000 (Mon, 08 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam.c + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bam_pileup.c + M /branches/dev/samtools/bam_sort.c + M /branches/dev/samtools/bam_tview.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/misc/maq2sam.c + + * samtools-0.1.0-50 + * format change to meet the latest specification + +------------------------------------------------------------------------ +r28 | lh3lh3 | 2008-12-04 16:09:21 +0000 (Thu, 04 Dec 
2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/misc/maq2sam.c + + * minor change in maqcns: special care when n==0 + * change maq2sam to meet the latest specification + +------------------------------------------------------------------------ +r27 | lh3lh3 | 2008-12-04 15:55:44 +0000 (Thu, 04 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/razf.c + M /branches/dev/samtools/razf.h + +considerable code clean up in razf + +------------------------------------------------------------------------ +r26 | lh3lh3 | 2008-12-04 15:08:18 +0000 (Thu, 04 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/ChangeLog + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/faidx.c + +make RAZF optional in faidx.c + +------------------------------------------------------------------------ +r25 | lh3lh3 | 2008-12-01 15:27:22 +0000 (Mon, 01 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_aux.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/samtools.1 + + * samtools-0.1.0-49 + * added routines for retrieving aux data, NOT TESTED YET! 
+ +------------------------------------------------------------------------ +r24 | lh3lh3 | 2008-12-01 14:29:43 +0000 (Mon, 01 Dec 2008) | 5 lines +Changed paths: + M /branches/dev/samtools/bam.c + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/bgzf.c + M /branches/dev/samtools/samtools.1 + + * samtools-0.1.0-48 + * bgzf: fixed a potential integer overflow on 32-it machines + * maqcns: set the minimum combined quality as 0 + * supporting hex strings + +------------------------------------------------------------------------ +r23 | lh3lh3 | 2008-11-27 17:14:37 +0000 (Thu, 27 Nov 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-47 + * fixed the bug in maqcns + +------------------------------------------------------------------------ +r22 | lh3lh3 | 2008-11-27 17:08:11 +0000 (Thu, 27 Nov 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam.h + A /branches/dev/samtools/bam_maqcns.c + A /branches/dev/samtools/bam_maqcns.h + M /branches/dev/samtools/bam_tview.c + M /branches/dev/samtools/bamtk.c + A /branches/dev/samtools/glf.h + + * samtools-0.1.0-46 + * add MAQ consensus caller, currently BUGGY! 
+ +------------------------------------------------------------------------ +r21 | lh3lh3 | 2008-11-27 13:51:28 +0000 (Thu, 27 Nov 2008) | 4 lines +Changed paths: + M /branches/dev/samtools/bam_pileup.c + M /branches/dev/samtools/bam_tview.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-45 + * tview: display padded alignment (but not P operation) + * better coordinates and reference sequence + +------------------------------------------------------------------------ +r19 | lh3lh3 | 2008-11-27 09:26:05 +0000 (Thu, 27 Nov 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/ChangeLog + +new ChangeLog + +------------------------------------------------------------------------ +r18 | lh3lh3 | 2008-11-27 09:24:45 +0000 (Thu, 27 Nov 2008) | 3 lines +Changed paths: + D /branches/dev/samtools/ChangeLog + A /branches/dev/samtools/ChangeLog.old (from /branches/dev/samtools/ChangeLog:6) + +Rename ChangeLog to ChangeLog.old. This old ChangeLog is generated from +the log of my personal SVN repository. 
+ +------------------------------------------------------------------------ +r17 | lh3lh3 | 2008-11-27 09:22:55 +0000 (Thu, 27 Nov 2008) | 6 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/bgzf.c + + * samtools-0.1.0-44 + * declare fseeko and ftello as some Linux may not do this by default and + missing these declarations will make bgzf buggy + * get rid of some harmless warings + * use BGZF by default, now + +------------------------------------------------------------------------ +r16 | lh3lh3 | 2008-11-26 21:19:11 +0000 (Wed, 26 Nov 2008) | 4 lines +Changed paths: + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/razf.c + + * samtools-0.1.0-43 + * fixed a bug in razf_read() + * give more warnings when the file is truncated (or due to bugs in I/O library) + +------------------------------------------------------------------------ +r15 | lh3lh3 | 2008-11-26 20:41:39 +0000 (Wed, 26 Nov 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/bgzf.c + +fixed a bug in bgzf.c at the end of the file + +------------------------------------------------------------------------ +r14 | lh3lh3 | 2008-11-26 17:05:18 +0000 (Wed, 26 Nov 2008) | 4 lines +Changed paths: + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-42 + * a lot happened to RAZF, although samtools itself is untouched. Better + also update the version number anyway to avoid confusion + +------------------------------------------------------------------------ +r13 | lh3lh3 | 2008-11-26 17:03:48 +0000 (Wed, 26 Nov 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/razf.c + +a change from Jue, but I think it should not matter + +------------------------------------------------------------------------ +r12 | lh3lh3 | 2008-11-26 16:48:14 +0000 (Wed, 26 Nov 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/razf.c + +fixed a potential bug in razf. 
However, it seems still buggy, just +rarely happens, very rarely. + +------------------------------------------------------------------------ +r11 | lh3lh3 | 2008-11-26 14:02:56 +0000 (Wed, 26 Nov 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/razf.c + +fixed a bug in razf, with the help of Jue + +------------------------------------------------------------------------ +r10 | lh3lh3 | 2008-11-26 11:55:32 +0000 (Wed, 26 Nov 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/bam_index.c + +remove a comment + +------------------------------------------------------------------------ +r9 | lh3lh3 | 2008-11-26 11:37:05 +0000 (Wed, 26 Nov 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/razf.c + M /branches/dev/samtools/razf.h + + * Jue has updated razf to realize Bob's scheme + +------------------------------------------------------------------------ +r7 | lh3lh3 | 2008-11-25 20:37:37 +0000 (Tue, 25 Nov 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/samtools.1 + +the manual page + +------------------------------------------------------------------------ +r6 | lh3lh3 | 2008-11-25 20:37:16 +0000 (Tue, 25 Nov 2008) | 3 lines +Changed paths: + A /branches/dev/samtools/ChangeLog + A /branches/dev/samtools/Makefile + A /branches/dev/samtools/bam.c + A /branches/dev/samtools/bam.h + A /branches/dev/samtools/bam_aux.c + A /branches/dev/samtools/bam_endian.h + A /branches/dev/samtools/bam_import.c + A /branches/dev/samtools/bam_index.c + A /branches/dev/samtools/bam_lpileup.c + A /branches/dev/samtools/bam_pileup.c + A /branches/dev/samtools/bam_sort.c + A /branches/dev/samtools/bam_tview.c + A /branches/dev/samtools/bamtk.c + A /branches/dev/samtools/bgzf.c + A /branches/dev/samtools/bgzf.h + A /branches/dev/samtools/bgzip.c + A /branches/dev/samtools/faidx.c + A /branches/dev/samtools/faidx.h + A /branches/dev/samtools/khash.h + A /branches/dev/samtools/ksort.h + A 
/branches/dev/samtools/kstream.h + A /branches/dev/samtools/misc + A /branches/dev/samtools/misc/Makefile + A /branches/dev/samtools/misc/all2sam.pl + A /branches/dev/samtools/misc/maq2sam.c + A /branches/dev/samtools/razf.c + A /branches/dev/samtools/razf.h + A /branches/dev/samtools/razip.c + A /branches/dev/samtools/zutil.h + +The initial version of samtools, replicated from my local SVN repository. +The current version is: 0.1.0-42. All future development will happen here. + +------------------------------------------------------------------------ +r5 | lh3lh3 | 2008-11-25 20:30:49 +0000 (Tue, 25 Nov 2008) | 2 lines +Changed paths: + A /branches/dev/samtools + +samtools (C version) + +------------------------------------------------------------------------ +------------------------------------------------------------------------ +r703 | lh3 | 2008-11-25 20:20:02 +0000 (Tue, 25 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/samtools.1 + +rename bamtk to samtools + +------------------------------------------------------------------------ +r702 | lh3 | 2008-11-25 20:15:09 +0000 (Tue, 25 Nov 2008) | 2 lines +Changed paths: + D /branches/prog/bam/bamtk.1 + A /branches/prog/bam/samtools.1 (from /branches/prog/bam/bamtk.1:679) + +rename bamtk.1 to samtools.1 + +------------------------------------------------------------------------ +r701 | lh3 | 2008-11-25 13:29:10 +0000 (Tue, 25 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/Makefile + M /branches/prog/bam/bam.c + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_import.c + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + M /branches/prog/bam/misc/Makefile + + * samtools-0.1.0-41 + * small (but a bit dangerous) changes to meet the latest specification + +------------------------------------------------------------------------ +r700 | lh3 | 2008-11-25 13:15:11 +0000 (Tue, 25 Nov 2008) | 2 lines +Changed paths: + A 
/branches/prog/bam/misc/all2sam.pl (from /branches/prog/bam/misc/all2tam.pl:649) + D /branches/prog/bam/misc/all2tam.pl + A /branches/prog/bam/misc/maq2sam.c (from /branches/prog/bam/misc/maq2tam.c:699) + D /branches/prog/bam/misc/maq2tam.c + +rename tam to sam + +------------------------------------------------------------------------ +r699 | lh3 | 2008-11-25 13:14:49 +0000 (Tue, 25 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/misc/maq2tam.c + +change for the new specification + +------------------------------------------------------------------------ +r698 | lh3 | 2008-11-24 13:15:20 +0000 (Mon, 24 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/razf.c + M /branches/prog/bam/razf.h + + * add a fake BGZF mode to razf. It is fake in that it loads razf index into + memory but gives BGZF like virtual offset + +------------------------------------------------------------------------ +r697 | lh3 | 2008-11-24 09:53:44 +0000 (Mon, 24 Nov 2008) | 2 lines +Changed paths: + A /branches/prog/bam/ChangeLog + +change log + +------------------------------------------------------------------------ +r696 | lh3 | 2008-11-24 09:53:23 +0000 (Mon, 24 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bgzf.c + +updated bgzf, on behalf of Bob + +------------------------------------------------------------------------ +r695 | lh3 | 2008-11-23 11:40:31 +0000 (Sun, 23 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/razf.c + +fixed a bug in razf + +------------------------------------------------------------------------ +r694 | lh3 | 2008-11-22 16:23:52 +0000 (Sat, 22 Nov 2008) | 4 lines +Changed paths: + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bam_lpileup.c + M /branches/prog/bam/bam_tview.c + M /branches/prog/bam/bamtk.c + + * bam-0.1.0-40 + * fixed two small memory leaks + * fixed a memory problem when seek outside the length of the sequence + 
+------------------------------------------------------------------------ +r693 | lh3 | 2008-11-22 16:10:04 +0000 (Sat, 22 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bamtk.c + + * bam-0.1.0-39 + * fixed an uninitialized warning. This does not matter in fact + +------------------------------------------------------------------------ +r692 | lh3 | 2008-11-22 15:44:05 +0000 (Sat, 22 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/razf.c + M /branches/prog/bam/razf.h + +Jue's new razf + +------------------------------------------------------------------------ +r691 | lh3 | 2008-11-21 21:30:39 +0000 (Fri, 21 Nov 2008) | 4 lines +Changed paths: + M /branches/prog/bam/Makefile + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bamtk.c + M /branches/prog/bam/bgzip.c + + * bam-0.1.0-38 + * get rid of some warings in bgzip.c + * potentially improve performance in indexing for BGZF + +------------------------------------------------------------------------ +r690 | lh3 | 2008-11-21 21:15:51 +0000 (Fri, 21 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bgzf.c + +I think I have fixed the bug in bgzf + +------------------------------------------------------------------------ +r689 | lh3 | 2008-11-21 20:48:56 +0000 (Fri, 21 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bgzf.c + +bug fix by Bob + +------------------------------------------------------------------------ +r688 | lh3 | 2008-11-21 20:37:27 +0000 (Fri, 21 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_index.c + +fixed a bug due to the name change in _IOLIB + +------------------------------------------------------------------------ +r687 | lh3 | 2008-11-21 14:42:56 +0000 (Fri, 21 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bgzf.c + +fix small things + +------------------------------------------------------------------------ +r686 | lh3 | 
2008-11-21 14:37:59 +0000 (Fri, 21 Nov 2008) | 2 lines +Changed paths: + A /branches/prog/bam/bgzf.c + A /branches/prog/bam/bgzf.h + A /branches/prog/bam/bgzip.c + +Bob's BGZF format, although currently buggy + +------------------------------------------------------------------------ +r685 | lh3 | 2008-11-21 09:48:20 +0000 (Fri, 21 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bam_tview.c + M /branches/prog/bam/bamtk.c + + * bam-0.1.0-37 + * improve interface a little bit + +------------------------------------------------------------------------ +r684 | lh3 | 2008-11-21 09:30:18 +0000 (Fri, 21 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam_tview.c + M /branches/prog/bam/bamtk.c + + * bam-0.1.0-36 + * improve the interface of tview, a little bit + +------------------------------------------------------------------------ +r683 | lh3 | 2008-11-20 22:33:54 +0000 (Thu, 20 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bam_tview.c + +a little better viewer + +------------------------------------------------------------------------ +r682 | lh3 | 2008-11-20 22:27:01 +0000 (Thu, 20 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_tview.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-35 + * better viewer + +------------------------------------------------------------------------ +r681 | lh3 | 2008-11-20 20:51:16 +0000 (Thu, 20 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/Makefile + M /branches/prog/bam/bam_tview.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-34 + * tview is now a component of bamtk + +------------------------------------------------------------------------ +r680 | lh3 | 2008-11-20 19:17:30 +0000 (Thu, 20 Nov 2008) | 2 lines +Changed paths: + A /branches/prog/bam/bam_tview.c + +text alignment viewer + +------------------------------------------------------------------------ +r679 | lh3 | 2008-11-20 19:17:15 +0000 (Thu, 
20 Nov 2008) | 5 lines +Changed paths: + M /branches/prog/bam/Makefile + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bam_lpileup.c + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.1 + M /branches/prog/bam/bamtk.c + M /branches/prog/bam/faidx.c + + * bamtk-0.1.0-33 + * added routines to reset pileup bufferes + * fixed a bug in faidx + * add text alignment viewer + +------------------------------------------------------------------------ +r678 | lh3 | 2008-11-20 11:05:02 +0000 (Thu, 20 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/Makefile + A /branches/prog/bam/bam_lpileup.c (from /branches/prog/bam/bam_tview.c:668) + D /branches/prog/bam/bam_tview.c + +rename tview as lpileup + +------------------------------------------------------------------------ +r677 | lh3 | 2008-11-20 10:08:52 +0000 (Thu, 20 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/razf.c + +fixed a bug in razf + +------------------------------------------------------------------------ +r676 | lh3 | 2008-11-19 22:52:20 +0000 (Wed, 19 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/faidx.h + +add documentations + +------------------------------------------------------------------------ +r674 | lh3 | 2008-11-19 21:39:17 +0000 (Wed, 19 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bamtk.1 + M /branches/prog/bam/faidx.h + +update documentation + +------------------------------------------------------------------------ +r673 | lh3 | 2008-11-19 21:19:03 +0000 (Wed, 19 Nov 2008) | 2 lines +Changed paths: + A /branches/prog/bam/bamtk.1 + +add manual page + +------------------------------------------------------------------------ +r672 | lh3 | 2008-11-19 16:40:49 +0000 (Wed, 19 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bamtk.c + M /branches/prog/bam/faidx.c + + * bamtk-0.1.0-32 + * make faidx more 
error resistant + +------------------------------------------------------------------------ +r671 | lh3 | 2008-11-19 16:09:55 +0000 (Wed, 19 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/faidx.h + +add index + +------------------------------------------------------------------------ +r670 | lh3 | 2008-11-19 16:02:39 +0000 (Wed, 19 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + M /branches/prog/bam/faidx.c + + * bamtk-0.1.0-31 + * show reference sequence in pileup -v (not in the default pileup) + +------------------------------------------------------------------------ +r669 | lh3 | 2008-11-19 14:51:17 +0000 (Wed, 19 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/Makefile + M /branches/prog/bam/bamtk.c + M /branches/prog/bam/faidx.c + + * bamtk-0.1.0-30 + * put faidx in bamtk and remove faidx_main.c + +------------------------------------------------------------------------ +r668 | lh3 | 2008-11-19 14:15:05 +0000 (Wed, 19 Nov 2008) | 4 lines +Changed paths: + M /branches/prog/bam/Makefile + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bam_tview.c + M /branches/prog/bam/bamtk.c + A /branches/prog/bam/faidx.c + A /branches/prog/bam/faidx.h + M /branches/prog/bam/razf.c + + * bamtk-0.1.0-29 + * fixed a bug in tview.c + * prepare to add faidx + +------------------------------------------------------------------------ +r667 | lh3 | 2008-11-19 10:20:45 +0000 (Wed, 19 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/razf.c + M /branches/prog/bam/razf.h + +gzip-compatible razf + +------------------------------------------------------------------------ +r664 | lh3 | 2008-11-18 12:50:23 +0000 (Tue, 18 Nov 2008) | 5 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-28 + * fetch: fixed a bug at an array boundary + * fetch: fixed a bug when the whole chromosome is 
retrieved + * add linear index + +------------------------------------------------------------------------ +r663 | lh3 | 2008-11-17 21:29:22 +0000 (Mon, 17 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/Makefile + M /branches/prog/bam/bam.c + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_import.c + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bam_tview.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-27 + * put l_qseq into core and move l_aux to bam1_t + +------------------------------------------------------------------------ +r662 | lh3 | 2008-11-17 20:55:16 +0000 (Mon, 17 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam.c + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_import.c + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-26 + * save seq and qual separately + +------------------------------------------------------------------------ +r661 | lh3 | 2008-11-17 13:09:37 +0000 (Mon, 17 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bam.h + +little + +------------------------------------------------------------------------ +r660 | lh3 | 2008-11-17 13:06:14 +0000 (Mon, 17 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bam.h + +more documentations + +------------------------------------------------------------------------ +r659 | lh3 | 2008-11-17 12:55:08 +0000 (Mon, 17 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-25 + * make tview work for TAM + +------------------------------------------------------------------------ +r658 | lh3 | 2008-11-17 12:50:21 +0000 (Mon, 17 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/Makefile + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bam_tview.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-24 + * make tview as an independent module + 
+------------------------------------------------------------------------ +r657 | lh3 | 2008-11-17 11:26:06 +0000 (Mon, 17 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/Makefile + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_pileup.c + +change little + +------------------------------------------------------------------------ +r656 | lh3 | 2008-11-16 21:33:19 +0000 (Sun, 16 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-23 + * also add tview for TAM + +------------------------------------------------------------------------ +r655 | lh3 | 2008-11-16 21:29:46 +0000 (Sun, 16 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/Makefile + M /branches/prog/bam/bam_tview.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-22 + * make tview more efficient for deep depth + +------------------------------------------------------------------------ +r654 | lh3 | 2008-11-16 20:52:19 +0000 (Sun, 16 Nov 2008) | 4 lines +Changed paths: + M /branches/prog/bam/Makefile + M /branches/prog/bam/bam_import.c + M /branches/prog/bam/bam_pileup.c + A /branches/prog/bam/bam_tview.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-21 + * fixed bug in the TAM parser: lowercase not recognized + * unfinished function to leveled pileup (tview) + +------------------------------------------------------------------------ +r653 | lh3 | 2008-11-15 12:58:36 +0000 (Sat, 15 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-20 + * pileup now display deleted bases as '*' + +------------------------------------------------------------------------ +r652 | lh3 | 2008-11-15 09:58:39 +0000 (Sat, 15 Nov 2008) | 4 lines +Changed paths: + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-19 + * fixed a bug in fetch() + * reduce memory in indexing + 
+------------------------------------------------------------------------ +r651 | lh3 | 2008-11-14 21:56:05 +0000 (Fri, 14 Nov 2008) | 5 lines +Changed paths: + M /branches/prog/bam/Makefile + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-18 + * important changes are made to index: the index size is increased, but + now we have no limit on file sizes and the new method potentially + works with BGZF, Bob's new compression format. + +------------------------------------------------------------------------ +r650 | lh3 | 2008-11-14 16:03:22 +0000 (Fri, 14 Nov 2008) | 4 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-17 + * more comments in bam.h + * fixed a bug in bam_index.c + +------------------------------------------------------------------------ +r649 | lh3 | 2008-11-13 16:04:18 +0000 (Thu, 13 Nov 2008) | 4 lines +Changed paths: + M /branches/prog/bam/bam.c + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_import.c + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bam_sort.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-16 + * use macros to retrieve pointers from bam1_t and thus reduce the size + of bam1_t struct. 
+ +------------------------------------------------------------------------ +r648 | lh3 | 2008-11-13 13:21:39 +0000 (Thu, 13 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam_sort.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-15 + * make more things work over pipe + +------------------------------------------------------------------------ +r647 | lh3 | 2008-11-13 12:49:28 +0000 (Thu, 13 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/misc/maq2tam.c + +fixed a bug in maq2tam + +------------------------------------------------------------------------ +r646 | lh3 | 2008-11-13 11:46:59 +0000 (Thu, 13 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/Makefile + M /branches/prog/bam/misc/Makefile + M /branches/prog/bam/misc/maq2tam.c + + * bug fix in maq2tam.c + * improve Makefile + +------------------------------------------------------------------------ +r645 | lh3 | 2008-11-13 11:39:46 +0000 (Thu, 13 Nov 2008) | 3 lines +Changed paths: + A /branches/prog/bam/misc/Makefile + M /branches/prog/bam/misc/maq2tam.c + + * corrected maq2tam + * add Makefile + +------------------------------------------------------------------------ +r644 | lh3 | 2008-11-13 11:25:45 +0000 (Thu, 13 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/razf.c + +fixed the bug in buffered write (on behalf of Jue) + +------------------------------------------------------------------------ +r643 | lh3 | 2008-11-13 10:53:42 +0000 (Thu, 13 Nov 2008) | 2 lines +Changed paths: + D /branches/prog/bam/all2tam.pl + A /branches/prog/bam/misc/all2tam.pl (from /branches/prog/bam/all2tam.pl:642) + +move to misc + +------------------------------------------------------------------------ +r642 | lh3 | 2008-11-13 10:53:23 +0000 (Thu, 13 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/all2tam.pl + +change tag + +------------------------------------------------------------------------ +r641 | lh3 | 2008-11-13 10:53:12 +0000 (Thu, 13 Nov 2008) | 2 lines +Changed 
paths: + D /branches/prog/bam/utils + +has been renamed + +------------------------------------------------------------------------ +r640 | lh3 | 2008-11-13 10:52:50 +0000 (Thu, 13 Nov 2008) | 2 lines +Changed paths: + A /branches/prog/bam/misc (from /branches/prog/bam/utils:639) + +rename + +------------------------------------------------------------------------ +r639 | lh3 | 2008-11-13 10:52:35 +0000 (Thu, 13 Nov 2008) | 2 lines +Changed paths: + A /branches/prog/bam/utils + A /branches/prog/bam/utils/maq2tam.c + +utilities (converters and so on) + +------------------------------------------------------------------------ +r638 | lh3 | 2008-11-12 22:24:22 +0000 (Wed, 12 Nov 2008) | 4 lines +Changed paths: + M /branches/prog/bam/bam.c + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_import.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-14 + * copy the text header to BAM + * add BAM1 header flag + +------------------------------------------------------------------------ +r637 | lh3 | 2008-11-12 14:56:08 +0000 (Wed, 12 Nov 2008) | 4 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bamtk.c + M /branches/prog/bam/razf.c + + * bamtk-0.1.0-13 + * fixed a bug in razf + * improved and fixed potential bugs in index + +------------------------------------------------------------------------ +r636 | lh3 | 2008-11-12 11:57:13 +0000 (Wed, 12 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + +update documentation in the HeaderDOC format + +------------------------------------------------------------------------ +r635 | lh3 | 2008-11-12 10:08:38 +0000 (Wed, 12 Nov 2008) | 4 lines +Changed paths: + M /branches/prog/bam/bam.c + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_import.c + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + + 
* bamtk-0.1.0-12 + * more documentations + * rename baf1_core_t as bam1_core_t + +------------------------------------------------------------------------ +r634 | lh3 | 2008-11-11 23:00:35 +0000 (Tue, 11 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_pileup.c + +documentation + +------------------------------------------------------------------------ +r633 | lh3 | 2008-11-11 21:23:49 +0000 (Tue, 11 Nov 2008) | 4 lines +Changed paths: + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-11 + * give up regional pileup. We can now use pipe to mimic that. + * for index file, change suffix .idx to .bmi + +------------------------------------------------------------------------ +r632 | lh3 | 2008-11-11 21:00:11 +0000 (Tue, 11 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_import.c + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + M /branches/prog/bam/razf.c + + * bamtk-0.1.0-10 + * make pileup work on TAM + +------------------------------------------------------------------------ +r631 | lh3 | 2008-11-11 09:20:29 +0000 (Tue, 11 Nov 2008) | 4 lines +Changed paths: + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + M /branches/prog/bam/razf.c + M /branches/prog/bam/razf.h + M /branches/prog/bam/razip.c + + * bamtk-0.1.0-9 + * razf now supports streaming + * prepare to improve pileup (have not yet) + +------------------------------------------------------------------------ +r630 | lh3 | 2008-11-10 18:34:40 +0000 (Mon, 10 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_import.c + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-8 + * improve the interface of TAM parser + +------------------------------------------------------------------------ +r629 | lh3 | 2008-11-10 13:06:13 +0000 (Mon, 10 Nov 2008) | 3 
lines +Changed paths: + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-7 + * almost nothing + +------------------------------------------------------------------------ +r628 | lh3 | 2008-11-10 12:56:36 +0000 (Mon, 10 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam.c + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-6 + * fixed a bug in bam_pileup.c + +------------------------------------------------------------------------ +r627 | lh3 | 2008-11-10 11:32:46 +0000 (Mon, 10 Nov 2008) | 4 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + M /branches/prog/bam/razf.c + + * bamtk-0.1.0-5 + * fixed a bug in razf.c, caused by my modifications + * improve the interface of pileup. Now it will be slower but more flexible + +------------------------------------------------------------------------ +r626 | lh3 | 2008-11-09 20:51:04 +0000 (Sun, 09 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-4 + * view: dumping binary output + +------------------------------------------------------------------------ +r625 | lh3 | 2008-11-09 20:31:54 +0000 (Sun, 09 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam.c + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_import.c + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bam_sort.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-3 + * rename functions + +------------------------------------------------------------------------ +r624 | lh3 | 2008-11-09 15:07:32 +0000 (Sun, 09 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bam.h + +add comments + +------------------------------------------------------------------------ +r623 | lh3 | 2008-11-08 22:32:49 +0000 (Sat, 08 Nov 2008) | 4 lines +Changed paths: + M 
/branches/prog/bam/bam_index.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-2 + * improve indexing for a mixture of long and short reads, although currently + I do not know whether it really works... + +------------------------------------------------------------------------ +r622 | lh3 | 2008-11-08 22:13:58 +0000 (Sat, 08 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-1 + * prepare for improving indexing algorithm + +------------------------------------------------------------------------ +r621 | lh3 | 2008-11-08 20:28:09 +0000 (Sat, 08 Nov 2008) | 4 lines +Changed paths: + A /branches/prog/bam/all2tam.pl + M /branches/prog/bam/bam.c + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_import.c + M /branches/prog/bam/bamtk.c + D /branches/prog/bam/tam_utils.pl + + * bamtk-0.1.0 + * smarter integers + * rename tam_utils.pl to all2tam.pl + +------------------------------------------------------------------------ +r620 | lh3 | 2008-11-08 17:17:22 +0000 (Sat, 08 Nov 2008) | 2 lines +Changed paths: + A /branches/prog/bam + A /branches/prog/bam/Makefile + A /branches/prog/bam/bam.c + A /branches/prog/bam/bam.h + A /branches/prog/bam/bam_endian.h + A /branches/prog/bam/bam_import.c + A /branches/prog/bam/bam_index.c + A /branches/prog/bam/bam_pileup.c + A /branches/prog/bam/bam_sort.c + A /branches/prog/bam/bamtk.c + A /branches/prog/bam/khash.h + A /branches/prog/bam/ksort.h + A /branches/prog/bam/kstream.h + A /branches/prog/bam/razf.c + A /branches/prog/bam/razf.h + A /branches/prog/bam/razip.c + A /branches/prog/bam/tam_utils.pl + A /branches/prog/bam/zutil.h + +The Binary Alignment/Mapping format. + +------------------------------------------------------------------------
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/INSTALL Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,30 @@ +System Requirements +=================== + +SAMtools depends on the zlib library <http://www.zlib.net>. Version 1.2.3+ is +preferred and with 1.2.3+ you can compile razip and use it to compress a FASTA +file. SAMtools' faidx is able to index a razip-compressed FASTA file to save +diskspace. Older zlib also works with SAMtools, but razip cannot be compiled. + +The text-based viewer (tview) requires the GNU ncurses library +<http://www.gnu.org/software/ncurses/>, which comes with Mac OS X and most of +the modern Linux/Unix distributions. If you do not have this library installed, +you can still compile the rest of SAMtools by manually changing: +`-D_CURSES_LIB=1' to `-D_CURSES_LIB=0' at the line starting with `DFLAGS=', and +comment out the line starting with `LIBCURSES='. + + +Compilation +=========== + +Type `make' to compile samtools. If you have zlib >= 1.2.2.1, you can compile +razip with `make razip'. + + +Installation +============ + +Copy `samtools', `bcftools/bcftools' and other executables/scripts in `misc' to +a location you want (e.g. a directory in your $PATH). You may also copy +`samtools.1' and `bcftools/bcftools.1' to a directory in your $MANPATH such +that the `man' command may find the manual.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/Makefile Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,101 @@ +CC= gcc +CFLAGS= -g -Wall -O2 +#LDFLAGS= -Wl,-rpath,\$$ORIGIN/../lib +DFLAGS= -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_USE_KNETFILE -D_CURSES_LIB=0 +KNETFILE_O= knetfile.o +LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ + bam_pileup.o bam_lpileup.o bam_md.o razf.o faidx.o bedidx.o \ + $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o bam_cat.o +AOBJS= bam_tview.o bam_plcmd.o sam_view.o \ + bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ + bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \ + cut_target.o phase.o bam2depth.o padding.o bedcov.o bamshuf.o \ + bam_tview_curses.o bam_tview_html.o +PROG= samtools +INCLUDES= -I. +SUBDIRS= . bcftools misc +LIBPATH= +LIBCURSES= # -lXCurses + +.SUFFIXES:.c .o +.PHONY: all lib + +.c.o: + $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ + +all-recur lib-recur clean-recur cleanlocal-recur install-recur: + @target=`echo $@ | sed s/-recur//`; \ + wdir=`pwd`; \ + list='$(SUBDIRS)'; for subdir in $$list; do \ + cd $$subdir; \ + $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \ + INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \ + cd $$wdir; \ + done; + +all:$(PROG) + +.PHONY:all lib clean cleanlocal +.PHONY:all-recur lib-recur clean-recur cleanlocal-recur install-recur + +lib:libbam.a + +libbam.a:$(LOBJS) + $(AR) -csru $@ $(LOBJS) + +samtools:lib-recur $(AOBJS) + $(CC) $(CFLAGS) -o $@ $(AOBJS) $(LDFLAGS) libbam.a -Lbcftools -lbcf $(LIBPATH) $(LIBCURSES) -lm -lz -lpthread + +razip:razip.o razf.o $(KNETFILE_O) + $(CC) $(CFLAGS) -o $@ $^ -lz + +bgzip:bgzip.o bgzf.o $(KNETFILE_O) + $(CC) $(CFLAGS) -o $@ $^ -lz -lpthread + +bgzf.o:bgzf.c bgzf.h + $(CC) -c $(CFLAGS) $(DFLAGS) -DBGZF_CACHE $(INCLUDES) bgzf.c -o $@ + +razip.o:razf.h +bam.o:bam.h razf.h bam_endian.h kstring.h sam_header.h +sam.o:sam.h bam.h 
+bam_import.o:bam.h kseq.h khash.h razf.h +bam_pileup.o:bam.h razf.h ksort.h +bam_plcmd.o:bam.h faidx.h bcftools/bcf.h bam2bcf.h +bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h +bam_lpileup.o:bam.h ksort.h +bam_tview.o:bam.h faidx.h bam_tview.h +bam_tview_curses.o:bam.h faidx.h bam_tview.h +bam_tview_html.o:bam.h faidx.h bam_tview.h +bam_sort.o:bam.h ksort.h razf.h +bam_md.o:bam.h faidx.h +sam_header.o:sam_header.h khash.h +bcf.o:bcftools/bcf.h +bam2bcf.o:bam2bcf.h errmod.h bcftools/bcf.h +bam2bcf_indel.o:bam2bcf.h +errmod.o:errmod.h +phase.o:bam.h khash.h ksort.h +bamtk.o:bam.h + +faidx.o:faidx.h razf.h khash.h +faidx_main.o:faidx.h razf.h + + +libbam.1.dylib-local:$(LOBJS) + libtool -dynamic $(LOBJS) -o libbam.1.dylib -lc -lz + +libbam.so.1-local:$(LOBJS) + $(CC) -shared -Wl,-soname,libbam.so -o libbam.so.1 $(LOBJS) -lc -lz + +dylib: + @$(MAKE) cleanlocal; \ + case `uname` in \ + Linux) $(MAKE) CFLAGS="$(CFLAGS) -fPIC" libbam.so.1-local;; \ + Darwin) $(MAKE) CFLAGS="$(CFLAGS) -fPIC" libbam.1.dylib-local;; \ + *) echo 'Unknown OS';; \ + esac + + +cleanlocal: + rm -fr gmon.out *.o a.out *.exe *.dSYM razip bgzip $(PROG) *~ *.a *.so.* *.so *.dylib + +clean:cleanlocal-recur
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/Makefile.mingw Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,63 @@ +CC= gcc.exe +AR= ar.exe +CFLAGS= -g -Wall -O2 +DFLAGS= -D_USE_KNETFILE -D_CURSES_LIB=2 +KNETFILE_O= knetfile.o +LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ + bam_pileup.o bam_lpileup.o bam_md.o razf.o faidx.o \ + $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o bedidx.o +AOBJS= bam_tview.o bam_plcmd.o sam_view.o \ + bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ + bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \ + cut_target.o phase.o bam_cat.o bam2depth.o +BCFOBJS= bcftools/bcf.o bcftools/fet.o bcftools/bcf2qcall.o bcftools/bcfutils.o \ + bcftools/call1.o bcftools/index.o bcftools/kfunc.o bcftools/em.o \ + bcftools/kmin.o bcftools/prob1.o bcftools/vcf.o bcftools/mut.o +PROG= samtools.exe bcftools.exe +INCLUDES= -I. -Iwin32 +SUBDIRS= . +LIBPATH= + +.SUFFIXES:.c .o + +.c.o: + $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ + +all:$(PROG) + +.PHONY:all lib clean cleanlocal +.PHONY:all-recur lib-recur clean-recur cleanlocal-recur install-recur + +lib:libbam.a + +libbam.a:$(LOBJS) + $(AR) -cru $@ $(LOBJS) + +samtools.exe:$(AOBJS) libbam.a $(BCFOBJS) + $(CC) $(CFLAGS) -o $@ $(AOBJS) $(BCFOBJS) $(LIBPATH) -lm -L. 
-lbam -Lwin32 -lz -lcurses -lws2_32 + +bcftools.exe:$(BCFOBJS) bcftools/main.o kstring.o bgzf.o knetfile.o bedidx.o + $(CC) $(CFLAGS) -o $@ $(BCFOBJS) bcftools/main.o kstring.o bgzf.o knetfile.o bedidx.o -lm -Lwin32 -lz -lws2_32 + +razip.o:razf.h +bam.o:bam.h razf.h bam_endian.h kstring.h sam_header.h +sam.o:sam.h bam.h +bam_import.o:bam.h kseq.h khash.h razf.h +bam_pileup.o:bam.h razf.h ksort.h +bam_plcmd.o:bam.h faidx.h bcftools/bcf.h bam2bcf.h +bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h +bam_lpileup.o:bam.h ksort.h +bam_tview.o:bam.h faidx.h +bam_sort.o:bam.h ksort.h razf.h +bam_md.o:bam.h faidx.h +sam_header.o:sam_header.h khash.h +bcf.o:bcftools/bcf.h +bam2bcf.o:bam2bcf.h errmod.h bcftools/bcf.h +bam2bcf_indel.o:bam2bcf.h +errmod.o:errmod.h + +faidx.o:faidx.h razf.h khash.h +faidx_main.o:faidx.h razf.h + +clean: + rm -fr gmon.out *.o a.out *.exe *.dSYM razip bgzip $(PROG) *~ *.a *.so.* *.so *.dylib
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/NEWS Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,836 @@ +Beta Release 0.1.19 (15 March, 2013) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in samtools and bcftools: + + * The latest source code and development moved to github, + http://github.com/samtools/samtools + + * Many important bugfixes and contributions by many people. Thanks to all! + + * Performance improvements (multi-threading) + + * Important changes in calling, see + - samtools mpileup -p + - bcftools view -m + + * New annotations useful for filtering (RPB, HWE, QBD, MDV) + + * New tools, bamcheck and plot-bamcheck + + * New features in samtools tview + + * And much more.. + +For a detailed list of commits, please see +http://github.com/samtools/samtools/commits/master + +(0.1.19: 15 March 2013, commit 96b5f2294ac0054230e88913c4983d548069ea4e) + + +Beta Release 0.1.18 (2 September, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in samtools: + + * Support the new =/X CIGAR operators (by Peter Cock). + + * Allow to subsample BAM while keeping the pairing intact (view -s). + + * Implemented variant distance bias as a new filter (by Petr Danecek). + + * Bugfix: huge memory usage during indexing + + * Bugfix: use of uninitialized variable in mpileup (rare) + + * Bugfix: wrong BAQ probability (rare) + +Notable changes in bcftools: + + * Support indel in the contrast caller. + + * Bugfix: LRT2=nan in rare cases + +(0.1.18: 2 September 2011, r982:295) + + + +Beta Release 0.1.17 (6 July, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +With the maturity of `mpileup' and the lack of update in the `pileup' command, +the `pileup' command is now formally dropped. Most of the pileup functionality, +such as outputting mapping quality and read positions, have been added +`mpileup'. 
+ +Since this release, `bcftools view' is able to perform contrast SNP calling +(option -T) for discovering de novo and/or somatic mutations between a pair of +samples or in a family trio. Potential mutations are scored by a log likelihood +ratio, which is very simple in math, but should be comparable to more +sophisticated methods. Note that getting the score is only the very first step. +A lot more need to be done to reduce systematic errors due to mapping and +reference errors and structural variations. + +Other notable changes in samtools: + + * Improved sorting order checking during indexing. + + * Improved region parsing. Colons in reference sequence names are parsed + properly. + + * Fixed an issue where mpileup does not apply BAQ for the first few reads when + a region is specified. + + * Fixed an issue where `faidx' does not work with FASTA files with long lines. + + * Bugfix: wrong SP genotype information in the BCF output. + +Other notable changes in bcftools: + + * Output the ML estimate of the allele count. + + * Added the HWE plus F<0 filter to varFilter. For multiple samples, it + effectively filters false heterozygous calls around centromeres. + + * For association mapping, perform both 1-degree and 2-degree test. The + 2-degree test is conservative but more robust to HWE violation. + +(0.1.17: 6 July 2011, r973:277) + + + +Beta Release 0.1.16 (21 April, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in samtools: + + * Support the new SAM/BAM type `B' in the latest SAM spec v1.4. + + * When the output file of `samtools merge' exists, do not overwrite it unless + a new command-line option `-f' is applied. + + * Bugfix: BED support is not working when the input BED is not sorted. + + * Bugfix: some reads without coordinates but given on the reverse strand are + lost in merging. + +Notable changes in bcftools: + + * Code cleanup: separated max-likelihood inference and Bayesian inference. 
+ + * Test Hardy-Weinberg equilibrium with a likelihood-ratio test. + + * Provided another association test P-value by likelihood-ratio test. + + * Use Brent's method to estimate the site allele frequency when EM converges + slowly. The resulting ML estimate of allele frequency is more accurate. + + * Added the `ldpair' command, which computes r^2 between SNP pairs given in + an input file. + +Also, the `pileup' command, which has been deprecated by `mpileup' since +version 0.1.10, will be dropped in the next release. The old `pileup' command +is substandard and causing a lot of confusion. + +(0.1.16: 21 April 2011, r963:234) + + + +Beta Release 0.1.15 (10 April, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes: + + * Allow to perform variant calling or to extract information in multiple + regions specified by a BED file (`samtools mpileup -l', `samtools view -L' + and `bcftools view -l'). + + * Added the `depth' command to samtools to compute the per-base depth with a + simpler interface. File `bam2depth.c', which implements this command, is the + recommended example on how to use the mpileup APIs. + + * Estimate genotype frequencies with ML; perform chi^2 based Hardy-Weinberg + test using this estimate. + + * For `samtools view', when `-R' is specified, drop read groups in the header + that are not contained in the specified file. + + * For `samtools flagstat', separate QC-pass and QC-fail reads. + + * Improved the command line help of `samtools mpileup' and `bcftools view'. + + * Use a global variable to control the verbose level of samtools stderr + output. Nonetheless, it has not been fully utilized. + + * Fixed an issue in association test which may report false associations, + possibly due to floating point underflow. + +(0.1.15: 10 April 2011, r949:203) + + + +Beta release 0.1.14 (21 March, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This release implements a method for testing associations for case-control +data. 
The method does not call genotypes but instead sums over all genotype +configurations to compute a chi^2 based test statistics. It can be potentially +applied to comparing a pair of samples (e.g. a tumor-normal pair), but this +has not been evaluated on real data. + +Another new feature is to make X chromosome variant calls when female and male +samples are both present. The user needs to provide a file indicating the +ploidy of each sample (see also manual bcftools/bcftools.1). + +Other notable changes: + + * Added `bcftools view -F' to parse BCF files generated by samtools r921 or + older which encodes PL in a different way. + + * Changed the behavior of `bcftools view -s'. Now when a list of samples is + provided, the samples in the output will be reordered to match the ordering + in the sample list. This change is mainly designed for association test. + + * Sped up `bcftools view -v' for target sequencing given thousands of samples. + Also added a new option `view -d' to skip loci where only a few samples are + covered by reads. + + * Dropped HWE test. This feature has never been implemented properly. An EM + should be much better. To be implemented in future. + + * Added the `cat' command to samtools. This command concatenates BAMs with + identical sequence dictionaries in an efficient way. Modified from bam_cat.c + written by Chris Saunders. + + * Added `samtools view -1' to write BAMs at a low compression level but twice + faster to create. The `sort' command generates temporary files at a low + compression level as well. + + * Added `samtools mpileup -6' to accept "BAM" with Illumina 1.3+ quality + strings (strictly speaking, such a file is not BAM). + + * Added `samtools mpileup -L' to skip INDEL calling in regions with + excessively high coverage. Such regions dramatically slow down mpileup. + + * Updated `misc/export2sam.pl', provided by Chris Saunders from Illumina Inc. 
+ +(0.1.14: 21 March 2011, r933:170) + + + +Beta release 0.1.13 (1 March, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The most important though largely invisible modification is the change of the +order of genotypes in the PL VCF/BCF tag. This is to conform the upcoming VCF +spec v4.1. The change means that 0.1.13 is not backward compatible with VCF/BCF +generated by samtools older than r921 inclusive. VCF/BCF generated by the new +samtools will contain a line `##fileformat=VCFv4.1' as well as the samtools +version number. + +Single Individual Haplotyping (SIH) is added as an experimental feature. It +originally aims to produce haploid consensus from fosmid pool sequencing, but +also works with short-read data. For short reads, phased blocks are usually too +short to be useful in many applications, but they can help to rule out part of +SNPs close to INDELs or between copies of CNVs. + + +Other notable changes in samtools: + + * Construct per-sample consensus to reduce the effect of nearby SNPs in INDEL + calling. This reduces the power but improves specificity. + + * Improved sorting order checking in indexing. Now indexing is the preferred way + to check if a BAM is sorted. + + * Added a switch `-E' to mpileup and calmd. This option uses an alternative way + to apply BAQ, which increases sensitivity, especially to MNPs, at the cost of + a little loss in specificity. + + * Added `mpileup -A' to allow to use reads in anomalous pairs in SNP calling. + + * Added `mpileup -m' to allow fine control of the collection of INDEL candidates. + + * Added `mpileup -S' to compute per-sample strand bias P-value. + + * Added `mpileup -G' to exclude read groups in variant calling. + + * Fixed segfault in indel calling related to unmapped and refskip reads. + + * Fixed an integer overflow in INDEL calling. This bug produces wrong INDEL + genotypes for longer short INDELs, typically over 10bp. + + * Fixed a bug in tview on big-endian machines. 
+ + * Fixed a very rare memory issue in bam_md.c + + * Fixed an out-of-boundary bug in mpileup when the read base is `N'. + + * Fixed a compiling error when the knetfile library is not used. Fixed a + library compiling error due to the lack of bam_nt16_nt4_table[] table. + Suppress a compiling warning related to the latest zlib. + + +Other notable changes in bcftools: + + * Updated the BCF spec. + + * Added the `FQ' VCF INFO field, which gives the phred-scaled probability + of all samples being the same (identical to the reference or all homozygous + variants). Option `view -f' has been dropped. + + * Implemented "vcfutils.pl vcf2fq" to generate a consensus sequence + similar to "samtools.pl pileup2fq". + + * Make sure the GT FORMAT field is always the first FORMAT to conform the VCF + spec. Drop bcf-fix.pl. + + * Output bcftools specific INFO and FORMAT in the VCF header. + + * Added `view -s' to call variants from a subset of samples. + + * Properly convert VCF to BCF with a user provided sequence dictionary. Nonetheless, + custom fields are still unparsed and will be stored as a missing value. + + * Fixed a minor bug in Fisher's exact test; the results are rarely changed. + + +(0.1.13: 1 March 2011, r926:134) + + + +Beta release 0.1.12a (2 December, 2010) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is another bug fix release: + + * Fixed a memory violation in mpileup, which causes segfault. Release + 0.1.9 and above are affected. + + * Fixed a memory violation in the indel caller, which does not causes + segfault, but may potentially affect deletion calls in an unexpected + way. Release 0.1.10 and above are affected. + + * Fixed a bug in computing r-square in bcftools. Few are using this + functionality and it only has minor effect. + + * Fixed a memory leak in bam_fetch(). + + * Fixed a bug in writing meta information to the BAM index for the last + sequence. This bug is invisible to most users, but it is a bug anyway. 
+ + * Fixed a bug in bcftools which causes false "DP4=0,0,0,0" annotations. + +(0.1.12: 2 December 2010, r862) + + + +Beta release 0.1.11 (21 November, 2010) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is mainly a bug fix release: + + * Fixed a bug in random retrieval (since 0.1.8). It occurs when reads + are retrieved from a small region containing no reads. + + * Fixed a bug in pileup (since 0.1.9). The bug causes an assertion + failure when the first CIGAR operation is a deletion. + + * Improved fault tolerance in remote access. + +One minor feature has been implemented in bcftools: + + * Added a reference-free variant calling mode. In this mode, a site is + regarded as a variant iff the sample(s) contains two or more alleles; + the meaning of the QUAL field in the VCF output is changed + accordingly. Effectively, the reference allele is irrelevant to the + result in the new mode, although the reference sequence has to be + used in realignment when SAMtools computes genotype likelihoods. + +In addition, since 0.1.10, the `pileup' command has been deprecated by +`mpileup' which is more powerful and more accurate. The `pileup' command +will not be removed in the next few releases, but new features will not +be added. + +(0.1.11: 21 November 2010, r851) + + + +Beta Release 0.1.10 (16 November, 2010) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This release is featured as the first major improvement to the indel +caller. The method is similar to the old one implemented in the pileup +command, but the details are handled more carefully both in theory and +in practice. As a result, the new indel caller usually gives more +accurate indel calls, though at the cost of sensitivity. The caller is +implemented in the mpileup command and is invoked by default. It works +with multiple samples. 
+ +Other notable changes: + + * With the -r option, the calmd command writes the difference between + the original base quality and the BAQ capped base quality at the BQ + tag but does not modify the base quality. Please use -Ar to overwrite + the original base quality (the 0.1.9 behavior). + + * Allow to set a maximum per-sample read depth to reduce memory. In + 0.1.9, most of memory is wasted for the ultra high read depth in some + regions (e.g. the chr1 centromere). + + * Optionally write per-sample read depth and per-sample strand bias + P-value. + + * Compute equal-tail (Bayesian) credible interval of site allele + frequency at the CI95 VCF annotation. + + * Merged the vcfutils.pl varFilter and filter4vcf for better SNP/indel + filtering. + +(0.1.10: 16 November 2010, r829) + + + +Beta Release 0.1.9 (27 October, 2010) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This release is featured as the first major improvement to the samtools' +SNP caller. It comes with a revised MAQ error model, the support of +multi-sample SNP calling and the computation of base alignment quality +(BAQ). + +The revised MAQ error model is based on the original model. It solves an +issue of miscalling SNPs in repetitive regions. Although such SNPs can +usually be filtered at a later step, they mess up unfiltered calls. This +is a theoretical flaw in the original model. The revised MAQ model +deprecates the original MAQ model and the simplified SOAPsnp model. + +Multi-sample SNP calling is separated in two steps. The first is done by +samtools mpileup and the second by a new program, bcftools, which is +included in the samtools source code tree. Multi-sample SNP calling also +works for single sample and has the advantage of enabling more powerful +filtration. It is likely to deprecate pileup in future once a proper +indel calling method is implemented. + +BAQ is the Phred-scaled probability of a read base being wrongly +aligned. 
Capping base quality by BAQ has been shown to be very effective +in suppressing false SNPs caused by misalignments around indels or in +low-complexity regions with acceptable compromise on computation +time. This strategy is highly recommended and can be used with other SNP +callers as well. + +In addition to the three major improvements, other notable changes are: + + * Changes to the pileup format. A reference skip (the N CIGAR operator) + is shown as '<' or '>' depending on the strand. Tview is also changed + accordingly. + + * Accelerated pileup. The plain pileup is about 50% faster. + + * Regional merge. The merge command now accepts a new option to merge + files in a specified region. + + * Fixed a bug in bgzip and razip which causes source files to be + deleted even if option -c is applied. + + * In APIs, propagate errors to downstream callers and make samtools + return non-zero values once errors occur. + +(0.1.9: 27 October 2010, r783) + + + +Beta Release 0.1.8 (11 July, 2010) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable functional changes: + + * Added the `reheader' command which replaces a BAM header with a new + header. This command is much faster than replacing header by + BAM->SAM->BAM conversions. + + * Added the `mpileup' command which computes the pileup of multiple + alignments. + + * The `index' command now stores the number of mapped and unmapped + reads in the index file. This information can be retrieved quickly by + the new `idxstats' command. + + * By default, pileup used the SOAPsnp model for SNP calling. This + avoids the floating overflow in the MAQ model which leads to spurious + calls in repetitive regions, although these calls will be immediately + filtered by varFilter. + + * The `tview' command now correctly handles CIGARs like 7I10M and + 10M1P1I10M which cause assertion failure in earlier versions. + + * Tview accepts a region like `=10,000' where `=' stands for the + current sequence name. 
This saves typing for long sequence names. + + * Added the `-d' option to `pileup' which avoids slow indel calling + in ultradeep regions by subsampling reads locally. + + * Added the `-R' option to `view' which retrieves alignments in read + groups listed in the specified file. + +Performance improvements: + + * The BAM->SAM conversion is up to twice faster, depending on the + characteristic of the input. + + * Parsing SAM headers with a lot of reference sequences is now much + faster. + + * The number of lseek() calls per query is reduced when the query + region contains no read alignments. + +Bug fixes: + + * Fixed an issue in the indel caller that leads to miscall of indels. + Note that this solution may not work well when the sequencing indel + error rate is higher than the rate of SNPs. + + * Fixed another issue in the indel caller which may lead to incorrect + genotype. + + * Fixed a bug in `sort' when option `-o' is applied. + + * Fixed a bug in `view -r'. + +APIs and other changes: + + * Added iterator interfaces to random access and pileup. The callback + interfaces directly call the iterator interfaces. + + * The BGZF blocks holding the BAM header are independent of alignment + BGZF blocks. Alignment records shorter than 64kB are guaranteed to be + fully contained in one BGZF block. This change is fully compatible + with the old version of samtools/picard. + +Changes in other utilities: + + * Updated export2sam.pl by Chris Saunders. + + * Improved the sam2vcf.pl script. + + * Added a Python version of varfilter.py by Aylwyn Scally. + +(0.1.8: 11 July 2010, r613) + + + +Beta Release 0.1.7 (10 November, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes: + + * Improved the indel caller in complex scenarios, in particular for + long reads. The indel caller is now able to make reasonable indel + calls from Craig Venter capillary reads. + + * Rewrote single-end duplicate removal with improved + performance. Paired-end reads are not touched. 
+ + * Duplicate removal is now library aware. Samtools removes potential + PCR/optical duplicates inside a library rather than across libraries. + + * SAM header is now fully parsed, although this functionality is not + used in merging and so on. + + * In samtools merge, optionally take the input file name as RG-ID and + attach the RG tag to each alignment. + + * Added FTP support in the RAZF library. RAZF-compressed reference + sequence can be retrieved remotely. + + * Improved network support for Win32. + + * Samtools sort and merge are now stable. + +Changes in other utilities: + + * Implemented sam2vcf.pl that converts the pileup format to the VCF + format. + + * This release of samtools is known to work with the latest + Bio-Samtools Perl module. + +(0.1.7: 10 November 2009, r510) + + + +Beta Release 0.1.6 (2 September, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes: + + * In tview, do not show a blank screen when no reads mapped to the + corresponding region. + + * Implemented native HTTP support in the BGZF library. Samtools is now + able to directly open a BAM file on HTTP. HTTP proxy is also + supported via the "http_proxy" environmental variable. + + * Samtools is now compatible with the MinGW (win32) compiler and the + PDCurses library. + + * The calmd (or fillmd) command now calculates the NM tag and replaces + MD tags if they are wrong. + + * The view command now recognizes and optionally prints FLAG in HEXs or + strings to make a SAM file more friendly to human eyes. This is a + samtools-C extension, not implemented in Picard for the time + being. Please type `samtools view -?' for more information. + + * BAM files now have an end-of-file (EOF) marker to facilitate + truncation detection. A warning will be given if an on-disk BAM file + does not have this marker. The warning will be seen on BAM files + generated by an older version of samtools. It does NO harm. 
+ + * New key bindings in tview: `r' to show read names and `s' to show + reference skip (N operation) as deletions. + + * Fixed a bug in `samtools merge -n'. + + * Samtools merge now optionally copies the header of a user specified + SAM file to the resultant BAM output. + + * Samtools pileup/tview works with a CIGAR whose first or last + operation is an indel. + + * Fixed a bug in bam_aux_get(). + + +Changes in other utilities: + + * Fixed wrong FLAG in maq2sam. + + +(0.1.6: 2 September 2009, r453) + + + +Beta Release 0.1.5 (7 July, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes: + + * Support opening a BAM alignment on FTP. Users can now use "tview" to + view alignments at the NCBI ftp site. Please read the manual for more + information. + + * In library, propagate errors rather than exit or complain assertion + failure. + + * Simplified the building system and fixed compiling errors caused by + zlib<1.2.2.1. + + * Fixed an issue about lost header information when a SAM is imported + with "view -t". + + * Implemented "samtools.pl varFilter" which filters both SNPs and short + indels. This command replaces "indelFilter". + + * Implemented "samtools.pl pileup2fq" to generate FASTQ consensus from + pileup output. + + * In pileup, cap mapping quality at 60. This helps filtering when + different aligners are in use. + + * In pileup, allow to output variant sites only. + + * Made pileup generate correct calls in repetitive region. At the same + time, I am considering to implement a simplified model in SOAPsnp, + although this has not happened yet. + + * In view, added '-u' option to output BAM without compression. This + option is preferred when the output is piped to other commands. + + * In view, added '-l' and '-r' to get the alignments for one library or + read group. The "@RG" header lines are now partially parsed. + + * Do not include command line utilities to libbam.a. + + * Fixed memory leaks in pileup and bam_view1(). 
+ + * Made faidx more tolerant to empty lines right before or after FASTA > + lines. + + +Changes in other utilities: + + * Updated novo2sam.pl by Colin Hercus, the key developer of novoalign. + + +This release involves several modifications to the key code base which +may potentially introduce new bugs even though we have tried to minimize +this by testing on several examples. Please let us know if you catch +bugs. + +(0.1.5: 7 July 2009, r373) + + + +Beta Release 0.1.4 (21 May, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes: + + * Added the 'rmdupse' command: removing duplicates for SE reads. + + * Fixed a critical bug in the indel caller: clipped alignments are not + processed correctly. + + * Fixed a bug in the tview: gapped alignment may be incorrectly + displayed. + + * Unified the interface to BAM and SAM I/O. This is done by + implementing a wrapper on top of the old APIs and therefore old APIs + are still valid. The new I/O APIs also recognize the @SQ header + lines. + + * Generate the MD tag. + + * Generate "=" bases. However, the indel caller will not work when "=" + bases are present. + + * Enhanced support of color-read display (by Nils Homer). + + * Implemented the GNU building system. However, currently the building + system does not generate libbam.a. We will improve this later. For + the time being, `make -f Makefile.generic' is preferred. + + * Fixed a minor bug in pileup: the first read in a chromosome may be + skipped. + + * Fixed bugs in bam_aux.c. These bugs do not affect other components as + they were not used previously. + + * Output the 'SM' tag from maq2sam. + +(0.1.4: 21 May 2009, r297) + + + +Beta Release 0.1.3 (15 April, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in SAMtools: + + * SAMtools is more consistent with the specification: a) '*' in the + QUAL field is allowed; b) the field separator is TAB only and SPACE + is treated as a character in a field; c) empty header is allowed. 
+ + * Implemented GLFv3 support in pileup. + + * Fixed a severe bug in fixmate: strand information is wrongly + overwritten. + + * Fixed a bug in alignment retrieval: alignments bridging n*16384bp are + not correctly retrieved sometimes. + + * Fixed a bug in rmdup: segfault if unmapped reads are present. + + * Moved indel_filter.pl to samtools.pl and improved the filtering by + checking the actual number of alignments containing indels. The indel + pileup line is also changed a little to make this filtration easier. + + * Fixed a minor bug in indexing: the bin number of an unmapped read is + wrongly calculated. + + * Added `flagstat' command to show statistics on the FLAG field. + + * Improved indel caller by setting the maximum window size in local + realignment. + +Changes in other utilities: + + * Fixed a bug in maq2sam: a tag name is obsolete. + + * Improvement to wgsim: a) added support for SOLiD read simulation; b) + show the number of substitutions/indels/errors in read name; c) + considerable code clean up. + + * Various converters: improved functionality in general. + + * Updated the example SAM due to the previous bug in fixmate. + +(0.1.3: 15 April 2009, r227) + + + +Beta Release 0.1.2 (28 January, 2009) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in SAMtools: + + * Implemented a Bayesian indel caller. The new caller generates scores + and genotype and is potentially more accurate than Maq's indel + caller. The pileup format is also changed accordingly. + + * Implemented rmdup command: remove potential PCR duplicates. Note that + this command ONLY works for FR orientation and requires that ISIZE is + correctly set. + + * Added fixmate command: fill in mate coordinates, ISIZE and mate + related flags from a name-sorted alignment. + + * Fixed a bug in indexing: reads bridging 16x kbp were not retrieved. + + * Allow to select reads shown in the pileup output with a mask. + + * Generate GLFv2 from pileup. 
+ + * Added two more flags for flagging PCR/optical duplicates and for QC + failure. + + * Fixed a bug in sort command: name sorting for large alignment did not + work. + + * Allow to completely disable RAZF (using Makefile.lite) as some people + have problems compiling it. + + * Fixed a bug in import command when there are reads without + coordinates. + + * Fixed a bug in tview: clipping broke the alignment viewer. + + * Fixed a compiling error when _NO_CURSES is applied. + + * Fixed a bug in merge command. + +Changes in other utilities: + + * Added wgsim, a paired-end reads simulator. Wgsim was adapted from + maq's reads simulator. Colin Hercus further improved it to allow + longer indels. + + * Added wgsim_eval.pl, a script that evaluates the accuracy of + alignment on reads generated by wgsim. + + * Added soap2sam.pl, a SOAP2->SAM converter. This converter does not + work properly when multiple hits are output. + + * Added bowtie2sam.pl, a Bowtie->SAM converter. Only the top hit will + be retained when multiple hits are present. + + * Fixed a bug in export2sam.pl for QC reads. + + * Support RG tag at MAQ->SAM converter. + + * Added novo2sam.pl, a NovoAlign->SAM converter. Multiple hits and + indel are not properly handled, though. + + * Added zoom2sam.pl, a ZOOM->SAM converter. It only works with the + default Illumina output. + +(0.1.2: 28 January 2009; r116) + + + +Beta Release 0.1.1 (22 December, 2008) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is the first public release of samtools. For more information, +please check the manual page `samtools.1' and the samtools website +http://samtools.sourceforge.net
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bam.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,474 @@ +#include <stdio.h> +#include <ctype.h> +#include <errno.h> +#include <assert.h> +#include "bam.h" +#include "bam_endian.h" +#include "kstring.h" +#include "sam_header.h" + +int bam_is_be = 0, bam_verbose = 2, bam_no_B = 0; +char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0"; + +/************************** + * CIGAR related routines * + **************************/ + +uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar) +{ + int k, end = c->pos; + for (k = 0; k < c->n_cigar; ++k) { + int op = bam_cigar_op(cigar[k]); + int len = bam_cigar_oplen(cigar[k]); + if (op == BAM_CBACK) { // move backward + int l, u, v; + if (k == c->n_cigar - 1) break; // skip trailing 'B' + for (l = k - 1, u = v = 0; l >= 0; --l) { + int op1 = bam_cigar_op(cigar[l]); + int len1 = bam_cigar_oplen(cigar[l]); + if (bam_cigar_type(op1)&1) { // consume query + if (u + len1 >= len) { // stop + if (bam_cigar_type(op1)&2) v += len - u; + break; + } else u += len1; + } + if (bam_cigar_type(op1)&2) v += len1; + } + end = l < 0? 
c->pos : end - v; + } else if (bam_cigar_type(op)&2) end += bam_cigar_oplen(cigar[k]); + } + return end; +} + +int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar) +{ + uint32_t k; + int32_t l = 0; + for (k = 0; k < c->n_cigar; ++k) + if (bam_cigar_type(bam_cigar_op(cigar[k]))&1) + l += bam_cigar_oplen(cigar[k]); + return l; +} + +/******************** + * BAM I/O routines * + ********************/ + +bam_header_t *bam_header_init() +{ + bam_is_be = bam_is_big_endian(); + return (bam_header_t*)calloc(1, sizeof(bam_header_t)); +} + +void bam_header_destroy(bam_header_t *header) +{ + int32_t i; + extern void bam_destroy_header_hash(bam_header_t *header); + if (header == 0) return; + if (header->target_name) { + for (i = 0; i < header->n_targets; ++i) + free(header->target_name[i]); + free(header->target_name); + free(header->target_len); + } + free(header->text); + if (header->dict) sam_header_free(header->dict); + if (header->rg2lib) sam_tbl_destroy(header->rg2lib); + bam_destroy_header_hash(header); + free(header); +} + +bam_header_t *bam_header_read(bamFile fp) +{ + bam_header_t *header; + char buf[4]; + int magic_len; + int32_t i = 1, name_len; + // check EOF + i = bgzf_check_EOF(fp); + if (i < 0) { + // If the file is a pipe, checking the EOF marker will *always* fail + // with ESPIPE. Suppress the error message in this case. + if (errno != ESPIPE) perror("[bam_header_read] bgzf_check_EOF"); + } + else if (i == 0) fprintf(stderr, "[bam_header_read] EOF marker is absent. 
The input is probably truncated.\n"); + // read "BAM1" + magic_len = bam_read(fp, buf, 4); + if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) { + fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n"); + return 0; + } + header = bam_header_init(); + // read plain text and the number of reference sequences + bam_read(fp, &header->l_text, 4); + if (bam_is_be) bam_swap_endian_4p(&header->l_text); + header->text = (char*)calloc(header->l_text + 1, 1); + bam_read(fp, header->text, header->l_text); + bam_read(fp, &header->n_targets, 4); + if (bam_is_be) bam_swap_endian_4p(&header->n_targets); + // read reference sequence names and lengths + header->target_name = (char**)calloc(header->n_targets, sizeof(char*)); + header->target_len = (uint32_t*)calloc(header->n_targets, 4); + for (i = 0; i != header->n_targets; ++i) { + bam_read(fp, &name_len, 4); + if (bam_is_be) bam_swap_endian_4p(&name_len); + header->target_name[i] = (char*)calloc(name_len, 1); + bam_read(fp, header->target_name[i], name_len); + bam_read(fp, &header->target_len[i], 4); + if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]); + } + return header; +} + +int bam_header_write(bamFile fp, const bam_header_t *header) +{ + char buf[4]; + int32_t i, name_len, x; + // write "BAM1" + strncpy(buf, "BAM\001", 4); + bam_write(fp, buf, 4); + // write plain text and the number of reference sequences + if (bam_is_be) { + x = bam_swap_endian_4(header->l_text); + bam_write(fp, &x, 4); + if (header->l_text) bam_write(fp, header->text, header->l_text); + x = bam_swap_endian_4(header->n_targets); + bam_write(fp, &x, 4); + } else { + bam_write(fp, &header->l_text, 4); + if (header->l_text) bam_write(fp, header->text, header->l_text); + bam_write(fp, &header->n_targets, 4); + } + // write sequence names and lengths + for (i = 0; i != header->n_targets; ++i) { + char *p = header->target_name[i]; + name_len = strlen(p) + 1; + if (bam_is_be) { + x = bam_swap_endian_4(name_len); + 
bam_write(fp, &x, 4); + } else bam_write(fp, &name_len, 4); + bam_write(fp, p, name_len); + if (bam_is_be) { + x = bam_swap_endian_4(header->target_len[i]); + bam_write(fp, &x, 4); + } else bam_write(fp, &header->target_len[i], 4); + } + bgzf_flush(fp); + return 0; +} + +static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data) +{ + uint8_t *s; + uint32_t i, *cigar = (uint32_t*)(data + c->l_qname); + s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2; + for (i = 0; i < c->n_cigar; ++i) bam_swap_endian_4p(&cigar[i]); + while (s < data + data_len) { + uint8_t type; + s += 2; // skip key + type = toupper(*s); ++s; // skip type + if (type == 'C' || type == 'A') ++s; + else if (type == 'S') { bam_swap_endian_2p(s); s += 2; } + else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; } + else if (type == 'D') { bam_swap_endian_8p(s); s += 8; } + else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; } + else if (type == 'B') { + int32_t n, Bsize = bam_aux_type2size(*s); + memcpy(&n, s + 1, 4); + if (1 == Bsize) { + } else if (2 == Bsize) { + for (i = 0; i < n; i += 2) + bam_swap_endian_2p(s + 5 + i); + } else if (4 == Bsize) { + for (i = 0; i < n; i += 4) + bam_swap_endian_4p(s + 5 + i); + } + bam_swap_endian_4p(s+1); + } + } +} + +int bam_read1(bamFile fp, bam1_t *b) +{ + bam1_core_t *c = &b->core; + int32_t block_len, ret, i; + uint32_t x[8]; + + assert(BAM_CORE_SIZE == 32); + if ((ret = bam_read(fp, &block_len, 4)) != 4) { + if (ret == 0) return -1; // normal end-of-file + else return -2; // truncated + } + if (bam_read(fp, x, BAM_CORE_SIZE) != BAM_CORE_SIZE) return -3; + if (bam_is_be) { + bam_swap_endian_4p(&block_len); + for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i); + } + c->tid = x[0]; c->pos = x[1]; + c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff; + c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff; + c->l_qseq = x[4]; + c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7]; + b->data_len = 
block_len - BAM_CORE_SIZE; + if (b->m_data < b->data_len) { + b->m_data = b->data_len; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } + if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4; + b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2; + if (bam_is_be) swap_endian_data(c, b->data_len, b->data); + if (bam_no_B) bam_remove_B(b); + return 4 + block_len; +} + +inline int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data) +{ + uint32_t x[8], block_len = data_len + BAM_CORE_SIZE, y; + int i; + assert(BAM_CORE_SIZE == 32); + x[0] = c->tid; + x[1] = c->pos; + x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | c->l_qname; + x[3] = (uint32_t)c->flag<<16 | c->n_cigar; + x[4] = c->l_qseq; + x[5] = c->mtid; + x[6] = c->mpos; + x[7] = c->isize; + bgzf_flush_try(fp, 4 + block_len); + if (bam_is_be) { + for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i); + y = block_len; + bam_write(fp, bam_swap_endian_4p(&y), 4); + swap_endian_data(c, data_len, data); + } else bam_write(fp, &block_len, 4); + bam_write(fp, x, BAM_CORE_SIZE); + bam_write(fp, data, data_len); + if (bam_is_be) swap_endian_data(c, data_len, data); + return 4 + block_len; +} + +int bam_write1(bamFile fp, const bam1_t *b) +{ + return bam_write1_core(fp, &b->core, b->data_len, b->data); +} + +char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of) +{ + uint8_t *s = bam1_seq(b), *t = bam1_qual(b); + int i; + const bam1_core_t *c = &b->core; + kstring_t str; + str.l = str.m = 0; str.s = 0; + + kputsn(bam1_qname(b), c->l_qname-1, &str); kputc('\t', &str); + if (of == BAM_OFDEC) { kputw(c->flag, &str); kputc('\t', &str); } + else if (of == BAM_OFHEX) ksprintf(&str, "0x%x\t", c->flag); + else { // BAM_OFSTR + for (i = 0; i < 16; ++i) + if ((c->flag & 1<<i) && bam_flag2char_table[i]) + kputc(bam_flag2char_table[i], &str); + kputc('\t', &str); + } + if (c->tid < 0) kputsn("*\t", 2, &str); + else { + if 
(header) kputs(header->target_name[c->tid] , &str); + else kputw(c->tid, &str); + kputc('\t', &str); + } + kputw(c->pos + 1, &str); kputc('\t', &str); kputw(c->qual, &str); kputc('\t', &str); + if (c->n_cigar == 0) kputc('*', &str); + else { + uint32_t *cigar = bam1_cigar(b); + for (i = 0; i < c->n_cigar; ++i) { + kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str); + kputc(bam_cigar_opchr(cigar[i]), &str); + } + } + kputc('\t', &str); + if (c->mtid < 0) kputsn("*\t", 2, &str); + else if (c->mtid == c->tid) kputsn("=\t", 2, &str); + else { + if (header) kputs(header->target_name[c->mtid], &str); + else kputw(c->mtid, &str); + kputc('\t', &str); + } + kputw(c->mpos + 1, &str); kputc('\t', &str); kputw(c->isize, &str); kputc('\t', &str); + if (c->l_qseq) { + for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str); + kputc('\t', &str); + if (t[0] == 0xff) kputc('*', &str); + else for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str); + } else kputsn("*\t*", 3, &str); + s = bam1_aux(b); + while (s < b->data + b->data_len) { + uint8_t type, key[2]; + key[0] = s[0]; key[1] = s[1]; + s += 2; type = *s; ++s; + kputc('\t', &str); kputsn((char*)key, 2, &str); kputc(':', &str); + if (type == 'A') { kputsn("A:", 2, &str); kputc(*s, &str); ++s; } + else if (type == 'C') { kputsn("i:", 2, &str); kputw(*s, &str); ++s; } + else if (type == 'c') { kputsn("i:", 2, &str); kputw(*(int8_t*)s, &str); ++s; } + else if (type == 'S') { kputsn("i:", 2, &str); kputw(*(uint16_t*)s, &str); s += 2; } + else if (type == 's') { kputsn("i:", 2, &str); kputw(*(int16_t*)s, &str); s += 2; } + else if (type == 'I') { kputsn("i:", 2, &str); kputuw(*(uint32_t*)s, &str); s += 4; } + else if (type == 'i') { kputsn("i:", 2, &str); kputw(*(int32_t*)s, &str); s += 4; } + else if (type == 'f') { ksprintf(&str, "f:%g", *(float*)s); s += 4; } + else if (type == 'd') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; } + else if (type == 'Z' || type == 'H') { kputc(type, &str); kputc(':', 
&str); while (*s) kputc(*s++, &str); ++s; } + else if (type == 'B') { + uint8_t sub_type = *(s++); + int32_t n; + memcpy(&n, s, 4); + s += 4; // no point to the start of the array + kputc(type, &str); kputc(':', &str); kputc(sub_type, &str); // write the typing + for (i = 0; i < n; ++i) { + kputc(',', &str); + if ('c' == sub_type || 'c' == sub_type) { kputw(*(int8_t*)s, &str); ++s; } + else if ('C' == sub_type) { kputw(*(uint8_t*)s, &str); ++s; } + else if ('s' == sub_type) { kputw(*(int16_t*)s, &str); s += 2; } + else if ('S' == sub_type) { kputw(*(uint16_t*)s, &str); s += 2; } + else if ('i' == sub_type) { kputw(*(int32_t*)s, &str); s += 4; } + else if ('I' == sub_type) { kputuw(*(uint32_t*)s, &str); s += 4; } + else if ('f' == sub_type) { ksprintf(&str, "%g", *(float*)s); s += 4; } + } + } + } + return str.s; +} + +char *bam_format1(const bam_header_t *header, const bam1_t *b) +{ + return bam_format1_core(header, b, BAM_OFDEC); +} + +void bam_view1(const bam_header_t *header, const bam1_t *b) +{ + char *s = bam_format1(header, b); + puts(s); + free(s); +} + +int bam_validate1(const bam_header_t *header, const bam1_t *b) +{ + char *s; + + if (b->core.tid < -1 || b->core.mtid < -1) return 0; + if (header && (b->core.tid >= header->n_targets || b->core.mtid >= header->n_targets)) return 0; + + if (b->data_len < b->core.l_qname) return 0; + s = memchr(bam1_qname(b), '\0', b->core.l_qname); + if (s != &bam1_qname(b)[b->core.l_qname-1]) return 0; + + // FIXME: Other fields could also be checked, especially the auxiliary data + + return 1; +} + +// FIXME: we should also check the LB tag associated with each alignment +const char *bam_get_library(bam_header_t *h, const bam1_t *b) +{ + const uint8_t *rg; + if (h->dict == 0) h->dict = sam_header_parse2(h->text); + if (h->rg2lib == 0) h->rg2lib = sam_header2tbl(h->dict, "RG", "ID", "LB"); + rg = bam_aux_get(b, "RG"); + return (rg == 0)? 
0 : sam_tbl_get(h->rg2lib, (const char*)(rg + 1)); +} + +/************ + * Remove B * + ************/ + +int bam_remove_B(bam1_t *b) +{ + int i, j, end_j, k, l, no_qual; + uint32_t *cigar, *new_cigar; + uint8_t *seq, *qual, *p; + // test if removal is necessary + if (b->core.flag & BAM_FUNMAP) return 0; // unmapped; do nothing + cigar = bam1_cigar(b); + for (k = 0; k < b->core.n_cigar; ++k) + if (bam_cigar_op(cigar[k]) == BAM_CBACK) break; + if (k == b->core.n_cigar) return 0; // no 'B' + if (bam_cigar_op(cigar[0]) == BAM_CBACK) goto rmB_err; // cannot be removed + // allocate memory for the new CIGAR + if (b->data_len + (b->core.n_cigar + 1) * 4 > b->m_data) { // not enough memory + b->m_data = b->data_len + b->core.n_cigar * 4; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + cigar = bam1_cigar(b); // after realloc, cigar may be changed + } + new_cigar = (uint32_t*)(b->data + (b->m_data - b->core.n_cigar * 4)); // from the end of b->data + // the core loop + seq = bam1_seq(b); qual = bam1_qual(b); + no_qual = (qual[0] == 0xff); // test whether base quality is available + i = j = 0; end_j = -1; + for (k = l = 0; k < b->core.n_cigar; ++k) { + int op = bam_cigar_op(cigar[k]); + int len = bam_cigar_oplen(cigar[k]); + if (op == BAM_CBACK) { // the backward operation + int t, u; + if (k == b->core.n_cigar - 1) break; // ignore 'B' at the end of CIGAR + if (len > j) goto rmB_err; // an excessively long backward + for (t = l - 1, u = 0; t >= 0; --t) { // look back + int op1 = bam_cigar_op(new_cigar[t]); + int len1 = bam_cigar_oplen(new_cigar[t]); + if (bam_cigar_type(op1)&1) { // consume the query + if (u + len1 >= len) { // stop + new_cigar[t] -= (len - u) << BAM_CIGAR_SHIFT; + break; + } else u += len1; + } + } + if (bam_cigar_oplen(new_cigar[t]) == 0) --t; // squeeze out the zero-length operation + l = t + 1; + end_j = j; j -= len; + } else { // other CIGAR operations + new_cigar[l++] = cigar[k]; + if (bam_cigar_type(op)&1) { // consume 
the query + if (i != j) { // no need to copy if i == j + int u, c, c0; + for (u = 0; u < len; ++u) { // construct the consensus + c = bam1_seqi(seq, i+u); + if (j + u < end_j) { // in an overlap + c0 = bam1_seqi(seq, j+u); + if (c != c0) { // a mismatch; choose the better base + if (qual[j+u] < qual[i+u]) { // the base in the 2nd segment is better + bam1_seq_seti(seq, j+u, c); + qual[j+u] = qual[i+u] - qual[j+u]; + } else qual[j+u] -= qual[i+u]; // the 1st is better; reduce base quality + } else qual[j+u] = qual[j+u] > qual[i+u]? qual[j+u] : qual[i+u]; + } else { // not in an overlap; copy over + bam1_seq_seti(seq, j+u, c); + qual[j+u] = qual[i+u]; + } + } + } + i += len, j += len; + } + } + } + if (no_qual) qual[0] = 0xff; // in very rare cases, this may be modified + // merge adjacent operations if possible + for (k = 1; k < l; ++k) + if (bam_cigar_op(new_cigar[k]) == bam_cigar_op(new_cigar[k-1])) + new_cigar[k] += new_cigar[k-1] >> BAM_CIGAR_SHIFT << BAM_CIGAR_SHIFT, new_cigar[k-1] &= 0xf; + // kill zero length operations + for (k = i = 0; k < l; ++k) + if (new_cigar[k] >> BAM_CIGAR_SHIFT) + new_cigar[i++] = new_cigar[k]; + l = i; + // update b + memcpy(cigar, new_cigar, l * 4); // set CIGAR + p = b->data + b->core.l_qname + l * 4; + memmove(p, seq, (j+1)>>1); p += (j+1)>>1; // set SEQ + memmove(p, qual, j); p += j; // set QUAL + memmove(p, bam1_aux(b), b->l_aux); p += b->l_aux; // set optional fields + b->core.n_cigar = l, b->core.l_qseq = j; // update CIGAR length and query length + b->data_len = p - b->data; // update record length + return 0; + +rmB_err: + b->core.flag |= BAM_FUNMAP; + return -1; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bam.h Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,793 @@ +/* The MIT License + + Copyright (c) 2008-2010 Genome Research Ltd (GRL). + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li <lh3@sanger.ac.uk> */ + +#ifndef BAM_BAM_H +#define BAM_BAM_H + +/*! + @header + + BAM library provides I/O and various operations on manipulating files + in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map) + format. It now supports importing from or exporting to SAM, sorting, + merging, generating pileup, and quickly retrieval of reads overlapped + with a specified region. + + @copyright Genome Research Ltd. + */ + +#define BAM_VERSION "0.1.19-44428cd" + +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <stdio.h> + +#ifndef BAM_LITE +#define BAM_VIRTUAL_OFFSET16 +#include "bgzf.h" +/*! 
@abstract BAM file handler */ +typedef BGZF *bamFile; +#define bam_open(fn, mode) bgzf_open(fn, mode) +#define bam_dopen(fd, mode) bgzf_fdopen(fd, mode) +#define bam_close(fp) bgzf_close(fp) +#define bam_read(fp, buf, size) bgzf_read(fp, buf, size) +#define bam_write(fp, buf, size) bgzf_write(fp, buf, size) +#define bam_tell(fp) bgzf_tell(fp) +#define bam_seek(fp, pos, dir) bgzf_seek(fp, pos, dir) +#else +#define BAM_TRUE_OFFSET +#include <zlib.h> +typedef gzFile bamFile; +#define bam_open(fn, mode) gzopen(fn, mode) +#define bam_dopen(fd, mode) gzdopen(fd, mode) +#define bam_close(fp) gzclose(fp) +#define bam_read(fp, buf, size) gzread(fp, buf, size) +/* no bam_write/bam_tell/bam_seek() here */ +#endif + +/*! @typedef + @abstract Structure for the alignment header. + @field n_targets number of reference sequences + @field target_name names of the reference sequences + @field target_len lengths of the referene sequences + @field dict header dictionary + @field hash hash table for fast name lookup + @field rg2lib hash table for @RG-ID -> LB lookup + @field l_text length of the plain text in the header + @field text plain text + + @discussion Field hash points to null by default. It is a private + member. + */ +typedef struct { + int32_t n_targets; + char **target_name; + uint32_t *target_len; + void *dict, *hash, *rg2lib; + uint32_t l_text, n_text; + char *text; +} bam_header_t; + +/*! @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */ +#define BAM_FPAIRED 1 +/*! @abstract the read is mapped in a proper pair */ +#define BAM_FPROPER_PAIR 2 +/*! @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */ +#define BAM_FUNMAP 4 +/*! @abstract the mate is unmapped */ +#define BAM_FMUNMAP 8 +/*! @abstract the read is mapped to the reverse strand */ +#define BAM_FREVERSE 16 +/*! @abstract the mate is mapped to the reverse strand */ +#define BAM_FMREVERSE 32 +/*! @abstract this is read1 */ +#define BAM_FREAD1 64 +/*! 
  @abstract this is read2 */
#define BAM_FREAD2       128
/*! @abstract not primary alignment */
#define BAM_FSECONDARY   256
/*! @abstract QC failure */
#define BAM_FQCFAIL      512
/*! @abstract optical or PCR duplicate */
#define BAM_FDUP        1024

#define BAM_OFDEC          0
#define BAM_OFHEX          1
#define BAM_OFSTR          2

/*! @abstract default mask for pileup */
#define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)

#define BAM_CORE_SIZE sizeof(bam1_core_t)

/**
 * Describes how a CIGAR operation and its length are packed in a 32-bit
 * integer: the operation occupies the low BAM_CIGAR_SHIFT bits and the
 * length the remaining high bits.
 */
#define BAM_CIGAR_SHIFT 4
#define BAM_CIGAR_MASK  ((1 << BAM_CIGAR_SHIFT) - 1)

/*
  CIGAR operations.
 */
/*! @abstract CIGAR: M = match or mismatch */
#define BAM_CMATCH      0
/*! @abstract CIGAR: I = insertion to the reference */
#define BAM_CINS        1
/*! @abstract CIGAR: D = deletion from the reference */
#define BAM_CDEL        2
/*! @abstract CIGAR: N = skip on the reference (e.g. spliced alignment) */
#define BAM_CREF_SKIP   3
/*! @abstract CIGAR: S = clip on the read with clipped sequence
  present in qseq */
#define BAM_CSOFT_CLIP  4
/*! @abstract CIGAR: H = clip on the read with clipped sequence trimmed off */
#define BAM_CHARD_CLIP  5
/*! @abstract CIGAR: P = padding */
#define BAM_CPAD        6
/*! @abstract CIGAR: equals = match */
#define BAM_CEQUAL      7
/*! @abstract CIGAR: X = mismatch */
#define BAM_CDIFF       8
/*! @abstract CIGAR: B, the last character of BAM_CIGAR_STR */
#define BAM_CBACK       9

/*! @abstract operation characters, indexed by the operation code above */
#define BAM_CIGAR_STR  "MIDNSHP=XB"
/*! @abstract packed table holding two bits per operation code; read through bam_cigar_type() */
#define BAM_CIGAR_TYPE 0x3C1A7

/*! @abstract operation code stored in the low bits of a packed CIGAR element */
#define bam_cigar_op(c) ((c)&BAM_CIGAR_MASK)
/*! @abstract operation length stored in the high bits of a packed CIGAR element */
#define bam_cigar_oplen(c) ((c)>>BAM_CIGAR_SHIFT)
/*! @abstract character from BAM_CIGAR_STR for a packed CIGAR element */
#define bam_cigar_opchr(c) (BAM_CIGAR_STR[bam_cigar_op(c)])
/*! @abstract pack length l and operation code o into one CIGAR element */
#define bam_cigar_gen(l, o) ((l)<<BAM_CIGAR_SHIFT|(o))
/*! @abstract two-bit entry of BAM_CIGAR_TYPE for operation code o */
#define bam_cigar_type(o) (BAM_CIGAR_TYPE>>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference

/*! @typedef
  @abstract Structure for core alignment information.
  @field  tid     chromosome ID, defined by bam_header_t
  @field  pos     0-based leftmost coordinate
  @field  bin     bin calculated by bam_reg2bin()
  @field  qual    mapping quality
  @field  l_qname length of the query name
  @field  flag    bitwise flag
  @field  n_cigar number of CIGAR operations
  @field  l_qseq  length of the query sequence (read)
 */
typedef struct {
    int32_t tid;
    int32_t pos;
    uint32_t bin:16, qual:8, l_qname:8;
    uint32_t flag:16, n_cigar:16;
    int32_t l_qseq;
    /* NOTE(review): the three fields below are not described by the original
     * documentation; presumably the mate's chromosome ID, the mate's position
     * and the insert size -- confirm against the BAM specification. */
    int32_t mtid;
    int32_t mpos;
    int32_t isize;
} bam1_core_t;

/*! @typedef
  @abstract Structure for one alignment.
  @field  core       core information about the alignment
  @field  l_aux      length of auxiliary data
  @field  data_len   current length of bam1_t::data
  @field  m_data     maximum length of bam1_t::data
  @field  data       all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux

  @discussion Notes:

   1. qname is NUL-terminated and core.l_qname includes the terminating '\0'.
   2. l_qseq is calculated from the total length of an alignment block
   on reading or from CIGAR.
   3. cigar data is encoded 4 bytes per CIGAR operation.
   4. seq is nybble-encoded according to bam_nt16_table.
 */
typedef struct {
    bam1_core_t core;
    int l_aux, data_len, m_data;
    uint8_t *data;
} bam1_t;

typedef struct __bam_iter_t *bam_iter_t;

/* 1 if the BAM_FREVERSE bit is set in the flag of alignment b, else 0 */
#define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0)
/* 1 if the BAM_FMREVERSE bit is set in the flag of alignment b, else 0 */
#define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0)

/*! @function
  @abstract  Get the CIGAR array
  @param  b  pointer to an alignment
  @return    pointer to the CIGAR array

  @discussion In the CIGAR array, each element is a 32-bit integer. The
  lower 4 bits give the CIGAR operation and the higher 28 bits keep the
  length of the operation. The array starts right after the query name
  inside bam1_t::data.
 */
#define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname))

/*!
  @function
  @abstract  Get the name of the query
  @param  b  pointer to an alignment
  @return    pointer to the name string, null terminated
 */
#define bam1_qname(b) ((char*)((b)->data))

/*! @function
  @abstract  Get query sequence
  @param  b  pointer to an alignment
  @return    pointer to sequence

  @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G,
  8 for T and 15 for N. Two bases are packed in one byte with the base
  at the higher 4 bits having smaller coordinate on the read. It is
  recommended to use bam1_seqi() macro to get the base.
 */
#define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname)

/*! @function
  @abstract  Get query quality
  @param  b  pointer to an alignment
  @return    pointer to quality string

  @discussion The quality string starts after the packed sequence, which
  occupies (l_qseq + 1) / 2 bytes.
 */
#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1))

/*! @function
  @abstract  Get a base on read
  @param  s  Query sequence returned by bam1_seq()
  @param  i  The i-th position, 0-based
  @return    4-bit integer representing the base.
 */
//#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf)
#define bam1_seqi(s, i) ((s)[(i)>>1] >> ((~(i)&1)<<2) & 0xf)

/*! @function
  @abstract  Set a base on read
  @param  s  Query sequence returned by bam1_seq()
  @param  i  The i-th position, 0-based
  @param  c  4-bit code to store; it is OR-ed in without masking

  @discussion The other base packed in the same byte is preserved: an even
  i writes the high nibble and an odd i writes the low nibble.
 */
#define bam1_seq_seti(s, i, c) ( (s)[(i)>>1] = ((s)[(i)>>1] & 0xf<<(((i)&1)<<2)) | (c)<<((~(i)&1)<<2) )

/*! @function
  @abstract  Get auxiliary data
  @param  b  pointer to an alignment
  @return    pointer to the concatenated auxiliary data

  @discussion The auxiliary data follows the query name, the CIGAR array,
  the packed sequence and the quality string inside bam1_t::data.
 */
#define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2)

#ifndef kroundup32
/*! @function
  @abstract  Round an integer to the next closest power-2 integer.
  @param  x  integer to be rounded (in place)
  @discussion x will be modified. The macro evaluates x several times, so
  x must be a plain lvalue without side effects.
 */
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif

/*!
  @abstract Whether the machine is big-endian; modified only in
  bam_header_init().
 */
extern int bam_is_be;

/*!
  @abstract Verbose level between 0 and 3; 0 is supposed to disable all
  debugging information, though this may not have been implemented.
 */
extern int bam_verbose;

/* NOTE(review): undocumented in the original header; presumably controls how
 * the CIGAR 'B' operation is handled -- confirm in the implementation. */
extern int bam_no_B;

/*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */
extern unsigned char bam_nt16_table[256];

/*! @abstract Table for converting a 4-bit encoded nucleotide to a letter. */
extern char *bam_nt16_rev_table;

/* NOTE(review): undocumented in the original header; bam2bcf.c indexes it with
 * a 4-bit base code and uses the result as an index into "ACGTN" -- confirm
 * the exact mapping in the implementation. */
extern char bam_nt16_nt4_table[];

#ifdef __cplusplus
extern "C" {
#endif

    /*********************
     * Low-level SAM I/O *
     *********************/

    /*! @abstract TAM file handler */
    typedef struct __tamFile_t *tamFile;

    /*!
      @abstract   Open a SAM file for reading, either uncompressed or compressed by gzip/zlib.
      @param  fn  SAM file name
      @return     SAM file handler
     */
    tamFile sam_open(const char *fn);

    /*!
      @abstract   Close a SAM file handler
      @param  fp  SAM file handler
     */
    void sam_close(tamFile fp);

    /*!
      @abstract      Read one alignment from a SAM file handler
      @param  fp     SAM file handler
      @param  header header information (ordered names of chromosomes)
      @param  b      read alignment; all members in b will be updated
      @return        0 if successful; otherwise negative
     */
    int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b);

    /*!
      @abstract       Read header information from a TAB-delimited list file.
      @param fn_list  file name for the list
      @return         a pointer to the header structure

      @discussion Each line in this file consists of a chromosome name and
      the length of that chromosome.
     */
    bam_header_t *sam_header_read2(const char *fn_list);

    /*!
      @abstract       Read header from a SAM file (if present)
      @param fp       SAM file handler
      @return         pointer to header struct; 0 if no @SQ lines available
     */
    bam_header_t *sam_header_read(tamFile fp);

    /*!
      @abstract       Parse @SQ lines and update a header struct
      @param h        pointer to the header struct to be updated
      @return         number of target sequences

      @discussion bam_header_t::{n_targets,target_len,target_name} will
      be destroyed in the first place.
     */
    int sam_header_parse(bam_header_t *h);
    int32_t bam_get_tid(const bam_header_t *header, const char *seq_name);

    /*!
      @abstract       Parse @RG lines and update a header struct
      @param h        pointer to the header struct to be updated
      @return         number of @RG lines

      @discussion bam_header_t::rg2lib will be destroyed in the first
      place.
     */
    int sam_header_parse_rg(bam_header_t *h);

/* sam_write1() is an alias of bam_view1() */
#define sam_write1(header, b) bam_view1(header, b)


    /********************************
     * APIs for string dictionaries *
     ********************************/

    int bam_strmap_put(void *strmap, const char *rg, const char *lib);
    const char *bam_strmap_get(const void *strmap, const char *rg);
    void *bam_strmap_dup(const void*);
    void *bam_strmap_init();
    void bam_strmap_destroy(void *strmap);


    /*********************
     * Low-level BAM I/O *
     *********************/

    /*!
      @abstract Initialize a header structure.
      @return   the pointer to the header structure

      @discussion This function also modifies the global variable
      bam_is_be.
     */
    bam_header_t *bam_header_init();

    /*!
      @abstract        Destroy a header structure.
      @param  header  pointer to the header
     */
    void bam_header_destroy(bam_header_t *header);

    /*!
      @abstract   Read a header structure from BAM.
      @param  fp  BAM file handler, opened by bam_open()
      @return     pointer to the header structure

      @discussion The file position indicator must be placed at the
      beginning of the file. Upon success, the position indicator will
      be set at the start of the first alignment.
     */
    bam_header_t *bam_header_read(bamFile fp);

    /*!
      @abstract      Write a header structure to BAM.
      @param  fp     BAM file handler
      @param  header pointer to the header structure
      @return        always 0 currently
     */
    int bam_header_write(bamFile fp, const bam_header_t *header);

    /*!
      @abstract   Read an alignment from BAM.
      @param  fp  BAM file handler
      @param  b   read alignment; all members are updated.
      @return     number of bytes read from the file

      @discussion The file position indicator must be
      placed right before an alignment. Upon success, this function
      will set the position indicator to the start of the next
      alignment. This function is not affected by the machine
      endianness.
     */
    int bam_read1(bamFile fp, bam1_t *b);

    int bam_remove_B(bam1_t *b);

    /*!
      @abstract Write an alignment to BAM.
      @param  fp       BAM file handler
      @param  c        pointer to the bam1_core_t structure
      @param  data_len total length of variable size data related to
                       the alignment
      @param  data     pointer to the concatenated data
      @return          number of bytes written to the file

      @discussion This function is not affected by the machine
      endianness.
     */
    int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data);

    /*!
      @abstract   Write an alignment to BAM.
      @param  fp  BAM file handler
      @param  b   alignment to write
      @return     number of bytes written to the file

      @discussion It is equivalent to:
        bam_write1_core(fp, &b->core, b->data_len, b->data)
     */
    int bam_write1(bamFile fp, const bam1_t *b);

    /*! @function
      @abstract  Allocate a zero-initialized bam1_t struct
      @return    pointer to the new struct, or NULL if calloc() fails
     */
#define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t)))

    /*! @function
      @abstract  Free the memory allocated for an alignment.
      @param  b  pointer to an alignment; a null pointer is ignored
     */
#define bam_destroy1(b) do { \
        if (b) { free((b)->data); free(b); } \
    } while (0)

    /*!
      @abstract       Format a BAM record in the SAM format
      @param  header  pointer to the header structure
      @param  b       alignment to print
      @return         a pointer to the SAM string
     */
    char *bam_format1(const bam_header_t *header, const bam1_t *b);

    /* NOTE(review): `of` is undocumented; presumably one of
     * BAM_OFDEC / BAM_OFHEX / BAM_OFSTR -- confirm in the implementation. */
    char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of);

    /*!
      @abstract       Check whether a BAM record is plausibly valid
      @param  header  associated header structure, or NULL if unavailable
      @param  b       alignment to validate
      @return         0 if the alignment is invalid; non-zero otherwise

      @discussion  Simple consistency check of some of the fields of the
      alignment record.  If the header is provided, several additional checks
      are made.  Not all fields are checked, so a non-zero result is not a
      guarantee that the record is valid.  However it is usually good enough
      to detect when bam_seek() has been called with a virtual file offset
      that is not the offset of an alignment record.
     */
    int bam_validate1(const bam_header_t *header, const bam1_t *b);

    const char *bam_get_library(bam_header_t *header, const bam1_t *b);


    /***************
     * pileup APIs *
     ***************/

    /*! @typedef
      @abstract Structure for one alignment covering the pileup position.
      @field  b      pointer to the alignment
      @field  qpos   position of the read base at the pileup site, 0-based
      @field  indel  indel length; 0 for no indel, positive for ins and negative for del
      @field  is_del 1 iff the base on the padded read is a deletion
      @field  level  the level of the read in the "viewer" mode

      @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The
      difference between the two functions is that the former does not
      set bam_pileup1_t::level, while the latter does. Level helps the
      implementation of alignment viewers, but calculating this has some
      overhead.
     */
    typedef struct {
        bam1_t *b;
        int32_t qpos;
        int indel, level;
        /* NOTE(review): is_head, is_tail, is_refskip and aux are not covered
         * by the field list above. bam2bcf.c skips entries with is_refskip
         * set and, for indels, reads a quality from aux bits 0-7, a second
         * quality from bits 8-15 and the indel type from bits 16-21. */
        uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1, aux:28;
    } bam_pileup1_t;

    /* Callback type handed to bam_plp_init() / bam_mplp_init(); receives the
     * user data pointer and an alignment record. */
    typedef int (*bam_plp_auto_f)(void *data, bam1_t *b);

    /* Opaque single-input pileup iterator. */
    struct __bam_plp_t;
    typedef struct __bam_plp_t *bam_plp_t;

    bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data);
    int bam_plp_push(bam_plp_t iter, const bam1_t *b);
    const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp);
    const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp);
    void bam_plp_set_mask(bam_plp_t iter, int mask);
    void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt);
    void bam_plp_reset(bam_plp_t iter);
    void bam_plp_destroy(bam_plp_t iter);

    /* Opaque pileup iterator initialised with n callback data pointers. */
    struct __bam_mplp_t;
    typedef struct __bam_mplp_t *bam_mplp_t;

    bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data);
    void bam_mplp_destroy(bam_mplp_t iter);
    void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt);
    int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp);

    /*! @typedef
      @abstract    Type of function to be called by bam_plbuf_push().
      @param  tid  chromosome ID as is defined in the header
      @param  pos  start coordinate of the alignment, 0-based
      @param  n    number of elements in pl array
      @param  pl   array of alignments
      @param  data user provided data
      @discussion  See also bam_plbuf_push(), bam_plbuf_init() and bam_pileup1_t.
     */
    typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data);

    /* Pileup buffer: a pileup iterator bundled with a bam_pileup_f callback
     * and a user data pointer. */
    typedef struct {
        bam_plp_t iter;
        bam_pileup_f func;
        void *data;
    } bam_plbuf_t;

    void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask);
    void bam_plbuf_reset(bam_plbuf_t *buf);
    bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data);
    void bam_plbuf_destroy(bam_plbuf_t *buf);
    int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf);

    int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data);

    /* Opaque counterpart of bam_plbuf_t used by the bam_lplbuf_*() functions. */
    struct __bam_lplbuf_t;
    typedef struct __bam_lplbuf_t bam_lplbuf_t;

    void bam_lplbuf_reset(bam_lplbuf_t *buf);

    /*! @abstract  bam_plbuf_init() equivalent with level calculated. */
    bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data);

    /*! @abstract  bam_plbuf_destroy() equivalent with level calculated. */
    void bam_lplbuf_destroy(bam_lplbuf_t *tv);

    /*! @abstract  bam_plbuf_push() equivalent with level calculated. */
    int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf);


    /*********************
     * BAM indexing APIs *
     *********************/

    /* Opaque BAM index; see bam_index_load() and bam_index_destroy(). */
    struct __bam_index_t;
    typedef struct __bam_index_t bam_index_t;

    /*!
      @abstract   Build index for a BAM file.
      @discussion Index file "fn.bai" will be created.
      @param  fn  name of the BAM file
      @return     always 0 currently
     */
    int bam_index_build(const char *fn);

    /*!
      @abstract   Load index from file "fn.bai".
      @param  fn  name of the BAM file (NOT the index file)
      @return     pointer to the index structure
     */
    bam_index_t *bam_index_load(const char *fn);

    /*!
      @abstract    Destroy an index structure.
      @param  idx  pointer to the index structure
     */
    void bam_index_destroy(bam_index_t *idx);

    /*! @typedef
      @abstract      Type of function to be called by bam_fetch().
      @param  b     the alignment
      @param  data  user provided data
     */
    typedef int (*bam_fetch_f)(const bam1_t *b, void *data);

    /*!
      @abstract Retrieve the alignments that overlap the
      specified region.

      @discussion A user defined function will be called for each
      retrieved alignment ordered by its start position.

      @param  fp    BAM file handler
      @param  idx   pointer to the alignment index
      @param  tid   chromosome ID as is defined in the header
      @param  beg   start coordinate, 0-based
      @param  end   end coordinate, 0-based
      @param  data  user provided data (will be passed on to func)
      @param  func  user defined function
     */
    int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func);

    /* NOTE(review): the iterator API below is undocumented in the original
     * header; presumably the pull-style counterpart of bam_fetch() --
     * confirm in the implementation. */
    bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end);
    int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b);
    void bam_iter_destroy(bam_iter_t iter);

    /*!
      @abstract       Parse a region in the format: "chr2:100,000-200,000".
      @discussion     bam_header_t::hash will be initialized if empty.
      @param  header  pointer to the header structure
      @param  str     string to be parsed
      @param  ref_id  the returned chromosome ID
      @param  begin   the returned start coordinate
      @param  end     the returned end coordinate
      @return         0 on success; -1 on failure
     */
    int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end);


    /**************************
     * APIs for optional tags *
     **************************/

    /*!
      @abstract       Retrieve data of a tag
      @param  b       pointer to an alignment struct
      @param  tag     two-character tag to be retrieved

      @return  pointer to the type and data. The first character is the
      type that can be 'iIsScCdfAZH'.

      @discussion  Use bam_aux2?() series to convert the returned data to
      the corresponding type.
    */
    uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]);

    /* Converters for the pointer returned by bam_aux_get(); one per value type. */
    int32_t bam_aux2i(const uint8_t *s);
    float bam_aux2f(const uint8_t *s);
    double bam_aux2d(const uint8_t *s);
    char bam_aux2A(const uint8_t *s);
    char *bam_aux2Z(const uint8_t *s);

    int bam_aux_del(bam1_t *b, uint8_t *s);
    void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data);
    uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]); // an alias of bam_aux_get()


    /*****************
     * Miscellaneous *
     *****************/

    /*!
      @abstract Calculate the rightmost coordinate of an alignment on the
      reference genome.

      @param  c      pointer to the bam1_core_t structure
      @param  cigar  the corresponding CIGAR array (as returned by bam1_cigar())
      @return        the rightmost coordinate, 0-based
     */
    uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar);

    /*!
      @abstract      Calculate the length of the query sequence from CIGAR.
      @param  c      pointer to the bam1_core_t structure
      @param  cigar  the corresponding CIGAR array (as returned by bam1_cigar())
      @return        length of the query sequence
     */
    int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar);

#ifdef __cplusplus
}
#endif

/*!
  @abstract    Calculate the minimum bin that contains a region [beg,end).
  @param  beg  start of the region, 0-based
  @param  end  end of the region, 0-based
  @return      bin

  @discussion The levels are tried from the smallest window (2^14 bases)
  to the largest (2^26 bases); the first level at which beg and the last
  base of the region fall into the same window wins. Bin 0 is returned
  when no level contains the whole region.
 */
static inline int bam_reg2bin(uint32_t beg, uint32_t end)
{
    --end; // work with the last base of the half-open region
    if (beg>>14 == end>>14) return 4681 + (beg>>14);
    if (beg>>17 == end>>17) return  585 + (beg>>17);
    if (beg>>20 == end>>20) return   73 + (beg>>20);
    if (beg>>23 == end>>23) return    9 + (beg>>23);
    if (beg>>26 == end>>26) return    1 + (beg>>26);
    return 0;
}

/*!
+ @abstract Copy an alignment + @param bdst destination alignment struct + @param bsrc source alignment struct + @return pointer to the destination alignment struct + */ +static inline bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) +{ + uint8_t *data = bdst->data; + int m_data = bdst->m_data; // backup data and m_data + if (m_data < bsrc->data_len) { // double the capacity + m_data = bsrc->data_len; kroundup32(m_data); + data = (uint8_t*)realloc(data, m_data); + } + memcpy(data, bsrc->data, bsrc->data_len); // copy var-len data + *bdst = *bsrc; // copy the rest + // restore the backup + bdst->m_data = m_data; + bdst->data = data; + return bdst; +} + +/*! + @abstract Duplicate an alignment + @param src source alignment struct + @return pointer to the destination alignment struct + */ +static inline bam1_t *bam_dup1(const bam1_t *src) +{ + bam1_t *b; + b = bam_init1(); + *b = *src; + b->m_data = b->data_len; + b->data = (uint8_t*)calloc(b->data_len, 1); + memcpy(b->data, src->data, b->data_len); + return b; +} + +static inline int bam_aux_type2size(int x) +{ + if (x == 'C' || x == 'c' || x == 'A') return 1; + else if (x == 'S' || x == 's') return 2; + else if (x == 'I' || x == 'i' || x == 'f' || x == 'F') return 4; + else return 0; +} + +/********************************* + *** Compatibility with htslib *** + *********************************/ + +typedef bam_header_t bam_hdr_t; + +#define bam_get_qname(b) bam1_qname(b) +#define bam_get_cigar(b) bam1_cigar(b) + +#define bam_hdr_read(fp) bam_header_read(fp) +#define bam_hdr_write(fp, h) bam_header_write(fp, h) +#define bam_hdr_destroy(fp) bam_header_destroy(fp) + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bam2bcf.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,467 @@ +#include <math.h> +#include <stdint.h> +#include <assert.h> +#include "bam.h" +#include "kstring.h" +#include "bam2bcf.h" +#include "errmod.h" +#include "bcftools/bcf.h" + +extern void ks_introsort_uint32_t(size_t n, uint32_t a[]); + +#define CALL_ETA 0.03f +#define CALL_MAX 256 +#define CALL_DEFTHETA 0.83f +#define DEF_MAPQ 20 + +#define CAP_DIST 25 + +bcf_callaux_t *bcf_call_init(double theta, int min_baseQ) +{ + bcf_callaux_t *bca; + if (theta <= 0.) theta = CALL_DEFTHETA; + bca = calloc(1, sizeof(bcf_callaux_t)); + bca->capQ = 60; + bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100; + bca->min_baseQ = min_baseQ; + bca->e = errmod_init(1. - theta); + bca->min_frac = 0.002; + bca->min_support = 1; + bca->per_sample_flt = 0; + bca->npos = 100; + bca->ref_pos = calloc(bca->npos, sizeof(int)); + bca->alt_pos = calloc(bca->npos, sizeof(int)); + return bca; +} + + +static int get_position(const bam_pileup1_t *p, int *len) +{ + int icig, n_tot_bases = 0, iread = 0, edist = p->qpos + 1; + for (icig=0; icig<p->b->core.n_cigar; icig++) + { + // Conversion from uint32_t to MIDNSHP + // 0123456 + // MIDNSHP + int cig = bam1_cigar(p->b)[icig] & BAM_CIGAR_MASK; + int ncig = bam1_cigar(p->b)[icig] >> BAM_CIGAR_SHIFT; + if ( cig==0 ) + { + n_tot_bases += ncig; + iread += ncig; + } + else if ( cig==1 ) + { + n_tot_bases += ncig; + iread += ncig; + } + else if ( cig==4 ) + { + iread += ncig; + if ( iread<=p->qpos ) edist -= ncig; + } + } + *len = n_tot_bases; + return edist; +} + +void bcf_call_destroy(bcf_callaux_t *bca) +{ + if (bca == 0) return; + errmod_destroy(bca->e); + if (bca->npos) { free(bca->ref_pos); free(bca->alt_pos); bca->npos = 0; } + free(bca->bases); free(bca->inscns); free(bca); +} +/* ref_base is the 4-bit representation of the reference base. It is + * negative if we are looking at an indel. 
*/ +int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r) +{ + int i, n, ref4, is_indel, ori_depth = 0; + memset(r, 0, sizeof(bcf_callret1_t)); + if (ref_base >= 0) { + ref4 = bam_nt16_nt4_table[ref_base]; + is_indel = 0; + } else ref4 = 4, is_indel = 1; + if (_n == 0) return -1; + // enlarge the bases array if necessary + if (bca->max_bases < _n) { + bca->max_bases = _n; + kroundup32(bca->max_bases); + bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases); + } + // fill the bases array + for (i = n = r->n_supp = 0; i < _n; ++i) { + const bam_pileup1_t *p = pl + i; + int q, b, mapQ, baseQ, is_diff, min_dist, seqQ; + // set base + if (p->is_del || p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue; + ++ori_depth; + baseQ = q = is_indel? p->aux&0xff : (int)bam1_qual(p->b)[p->qpos]; // base/indel quality + seqQ = is_indel? (p->aux>>8&0xff) : 99; + if (q < bca->min_baseQ) continue; + if (q > seqQ) q = seqQ; + mapQ = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255 + mapQ = mapQ < bca->capQ? mapQ : bca->capQ; + if (q > mapQ) q = mapQ; + if (q > 63) q = 63; + if (q < 4) q = 4; + if (!is_indel) { + b = bam1_seqi(bam1_seq(p->b), p->qpos); // base + b = bam_nt16_nt4_table[b? b : ref_base]; // b is the 2-bit base + is_diff = (ref4 < 4 && b == ref4)? 
0 : 1; + } else { + b = p->aux>>16&0x3f; + is_diff = (b != 0); + } + if (is_diff) ++r->n_supp; + bca->bases[n++] = q<<5 | (int)bam1_strand(p->b)<<4 | b; + // collect annotations + if (b < 4) r->qsum[b] += q; + ++r->anno[0<<2|is_diff<<1|bam1_strand(p->b)]; + min_dist = p->b->core.l_qseq - 1 - p->qpos; + if (min_dist > p->qpos) min_dist = p->qpos; + if (min_dist > CAP_DIST) min_dist = CAP_DIST; + r->anno[1<<2|is_diff<<1|0] += baseQ; + r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ; + r->anno[2<<2|is_diff<<1|0] += mapQ; + r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ; + r->anno[3<<2|is_diff<<1|0] += min_dist; + r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist; + + // collect read positions for ReadPosBias + int len, pos = get_position(p, &len); + int epos = (double)pos/(len+1) * bca->npos; + if ( bam1_seqi(bam1_seq(p->b),p->qpos) == ref_base ) + bca->ref_pos[epos]++; + else + bca->alt_pos[epos]++; + } + r->depth = n; r->ori_depth = ori_depth; + // glfgen + errmod_cal(bca->e, n, 5, bca->bases, r->p); + return r->depth; +} + +double mann_whitney_1947(int n, int m, int U) +{ + if (U<0) return 0; + if (n==0||m==0) return U==0 ? 
1 : 0; + return (double)n/(n+m)*mann_whitney_1947(n-1,m,U-m) + (double)m/(n+m)*mann_whitney_1947(n,m-1,U); +} + +void calc_ReadPosBias(bcf_callaux_t *bca, bcf_call_t *call) +{ + int i, nref = 0, nalt = 0; + unsigned long int U = 0; + for (i=0; i<bca->npos; i++) + { + nref += bca->ref_pos[i]; + nalt += bca->alt_pos[i]; + U += nref*bca->alt_pos[i]; + bca->ref_pos[i] = 0; + bca->alt_pos[i] = 0; + } +#if 0 +//todo + double var = 0, avg = (double)(nref+nalt)/bca->npos; + for (i=0; i<bca->npos; i++) + { + double ediff = bca->ref_pos[i] + bca->alt_pos[i] - avg; + var += ediff*ediff; + bca->ref_pos[i] = 0; + bca->alt_pos[i] = 0; + } + call->read_pos.avg = avg; + call->read_pos.var = sqrt(var/bca->npos); + call->read_pos.dp = nref+nalt; +#endif + if ( !nref || !nalt ) + { + call->read_pos_bias = -1; + return; + } + + if ( nref>=8 || nalt>=8 ) + { + // normal approximation + double mean = ((double)nref*nalt+1.0)/2.0; + double var2 = (double)nref*nalt*(nref+nalt+1.0)/12.0; + double z = (U-mean)/sqrt(var2); + call->read_pos_bias = z; + //fprintf(stderr,"nref=%d nalt=%d U=%ld mean=%e var=%e zval=%e\n", nref,nalt,U,mean,sqrt(var2),call->read_pos_bias); + } + else + { + double p = mann_whitney_1947(nalt,nref,U); + // biased form claimed by GATK to behave better empirically + // double var2 = (1.0+1.0/(nref+nalt+1.0))*(double)nref*nalt*(nref+nalt+1.0)/12.0; + double var2 = (double)nref*nalt*(nref+nalt+1.0)/12.0; + double z; + if ( p >= 1./sqrt(var2*2*M_PI) ) z = 0; // equal to mean + else + { + if ( U >= nref*nalt/2. 
) z = sqrt(-2*log(sqrt(var2*2*M_PI)*p)); + else z = -sqrt(-2*log(sqrt(var2*2*M_PI)*p)); + } + call->read_pos_bias = z; + //fprintf(stderr,"nref=%d nalt=%d U=%ld p=%e var2=%e zval=%e\n", nref,nalt,U, p,var2,call->read_pos_bias); + } +} + +float mean_diff_to_prob(float mdiff, int dp, int readlen) +{ + if ( dp==2 ) + { + if ( mdiff==0 ) + return (2.0*readlen + 4.0*(readlen-1.0))/((float)readlen*readlen); + else + return 8.0*(readlen - 4.0*mdiff)/((float)readlen*readlen); + } + + // This is crude empirical approximation and is not very accurate for + // shorter read lengths (<100bp). There certainly is a room for + // improvement. + const float mv[24][2] = { {0,0}, {0,0}, {0,0}, + { 9.108, 4.934}, { 9.999, 3.991}, {10.273, 3.485}, {10.579, 3.160}, + {10.828, 2.889}, {11.014, 2.703}, {11.028, 2.546}, {11.244, 2.391}, + {11.231, 2.320}, {11.323, 2.138}, {11.403, 2.123}, {11.394, 1.994}, + {11.451, 1.928}, {11.445, 1.862}, {11.516, 1.815}, {11.560, 1.761}, + {11.544, 1.728}, {11.605, 1.674}, {11.592, 1.652}, {11.674, 1.613}, + {11.641, 1.570} }; + + float m, v; + if ( dp>=24 ) + { + m = readlen/8.; + if (dp>100) dp = 100; + v = 1.476/(0.182*pow(dp,0.514)); + v = v*(readlen/100.); + } + else + { + m = mv[dp][0]; + v = mv[dp][1]; + m = m*readlen/100.; + v = v*readlen/100.; + v *= 1.2; // allow more variability + } + return 1.0/(v*sqrt(2*M_PI)) * exp(-0.5*((mdiff-m)/v)*((mdiff-m)/v)); +} + +void calc_vdb(bcf_callaux_t *bca, bcf_call_t *call) +{ + int i, dp = 0; + float mean_pos = 0, mean_diff = 0; + for (i=0; i<bca->npos; i++) + { + if ( !bca->alt_pos[i] ) continue; + dp += bca->alt_pos[i]; + int j = i<bca->npos/2 ? i : bca->npos - i; + mean_pos += bca->alt_pos[i]*j; + } + if ( dp<2 ) + { + call->vdb = -1; + return; + } + mean_pos /= dp; + for (i=0; i<bca->npos; i++) + { + if ( !bca->alt_pos[i] ) continue; + int j = i<bca->npos/2 ? 
i : bca->npos - i; + mean_diff += bca->alt_pos[i] * fabs(j - mean_pos); + } + mean_diff /= dp; + call->vdb = mean_diff_to_prob(mean_diff, dp, bca->npos); +} + +/** + * bcf_call_combine() - sets the PL array and VDB, RPB annotations, finds the top two alleles + * @n: number of samples + * @calls: each sample's calls + * @bca: auxiliary data structure for holding temporary values + * @ref_base: the reference base + * @call: filled with the annotations + */ +int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call) +{ + int ref4, i, j, qsum[4]; + int64_t tmp; + if (ref_base >= 0) { + call->ori_ref = ref4 = bam_nt16_nt4_table[ref_base]; + if (ref4 > 4) ref4 = 4; + } else call->ori_ref = -1, ref4 = 0; + // calculate qsum + memset(qsum, 0, 4 * sizeof(int)); + for (i = 0; i < n; ++i) + for (j = 0; j < 4; ++j) + qsum[j] += calls[i].qsum[j]; + int qsum_tot=0; + for (j=0; j<4; j++) { qsum_tot += qsum[j]; call->qsum[j] = 0; } + for (j = 0; j < 4; ++j) qsum[j] = qsum[j] << 2 | j; + // find the top 2 alleles + for (i = 1; i < 4; ++i) // insertion sort + for (j = i; j > 0 && qsum[j] < qsum[j-1]; --j) + tmp = qsum[j], qsum[j] = qsum[j-1], qsum[j-1] = tmp; + // set the reference allele and alternative allele(s) + for (i = 0; i < 5; ++i) call->a[i] = -1; + call->unseen = -1; + call->a[0] = ref4; + for (i = 3, j = 1; i >= 0; --i) { + if ((qsum[i]&3) != ref4) { + if (qsum[i]>>2 != 0) + { + if ( j<4 ) call->qsum[j] = (float)(qsum[i]>>2)/qsum_tot; // ref N can make j>=4 + call->a[j++] = qsum[i]&3; + } + else break; + } + else + call->qsum[0] = (float)(qsum[i]>>2)/qsum_tot; + } + if (ref_base >= 0) { // for SNPs, find the "unseen" base + if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0) + call->unseen = j, call->a[j++] = qsum[i]&3; + call->n_alleles = j; + } else { + call->n_alleles = j; + if (call->n_alleles == 1) return -1; // no reliable supporting read. 
stop doing anything + } + // set the PL array + if (call->n < n) { + call->n = n; + call->PL = realloc(call->PL, 15 * n); + } + { + int x, g[15], z; + double sum_min = 0.; + x = call->n_alleles * (call->n_alleles + 1) / 2; + // get the possible genotypes + for (i = z = 0; i < call->n_alleles; ++i) + for (j = 0; j <= i; ++j) + g[z++] = call->a[j] * 5 + call->a[i]; + for (i = 0; i < n; ++i) { + uint8_t *PL = call->PL + x * i; + const bcf_callret1_t *r = calls + i; + float min = 1e37; + for (j = 0; j < x; ++j) + if (min > r->p[g[j]]) min = r->p[g[j]]; + sum_min += min; + for (j = 0; j < x; ++j) { + int y; + y = (int)(r->p[g[j]] - min + .499); + if (y > 255) y = 255; + PL[j] = y; + } + } +// if (ref_base < 0) fprintf(stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen); + call->shift = (int)(sum_min + .499); + } + // combine annotations + memset(call->anno, 0, 16 * sizeof(int)); + for (i = call->depth = call->ori_depth = 0, tmp = 0; i < n; ++i) { + call->depth += calls[i].depth; + call->ori_depth += calls[i].ori_depth; + for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j]; + } + + calc_vdb(bca, call); + calc_ReadPosBias(bca, call); + + return 0; +} + +int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int fmt_flag, + const bcf_callaux_t *bca, const char *ref) +{ + extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two); + kstring_t s; + int i, j; + b->n_smpl = bc->n; + b->tid = tid; b->pos = pos; b->qual = 0; + s.s = b->str; s.m = b->m_str; s.l = 0; + kputc('\0', &s); + if (bc->ori_ref < 0) { // an indel + // write REF + kputc(ref[pos], &s); + for (j = 0; j < bca->indelreg; ++j) kputc(ref[pos+1+j], &s); + kputc('\0', &s); + // write ALT + kputc(ref[pos], &s); + for (i = 1; i < 4; ++i) { + if (bc->a[i] < 0) break; + if (i > 1) { + kputc(',', &s); kputc(ref[pos], &s); + } + if (bca->indel_types[bc->a[i]] < 0) { // deletion + for (j = -bca->indel_types[bc->a[i]]; j 
< bca->indelreg; ++j) + kputc(ref[pos+1+j], &s); + } else { // insertion; cannot be a reference unless a bug + char *inscns = &bca->inscns[bc->a[i] * bca->maxins]; + for (j = 0; j < bca->indel_types[bc->a[i]]; ++j) + kputc("ACGTN"[(int)inscns[j]], &s); + for (j = 0; j < bca->indelreg; ++j) kputc(ref[pos+1+j], &s); + } + } + kputc('\0', &s); + } else { // a SNP + kputc("ACGTN"[bc->ori_ref], &s); kputc('\0', &s); + for (i = 1; i < 5; ++i) { + if (bc->a[i] < 0) break; + if (i > 1) kputc(',', &s); + kputc(bc->unseen == i? 'X' : "ACGT"[bc->a[i]], &s); + } + kputc('\0', &s); + } + kputc('\0', &s); + // INFO + if (bc->ori_ref < 0) ksprintf(&s,"INDEL;IS=%d,%f;", bca->max_support, bca->max_frac); + kputs("DP=", &s); kputw(bc->ori_depth, &s); kputs(";I16=", &s); + for (i = 0; i < 16; ++i) { + if (i) kputc(',', &s); + kputw(bc->anno[i], &s); + } + //ksprintf(&s,";RPS=%d,%f,%f", bc->read_pos.dp,bc->read_pos.avg,bc->read_pos.var); + ksprintf(&s,";QS=%f,%f,%f,%f", bc->qsum[0],bc->qsum[1],bc->qsum[2],bc->qsum[3]); + if (bc->vdb != -1) + ksprintf(&s, ";VDB=%e", bc->vdb); + if (bc->read_pos_bias != -1 ) + ksprintf(&s, ";RPB=%e", bc->read_pos_bias); + kputc('\0', &s); + // FMT + kputs("PL", &s); + if (bcr && fmt_flag) { + if (fmt_flag & B2B_FMT_DP) kputs(":DP", &s); + if (fmt_flag & B2B_FMT_DV) kputs(":DV", &s); + if (fmt_flag & B2B_FMT_SP) kputs(":SP", &s); + } + kputc('\0', &s); + b->m_str = s.m; b->str = s.s; b->l_str = s.l; + bcf_sync(b); + memcpy(b->gi[0].data, bc->PL, b->gi[0].len * bc->n); + if (bcr && fmt_flag) { + uint16_t *dp = (fmt_flag & B2B_FMT_DP)? b->gi[1].data : 0; + uint16_t *dv = (fmt_flag & B2B_FMT_DV)? b->gi[1 + ((fmt_flag & B2B_FMT_DP) != 0)].data : 0; + int32_t *sp = (fmt_flag & B2B_FMT_SP)? b->gi[1 + ((fmt_flag & B2B_FMT_DP) != 0) + ((fmt_flag & B2B_FMT_DV) != 0)].data : 0; + for (i = 0; i < bc->n; ++i) { + bcf_callret1_t *p = bcr + i; + if (dp) dp[i] = p->depth < 0xffff? p->depth : 0xffff; + if (dv) dv[i] = p->n_supp < 0xffff? 
p->n_supp : 0xffff; + if (sp) { + if (p->anno[0] + p->anno[1] < 2 || p->anno[2] + p->anno[3] < 2 + || p->anno[0] + p->anno[2] < 2 || p->anno[1] + p->anno[3] < 2) + { + sp[i] = 0; + } else { + double left, right, two; + int x; + kt_fisher_exact(p->anno[0], p->anno[1], p->anno[2], p->anno[3], &left, &right, &two); + x = (int)(-4.343 * log(two) + .499); + if (x > 255) x = 255; + sp[i] = x; + } + } + } + } + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bam2bcf.h Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,67 @@ +#ifndef BAM2BCF_H +#define BAM2BCF_H + +#include <stdint.h> +#include "errmod.h" +#include "bcftools/bcf.h" + +#define B2B_INDEL_NULL 10000 + +#define B2B_FMT_DP 0x1 +#define B2B_FMT_SP 0x2 +#define B2B_FMT_DV 0x4 + +typedef struct __bcf_callaux_t { + int capQ, min_baseQ; + int openQ, extQ, tandemQ; // for indels + int min_support, max_support; // for collecting indel candidates + double min_frac, max_frac; // for collecting indel candidates + int per_sample_flt; // indel filtering strategy + int *ref_pos, *alt_pos, npos; // for ReadPosBias + // for internal uses + int max_bases; + int indel_types[4]; + int maxins, indelreg; + int read_len; + char *inscns; + uint16_t *bases; + errmod_t *e; + void *rghash; +} bcf_callaux_t; + +typedef struct { + int depth, n_supp, ori_depth, qsum[4]; + unsigned int anno[16]; + float p[25]; +} bcf_callret1_t; + +typedef struct { + int a[5]; // alleles: ref, alt, alt2, alt3 + float qsum[4]; + int n, n_alleles, shift, ori_ref, unseen; + int n_supp; // number of supporting non-reference reads + unsigned int anno[16], depth, ori_depth; + uint8_t *PL; + float vdb; // variant distance bias + float read_pos_bias; + struct { float avg, var; int dp; } read_pos; +} bcf_call_t; + +#ifdef __cplusplus +extern "C" { +#endif + + bcf_callaux_t *bcf_call_init(double theta, int min_baseQ); + void bcf_call_destroy(bcf_callaux_t *bca); + int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r); + int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call); + int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int fmt_flag, + const bcf_callaux_t *bca, const char *ref); + int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, + 
const void *rghash); + +#ifdef __cplusplus +} +#endif + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bam2bcf_indel.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,498 @@ +#include <assert.h> +#include <ctype.h> +#include <string.h> +#include "bam.h" +#include "bam2bcf.h" +#include "kaln.h" +#include "kprobaln.h" +#include "khash.h" +KHASH_SET_INIT_STR(rg) + +#include "ksort.h" +KSORT_INIT_GENERIC(uint32_t) + +#define MINUS_CONST 0x10000000 +#define INDEL_WINDOW_SIZE 50 + +void *bcf_call_add_rg(void *_hash, const char *hdtext, const char *list) +{ + const char *s, *p, *q, *r, *t; + khash_t(rg) *hash; + if (list == 0 || hdtext == 0) return _hash; + if (_hash == 0) _hash = kh_init(rg); + hash = (khash_t(rg)*)_hash; + if ((s = strstr(hdtext, "@RG\t")) == 0) return hash; + do { + t = strstr(s + 4, "@RG\t"); // the next @RG + if ((p = strstr(s, "\tID:")) != 0) p += 4; + if ((q = strstr(s, "\tPL:")) != 0) q += 4; + if (p && q && (t == 0 || (p < t && q < t))) { // ID and PL are both present + int lp, lq; + char *x; + for (r = p; *r && *r != '\t' && *r != '\n'; ++r); lp = r - p; + for (r = q; *r && *r != '\t' && *r != '\n'; ++r); lq = r - q; + x = calloc((lp > lq? 
lp : lq) + 1, 1); + for (r = q; *r && *r != '\t' && *r != '\n'; ++r) x[r-q] = *r; + if (strstr(list, x)) { // insert ID to the hash table + khint_t k; + int ret; + for (r = p; *r && *r != '\t' && *r != '\n'; ++r) x[r-p] = *r; + x[r-p] = 0; + k = kh_get(rg, hash, x); + if (k == kh_end(hash)) k = kh_put(rg, hash, x, &ret); + else free(x); + } else free(x); + } + s = t; + } while (s); + return hash; +} + +void bcf_call_del_rghash(void *_hash) +{ + khint_t k; + khash_t(rg) *hash = (khash_t(rg)*)_hash; + if (hash == 0) return; + for (k = kh_begin(hash); k < kh_end(hash); ++k) + if (kh_exist(hash, k)) + free((char*)kh_key(hash, k)); + kh_destroy(rg, hash); +} + +static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos) +{ + int k, x = c->pos, y = 0, last_y = 0; + *_tpos = c->pos; + for (k = 0; k < c->n_cigar; ++k) { + int op = cigar[k] & BAM_CIGAR_MASK; + int l = cigar[k] >> BAM_CIGAR_SHIFT; + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + if (c->pos > tpos) return y; + if (x + l > tpos) { + *_tpos = tpos; + return y + (tpos - x); + } + x += l; y += l; + last_y = y; + } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; + else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { + if (x + l > tpos) { + *_tpos = is_left? x : x + l; + return y; + } + x += l; + } + } + *_tpos = x; + return last_y; +} +// FIXME: check if the inserted sequence is consistent with the homopolymer run +// l is the relative gap length and l_run is the length of the homopolymer on the reference +static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run) +{ + int q, qh; + q = bca->openQ + bca->extQ * (abs(l) - 1); + qh = l_run >= 3? (int)(bca->tandemQ * (double)abs(l) / l_run + .499) : 1000; + return q < qh? 
q : qh; +} + +static inline int est_indelreg(int pos, const char *ref, int l, char *ins4) +{ + int i, j, max = 0, max_i = pos, score = 0; + l = abs(l); + for (i = pos + 1, j = 0; ref[i]; ++i, ++j) { + if (ins4) score += (toupper(ref[i]) != "ACGTN"[(int)ins4[j%l]])? -10 : 1; + else score += (toupper(ref[i]) != toupper(ref[pos+1+j%l]))? -10 : 1; + if (score < 0) break; + if (max < score) max = score, max_i = i; + } + return max_i - pos; +} + +/* + * @n: number of samples + */ +int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, + const void *rghash) +{ + int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, *score1, *score2, max_ref2; + int N, K, l_run, ref_type, n_alt; + char *inscns = 0, *ref2, *query, **ref_sample; + khash_t(rg) *hash = (khash_t(rg)*)rghash; + if (ref == 0 || bca == 0) return -1; + // mark filtered reads + if (rghash) { + N = 0; + for (s = N = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i) { + bam_pileup1_t *p = plp[s] + i; + const uint8_t *rg = bam_aux_get(p->b, "RG"); + p->aux = 1; // filtered by default + if (rg) { + khint_t k = kh_get(rg, hash, (const char*)(rg + 1)); + if (k != kh_end(hash)) p->aux = 0, ++N; // not filtered + } + } + } + if (N == 0) return -1; // no reads left + } + // determine if there is a gap + for (s = N = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i) + if (plp[s][i].indel != 0) break; + if (i < n_plp[s]) break; + } + if (s == n) return -1; // there is no indel at this position. 
+ for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads + { // find out how many types of indels are present + bca->max_support = bca->max_frac = 0; + int m, n_alt = 0, n_tot = 0, indel_support_ok = 0; + uint32_t *aux; + aux = calloc(N + 1, 4); + m = max_rd_len = 0; + aux[m++] = MINUS_CONST; // zero indel is always a type + for (s = 0; s < n; ++s) { + int na = 0, nt = 0; + for (i = 0; i < n_plp[s]; ++i) { + const bam_pileup1_t *p = plp[s] + i; + if (rghash == 0 || p->aux == 0) { + ++nt; + if (p->indel != 0) { + ++na; + aux[m++] = MINUS_CONST + p->indel; + } + } + j = bam_cigar2qlen(&p->b->core, bam1_cigar(p->b)); + if (j > max_rd_len) max_rd_len = j; + } + float frac = (float)na/nt; + if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac ) + indel_support_ok = 1; + if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac; + n_alt += na; + n_tot += nt; + } + // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases), + // check the number of N's in the sequence and skip places where half or more reference bases are Ns. + int nN=0; for (i=pos; i-pos<max_rd_len && ref[i]; i++) if ( ref[i]=='N' ) nN++; + if ( nN*2>i ) { free(aux); return -1; } + + ks_introsort(uint32_t, m, aux); + // squeeze out identical types + for (i = 1, n_types = 1; i < m; ++i) + if (aux[i] != aux[i-1]) ++n_types; + // Taking totals makes it hard to call rare indels + if ( !bca->per_sample_flt ) + indel_support_ok = ( (float)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1; + if ( n_types == 1 || !indel_support_ok ) { // then skip + free(aux); return -1; + } + if (n_types >= 64) { + free(aux); + if (bam_verbose >= 2) + fprintf(stderr, "[%s] excessive INDEL alleles at position %d. 
Skip the position.\n", __func__, pos + 1); + return -1; + } + types = (int*)calloc(n_types, sizeof(int)); + t = 0; + types[t++] = aux[0] - MINUS_CONST; + for (i = 1; i < m; ++i) + if (aux[i] != aux[i-1]) + types[t++] = aux[i] - MINUS_CONST; + free(aux); + for (t = 0; t < n_types; ++t) + if (types[t] == 0) break; + ref_type = t; // the index of the reference type (0) + } + { // calculate left and right boundary + left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0; + right = pos + INDEL_WINDOW_SIZE; + if (types[0] < 0) right -= types[0]; + // in case the alignments stand out the reference + for (i = pos; i < right; ++i) + if (ref[i] == 0) break; + right = i; + } + /* The following block fixes a long-existing flaw in the INDEL + * calling model: the interference of nearby SNPs. However, it also + * reduces the power because sometimes, substitutions caused by + * indels are not distinguishable from true mutations. Multiple + * sequence realignment helps to increase the power. + * + * Masks mismatches present in at least 70% of the reads with 'N'. + */ + { // construct per-sample consensus + int L = right - left + 1, max_i, max2_i; + uint32_t *cns, max, max2; + char *ref0, *r; + ref_sample = calloc(n, sizeof(void*)); + cns = calloc(L, 4); + ref0 = calloc(L, 1); + for (i = 0; i < right - left; ++i) + ref0[i] = bam_nt16_table[(int)ref[i+left]]; + for (s = 0; s < n; ++s) { + r = ref_sample[s] = calloc(L, 1); + memset(cns, 0, sizeof(int) * L); + // collect ref and non-ref counts + for (i = 0; i < n_plp[s]; ++i) { + bam_pileup1_t *p = plp[s] + i; + bam1_t *b = p->b; + uint32_t *cigar = bam1_cigar(b); + uint8_t *seq = bam1_seq(b); + int x = b->core.pos, y = 0; + for (k = 0; k < b->core.n_cigar; ++k) { + int op = cigar[k]&0xf; + int j, l = cigar[k]>>4; + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + for (j = 0; j < l; ++j) + if (x + j >= left && x + j < right) + cns[x+j-left] += (bam1_seqi(seq, y+j) == ref0[x+j-left])? 
1 : 0x10000; + x += l; y += l; + } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l; + else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; + } + } + // determine the consensus + for (i = 0; i < right - left; ++i) r[i] = ref0[i]; + max = max2 = 0; max_i = max2_i = -1; + for (i = 0; i < right - left; ++i) { + if (cns[i]>>16 >= max>>16) max2 = max, max2_i = max_i, max = cns[i], max_i = i; + else if (cns[i]>>16 >= max2>>16) max2 = cns[i], max2_i = i; + } + if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) max_i = -1; + if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1; + if (max_i >= 0) r[max_i] = 15; + if (max2_i >= 0) r[max2_i] = 15; + //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); fputc('\n', stderr); + } + free(ref0); free(cns); + } + { // the length of the homopolymer run around the current position + int c = bam_nt16_table[(int)ref[pos + 1]]; + if (c == 15) l_run = 1; + else { + for (i = pos + 2; ref[i]; ++i) + if (bam_nt16_table[(int)ref[i]] != c) break; + l_run = i; + for (i = pos; i >= 0; --i) + if (bam_nt16_table[(int)ref[i]] != c) break; + l_run -= i + 1; + } + } + // construct the consensus sequence + max_ins = types[n_types - 1]; // max_ins is at least 0 + if (max_ins > 0) { + int *inscns_aux = calloc(5 * n_types * max_ins, sizeof(int)); + // count the number of occurrences of each base at each position for each type of insertion + for (t = 0; t < n_types; ++t) { + if (types[t] > 0) { + for (s = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i) { + bam_pileup1_t *p = plp[s] + i; + if (p->indel == types[t]) { + uint8_t *seq = bam1_seq(p->b); + for (k = 1; k <= p->indel; ++k) { + int c = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos + k)]; + assert(c<5); + ++inscns_aux[(t*max_ins+(k-1))*5 + c]; + } + } + } + } + } + } + // use the majority rule to construct the consensus + inscns = calloc(n_types * max_ins, 1); + for (t = 0; t < n_types; ++t) { + for (j = 0; j < types[t]; ++j) 
{ + int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5]; + for (k = 0; k < 5; ++k) + if (ia[k] > max) + max = ia[k], max_k = k; + inscns[t*max_ins + j] = max? max_k : 4; + if ( max_k==4 ) { types[t] = 0; break; } // discard insertions which contain N's + } + } + free(inscns_aux); + } + // compute the likelihood given each type of indel for each read + max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? max_ins : -types[0]); + ref2 = calloc(max_ref2, 1); + query = calloc(right - left + max_rd_len + max_ins + 2, 1); + score1 = calloc(N * n_types, sizeof(int)); + score2 = calloc(N * n_types, sizeof(int)); + bca->indelreg = 0; + for (t = 0; t < n_types; ++t) { + int l, ir; + kpa_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 }; + apf1.bw = apf2.bw = abs(types[t]) + 3; + // compute indelreg + if (types[t] == 0) ir = 0; + else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]); + else ir = est_indelreg(pos, ref, -types[t], 0); + if (ir > bca->indelreg) bca->indelreg = ir; +// fprintf(stderr, "%d, %d, %d\n", pos, types[t], ir); + // realignment + for (s = K = 0; s < n; ++s) { + // write ref2 + for (k = 0, j = left; j <= pos; ++j) + ref2[k++] = bam_nt16_nt4_table[(int)ref_sample[s][j-left]]; + if (types[t] <= 0) j += -types[t]; + else for (l = 0; l < types[t]; ++l) + ref2[k++] = inscns[t*max_ins + l]; + for (; j < right && ref[j]; ++j) + ref2[k++] = bam_nt16_nt4_table[(int)ref_sample[s][j-left]]; + for (; k < max_ref2; ++k) ref2[k] = 4; + if (j < right) right = j; + // align each read to ref2 + for (i = 0; i < n_plp[s]; ++i, ++K) { + bam_pileup1_t *p = plp[s] + i; + int qbeg, qend, tbeg, tend, sc, kk; + uint8_t *seq = bam1_seq(p->b); + uint32_t *cigar = bam1_cigar(p->b); + if (p->b->core.flag&4) continue; // unmapped reads + // FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway. 
+ for (kk = 0; kk < p->b->core.n_cigar; ++kk) + if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) break; + if (kk < p->b->core.n_cigar) continue; + // FIXME: the following skips soft clips, but using them may be more sensitive. + // determine the start and end of sequences for alignment + qbeg = tpos2qpos(&p->b->core, bam1_cigar(p->b), left, 0, &tbeg); + qend = tpos2qpos(&p->b->core, bam1_cigar(p->b), right, 1, &tend); + if (types[t] < 0) { + int l = -types[t]; + tbeg = tbeg - l > left? tbeg - l : left; + } + // write the query sequence + for (l = qbeg; l < qend; ++l) + query[l - qbeg] = bam_nt16_nt4_table[bam1_seqi(seq, l)]; + { // do realignment; this is the bottleneck + const uint8_t *qual = bam1_qual(p->b), *bq; + uint8_t *qq; + qq = calloc(qend - qbeg, 1); + bq = (uint8_t*)bam_aux_get(p->b, "ZQ"); + if (bq) ++bq; // skip type + for (l = qbeg; l < qend; ++l) { + qq[l - qbeg] = bq? qual[l] + (bq[l] - 64) : qual[l]; + if (qq[l - qbeg] > 30) qq[l - qbeg] = 30; + if (qq[l - qbeg] < 7) qq[l - qbeg] = 7; + } + sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), + (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0); + l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below + if (l > 255) l = 255; + score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l; + if (sc > 5) { + sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]), + (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0); + l = (int)(100. 
* sc / (qend - qbeg) + .499); + if (l > 255) l = 255; + score2[K*n_types + t] = sc<<8 | l; + } + free(qq); + } +/* + for (l = 0; l < tend - tbeg + abs(types[t]); ++l) + fputc("ACGTN"[(int)ref2[tbeg-left+l]], stderr); + fputc('\n', stderr); + for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], stderr); + fputc('\n', stderr); + fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam1_qname(p->b), qbeg, tbeg, sc); +*/ + } + } + } + free(ref2); free(query); + { // compute indelQ + int *sc, tmp, *sumq; + sc = alloca(n_types * sizeof(int)); + sumq = alloca(n_types * sizeof(int)); + memset(sumq, 0, sizeof(int) * n_types); + for (s = K = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i, ++K) { + bam_pileup1_t *p = plp[s] + i; + int *sct = &score1[K*n_types], indelQ1, indelQ2, seqQ, indelQ; + for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; + for (t = 1; t < n_types; ++t) // insertion sort + for (j = t; j > 0 && sc[j] < sc[j-1]; --j) + tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp; + /* errmod_cal() assumes that if the call is wrong, the + * likelihoods of other events are equal. This is about + * right for substitutions, but is not desired for + * indels. To reuse errmod_cal(), I have to make + * compromise for multi-allelic indels. + */ + if ((sc[0]&0x3f) == ref_type) { + indelQ1 = (sc[1]>>14) - (sc[0]>>14); + seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run); + } else { + for (t = 0; t < n_types; ++t) // look for the reference type + if ((sc[t]&0x3f) == ref_type) break; + indelQ1 = (sc[t]>>14) - (sc[0]>>14); + seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run); + } + tmp = sc[0]>>6 & 0xff; + indelQ1 = tmp > 111? 0 : (int)((1. - tmp/111.) 
* indelQ1 + .499); // reduce indelQ + sct = &score2[K*n_types]; + for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; + for (t = 1; t < n_types; ++t) // insertion sort + for (j = t; j > 0 && sc[j] < sc[j-1]; --j) + tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp; + if ((sc[0]&0x3f) == ref_type) { + indelQ2 = (sc[1]>>14) - (sc[0]>>14); + } else { + for (t = 0; t < n_types; ++t) // look for the reference type + if ((sc[t]&0x3f) == ref_type) break; + indelQ2 = (sc[t]>>14) - (sc[0]>>14); + } + tmp = sc[0]>>6 & 0xff; + indelQ2 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ2 + .499); + // pick the smaller between indelQ1 and indelQ2 + indelQ = indelQ1 < indelQ2? indelQ1 : indelQ2; + if (indelQ > 255) indelQ = 255; + if (seqQ > 255) seqQ = 255; + p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total + sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ; +// fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); + } + } + // determine bca->indel_types[] and bca->inscns + bca->maxins = max_ins; + bca->inscns = realloc(bca->inscns, bca->maxins * 4); + for (t = 0; t < n_types; ++t) + sumq[t] = sumq[t]<<6 | t; + for (t = 1; t < n_types; ++t) // insertion sort + for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j) + tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp; + for (t = 0; t < n_types; ++t) // look for the reference type + if ((sumq[t]&0x3f) == ref_type) break; + if (t) { // then move the reference type to the first + tmp = sumq[t]; + for (; t > 0; --t) sumq[t] = sumq[t-1]; + sumq[0] = tmp; + } + for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL; + for (t = 0; t < 4 && t < n_types; ++t) { + bca->indel_types[t] = types[sumq[t]&0x3f]; + memcpy(&bca->inscns[t * bca->maxins], &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins); + } + // update p->aux + for (s = n_alt = 0; s < n; ++s) { + for (i = 0; i < n_plp[s]; ++i) { + bam_pileup1_t *p = plp[s] + i; + int x = 
types[p->aux>>16&0x3f]; + for (j = 0; j < 4; ++j) + if (x == bca->indel_types[j]) break; + p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff)); + if ((p->aux>>16&0x3f) > 0) ++n_alt; +// fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d q=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), p->aux>>16&63, bca->indel_types[p->aux>>16&63], p->aux&0xff, p->aux>>8&0xff); + } + } + } + free(score1); free(score2); + // free + for (i = 0; i < n; ++i) free(ref_sample[i]); + free(ref_sample); + free(types); free(inscns); + return n_alt > 0? 0 : -1; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bam2depth.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,143 @@ +/* This program demonstrates how to generate pileup from multiple BAMs + * simutaneously, to achieve random access and to use the BED interface. + * To compile this program separately, you may: + * + * gcc -g -O2 -Wall -o bam2depth -D_MAIN_BAM2DEPTH bam2depth.c -L. -lbam -lz + */ +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <unistd.h> +#include "bam.h" + +typedef struct { // auxiliary data structure + bamFile fp; // the file handler + bam_iter_t iter; // NULL if a region not specified + int min_mapQ, min_len; // mapQ filter; length filter +} aux_t; + +void *bed_read(const char *fn); // read a BED or position list file +void bed_destroy(void *_h); // destroy the BED data structure +int bed_overlap(const void *_h, const char *chr, int beg, int end); // test if chr:beg-end overlaps + +// This function reads a BAM alignment from one BAM file. +static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup +{ + aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure + int ret = aux->iter? 
bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b); + if (!(b->core.flag&BAM_FUNMAP)) { + if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP; + else if (aux->min_len && bam_cigar2qlen(&b->core, bam1_cigar(b)) < aux->min_len) b->core.flag |= BAM_FUNMAP; + } + return ret; +} + +int read_file_list(const char *file_list,int *n,char **argv[]); + +#ifdef _MAIN_BAM2DEPTH +int main(int argc, char *argv[]) +#else +int main_depth(int argc, char *argv[]) +#endif +{ + int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, nfiles; + const bam_pileup1_t **plp; + char *reg = 0; // specified region + void *bed = 0; // BED data structure + char *file_list = NULL, **fn = NULL; + bam_header_t *h = 0; // BAM header of the 1st input + aux_t **data; + bam_mplp_t mplp; + + // parse the command line + while ((n = getopt(argc, argv, "r:b:q:Q:l:f:")) >= 0) { + switch (n) { + case 'l': min_len = atoi(optarg); break; // minimum query length + case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header + case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now + case 'q': baseQ = atoi(optarg); break; // base quality threshold + case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold + case 'f': file_list = optarg; break; + } + } + if (optind == argc && !file_list) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n"); + fprintf(stderr, "Options:\n"); + fprintf(stderr, " -b <bed> list of positions or regions\n"); + fprintf(stderr, " -f <list> list of input BAM filenames, one per line [null]\n"); + fprintf(stderr, " -l <int> minQLen\n"); + fprintf(stderr, " -q <int> base quality threshold\n"); + fprintf(stderr, " -Q <int> mapping quality threshold\n"); + fprintf(stderr, " -r <chr:from-to> region\n"); + fprintf(stderr, "\n"); + return 1; + } + + // initialize the auxiliary data structures + if (file_list) + { + if ( 
read_file_list(file_list,&nfiles,&fn) ) return 1; + n = nfiles; + argv = fn; + optind = 0; + } + else + n = argc - optind; // the number of BAMs on the command line + data = calloc(n, sizeof(void*)); // data[i] for the i-th input + beg = 0; end = 1<<30; tid = -1; // set the default region + for (i = 0; i < n; ++i) { + bam_header_t *htmp; + data[i] = calloc(1, sizeof(aux_t)); + data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM + data[i]->min_mapQ = mapQ; // set the mapQ filter + data[i]->min_len = min_len; // set the qlen filter + htmp = bam_header_read(data[i]->fp); // read the BAM header + if (i == 0) { + h = htmp; // keep the header of the 1st BAM + if (reg) bam_parse_region(h, reg, &tid, &beg, &end); // also parse the region + } else bam_header_destroy(htmp); // if not the 1st BAM, trash the header + if (tid >= 0) { // if a region is specified and parsed successfully + bam_index_t *idx = bam_index_load(argv[optind+i]); // load the index + data[i]->iter = bam_iter_query(idx, tid, beg, end); // set the iterator + bam_index_destroy(idx); // the index is not needed any more; phase out of the memory + } + } + + // the core multi-pileup loop + mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization + n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM + plp = calloc(n, sizeof(void*)); // plp[i] points to the array of covering reads (internal in mplp) + while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position + if (pos < beg || pos >= end) continue; // out of range; skip + if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip + fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster + for (i = 0; i < n; ++i) { // base level filters have to go here + int j, m = 0; + for (j = 0; j < n_plp[i]; ++j) { + const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know + 
if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos + else if (bam1_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality + } + printf("\t%d", n_plp[i] - m); // this the depth to output + } + putchar('\n'); + } + free(n_plp); free(plp); + bam_mplp_destroy(mplp); + + bam_header_destroy(h); + for (i = 0; i < n; ++i) { + bam_close(data[i]->fp); + if (data[i]->iter) bam_iter_destroy(data[i]->iter); + free(data[i]); + } + free(data); free(reg); + if (bed) bed_destroy(bed); + if ( file_list ) + { + for (i=0; i<n; i++) free(fn[i]); + free(fn); + } + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bam_aux.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,217 @@ +#include <ctype.h> +#include "bam.h" +#include "khash.h" +typedef char *str_p; +KHASH_MAP_INIT_STR(s, int) +KHASH_MAP_INIT_STR(r2l, str_p) + +void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data) +{ + int ori_len = b->data_len; + b->data_len += 3 + len; + b->l_aux += 3 + len; + if (b->m_data < b->data_len) { + b->m_data = b->data_len; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } + b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1]; + b->data[ori_len + 2] = type; + memcpy(b->data + ori_len + 3, data, len); +} + +uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]) +{ + return bam_aux_get(b, tag); +} + +#define __skip_tag(s) do { \ + int type = toupper(*(s)); \ + ++(s); \ + if (type == 'Z' || type == 'H') { while (*(s)) ++(s); ++(s); } \ + else if (type == 'B') (s) += 5 + bam_aux_type2size(*(s)) * (*(int32_t*)((s)+1)); \ + else (s) += bam_aux_type2size(type); \ + } while(0) + +uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) +{ + uint8_t *s; + int y = tag[0]<<8 | tag[1]; + s = bam1_aux(b); + while (s < b->data + b->data_len) { + int x = (int)s[0]<<8 | s[1]; + s += 2; + if (x == y) return s; + __skip_tag(s); + } + return 0; +} +// s MUST BE returned by bam_aux_get() +int bam_aux_del(bam1_t *b, uint8_t *s) +{ + uint8_t *p, *aux; + aux = bam1_aux(b); + p = s - 2; + __skip_tag(s); + memmove(p, s, b->l_aux - (s - aux)); + b->data_len -= s - p; + b->l_aux -= s - p; + return 0; +} + +int bam_aux_drop_other(bam1_t *b, uint8_t *s) +{ + if (s) { + uint8_t *p, *aux; + aux = bam1_aux(b); + p = s - 2; + __skip_tag(s); + memmove(aux, p, s - p); + b->data_len -= b->l_aux - (s - p); + b->l_aux = s - p; + } else { + b->data_len -= b->l_aux; + b->l_aux = 0; + } + return 0; +} + +void bam_init_header_hash(bam_header_t *header) +{ + if (header->hash == 0) { + int 
ret, i; + khiter_t iter; + khash_t(s) *h; + header->hash = h = kh_init(s); + for (i = 0; i < header->n_targets; ++i) { + iter = kh_put(s, h, header->target_name[i], &ret); + kh_value(h, iter) = i; + } + } +} + +void bam_destroy_header_hash(bam_header_t *header) +{ + if (header->hash) + kh_destroy(s, (khash_t(s)*)header->hash); +} + +int32_t bam_get_tid(const bam_header_t *header, const char *seq_name) +{ + khint_t k; + khash_t(s) *h = (khash_t(s)*)header->hash; + k = kh_get(s, h, seq_name); + return k == kh_end(h)? -1 : kh_value(h, k); +} + +int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *beg, int *end) +{ + char *s; + int i, l, k, name_end; + khiter_t iter; + khash_t(s) *h; + + bam_init_header_hash(header); + h = (khash_t(s)*)header->hash; + + *ref_id = *beg = *end = -1; + name_end = l = strlen(str); + s = (char*)malloc(l+1); + // remove space + for (i = k = 0; i < l; ++i) + if (!isspace(str[i])) s[k++] = str[i]; + s[k] = 0; l = k; + // determine the sequence name + for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end + if (i >= 0) name_end = i; + if (name_end < l) { // check if this is really the end + int n_hyphen = 0; + for (i = name_end + 1; i < l; ++i) { + if (s[i] == '-') ++n_hyphen; + else if (!isdigit(s[i]) && s[i] != ',') break; + } + if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name + s[name_end] = 0; + iter = kh_get(s, h, s); + if (iter == kh_end(h)) { // cannot find the sequence name + iter = kh_get(s, h, str); // try str as the name + if (iter == kh_end(h)) { + if (bam_verbose >= 2) fprintf(stderr, "[%s] fail to determine the sequence name.\n", __func__); + free(s); return -1; + } else s[name_end] = ':', name_end = l; + } + } else iter = kh_get(s, h, str); + if (iter == kh_end(h)) { + free(s); + return -1; + } + *ref_id = kh_val(h, iter); + // parse the interval + if (name_end < l) { + for (i = k = name_end + 1; i < l; ++i) + if (s[i] != ',') 
s[k++] = s[i]; + s[k] = 0; + *beg = atoi(s + name_end + 1); + for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break; + *end = i < k? atoi(s + i + 1) : 1<<29; + if (*beg > 0) --*beg; + } else *beg = 0, *end = 1<<29; + free(s); + return *beg <= *end? 0 : -1; +} + +int32_t bam_aux2i(const uint8_t *s) +{ + int type; + if (s == 0) return 0; + type = *s++; + if (type == 'c') return (int32_t)*(int8_t*)s; + else if (type == 'C') return (int32_t)*(uint8_t*)s; + else if (type == 's') return (int32_t)*(int16_t*)s; + else if (type == 'S') return (int32_t)*(uint16_t*)s; + else if (type == 'i' || type == 'I') return *(int32_t*)s; + else return 0; +} + +float bam_aux2f(const uint8_t *s) +{ + int type; + type = *s++; + if (s == 0) return 0.0; + if (type == 'f') return *(float*)s; + else return 0.0; +} + +double bam_aux2d(const uint8_t *s) +{ + int type; + type = *s++; + if (s == 0) return 0.0; + if (type == 'd') return *(double*)s; + else return 0.0; +} + +char bam_aux2A(const uint8_t *s) +{ + int type; + type = *s++; + if (s == 0) return 0; + if (type == 'A') return *(char*)s; + else return 0; +} + +char *bam_aux2Z(const uint8_t *s) +{ + int type; + type = *s++; + if (s == 0) return 0; + if (type == 'Z' || type == 'H') return (char*)s; + else return 0; +} + +#ifdef _WIN32 +double drand48() +{ + return (double)rand() / RAND_MAX; +} +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bam_cat.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,185 @@ +/* + +bam_cat -- efficiently concatenates bam files + +bam_cat can be used to concatenate BAM files. Under special +circumstances, it can be used as an alternative to 'samtools merge' to +concatenate multiple sorted files into a single sorted file. For this +to work each file must be sorted, and the sorted files must be given +as command line arguments in order such that the final read in file i +is less than or equal to the first read in file i+1. + +This code is derived from the bam_reheader function in samtools 0.1.8 +and modified to perform concatenation by Chris Saunders on behalf of +Illumina. + + +########## License: + +The MIT License + +Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd. +Modified SAMtools work copyright (c) 2010 Illumina, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+ +*/ + + +/* +makefile: +""" +CC=gcc +CFLAGS+=-g -Wall -O2 -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -I$(SAMTOOLS_DIR) +LDFLAGS+=-L$(SAMTOOLS_DIR) +LDLIBS+=-lbam -lz + +all:bam_cat +""" +*/ + + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#include "knetfile.h" +#include "bgzf.h" +#include "bam.h" + +#define BUF_SIZE 0x10000 + +#define GZIPID1 31 +#define GZIPID2 139 + +#define BGZF_EMPTY_BLOCK_SIZE 28 + + +int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam) +{ + BGZF *fp; + FILE* fp_file; + uint8_t *buf; + uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE]; + const int es=BGZF_EMPTY_BLOCK_SIZE; + int i; + + fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(stdout), "w"); + if (fp == 0) { + fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outbam); + return 1; + } + if (h) bam_header_write(fp, h); + + buf = (uint8_t*) malloc(BUF_SIZE); + for(i = 0; i < nfn; ++i){ + BGZF *in; + bam_header_t *old; + int len,j; + + in = strcmp(fn[i], "-")? 
bam_open(fn[i], "r") : bam_dopen(fileno(stdin), "r"); + if (in == 0) { + fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]); + return -1; + } + if (in->is_write) return -1; + + old = bam_header_read(in); + if (h == 0 && i == 0) bam_header_write(fp, old); + + if (in->block_offset < in->block_length) { + bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset); + bgzf_flush(fp); + } + + j=0; +#ifdef _USE_KNETFILE + fp_file = fp->fp; + while ((len = knet_read(in->fp, buf, BUF_SIZE)) > 0) { +#else + fp_file = fp->fp; + while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) { +#endif + if(len<es){ + int diff=es-len; + if(j==0) { + fprintf(stderr, "[%s] ERROR: truncated file?: '%s'.\n", __func__, fn[i]); + return -1; + } + fwrite(ebuf, 1, len, fp_file); + memcpy(ebuf,ebuf+len,diff); + memcpy(ebuf+diff,buf,len); + } else { + if(j!=0) fwrite(ebuf, 1, es, fp_file); + len-= es; + memcpy(ebuf,buf+len,es); + fwrite(buf, 1, len, fp_file); + } + j=1; + } + + /* check final gzip block */ + { + const uint8_t gzip1=ebuf[0]; + const uint8_t gzip2=ebuf[1]; + const uint32_t isize=*((uint32_t*)(ebuf+es-4)); + if(((gzip1!=GZIPID1) || (gzip2!=GZIPID2)) || (isize!=0)) { + fprintf(stderr, "[%s] WARNING: Unexpected block structure in file '%s'.", __func__, fn[i]); + fprintf(stderr, " Possible output corruption.\n"); + fwrite(ebuf, 1, es, fp_file); + } + } + bam_header_destroy(old); + bgzf_close(in); + } + free(buf); + bgzf_close(fp); + return 0; +} + + + +int main_cat(int argc, char *argv[]) +{ + bam_header_t *h = 0; + char *outfn = 0; + int c, ret; + while ((c = getopt(argc, argv, "h:o:")) >= 0) { + switch (c) { + case 'h': { + tamFile fph = sam_open(optarg); + if (fph == 0) { + fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, argv[1]); + return 1; + } + h = sam_header_read(fph); + sam_close(fph); + break; + } + case 'o': outfn = strdup(optarg); break; + } + } + if (argc - optind < 
2) { + fprintf(stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] <in1.bam> <in2.bam> [...]\n"); + return 1; + } + ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-"); + free(outfn); + return ret; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bam_color.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,145 @@ +#include <ctype.h> +#include "bam.h" + +/*! + @abstract Get the color encoding the previous and current base + @param b pointer to an alignment + @param i The i-th position, 0-based + @return color + + @discussion Returns 0 no color information is found. + */ +char bam_aux_getCSi(bam1_t *b, int i) +{ + uint8_t *c = bam_aux_get(b, "CS"); + char *cs = NULL; + + // return the base if the tag was not found + if(0 == c) return 0; + + cs = bam_aux2Z(c); + // adjust for strandedness and leading adaptor + if(bam1_strand(b)) { + i = strlen(cs) - 1 - i; + // adjust for leading hard clip + uint32_t cigar = bam1_cigar(b)[0]; + if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) { + i -= cigar >> BAM_CIGAR_SHIFT; + } + } else { i++; } + return cs[i]; +} + +/*! + @abstract Get the color quality of the color encoding the previous and current base + @param b pointer to an alignment + @param i The i-th position, 0-based + @return color quality + + @discussion Returns 0 no color information is found. + */ +char bam_aux_getCQi(bam1_t *b, int i) +{ + uint8_t *c = bam_aux_get(b, "CQ"); + char *cq = NULL; + + // return the base if the tag was not found + if(0 == c) return 0; + + cq = bam_aux2Z(c); + // adjust for strandedness + if(bam1_strand(b)) { + i = strlen(cq) - 1 - i; + // adjust for leading hard clip + uint32_t cigar = bam1_cigar(b)[0]; + if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) { + i -= (cigar >> BAM_CIGAR_SHIFT); + } + } + return cq[i]; +} + +char bam_aux_nt2int(char a) +{ + switch(toupper(a)) { + case 'A': + return 0; + break; + case 'C': + return 1; + break; + case 'G': + return 2; + break; + case 'T': + return 3; + break; + default: + return 4; + break; + } +} + +char bam_aux_ntnt2cs(char a, char b) +{ + a = bam_aux_nt2int(a); + b = bam_aux_nt2int(b); + if(4 == a || 4 == b) return '4'; + return "0123"[(int)(a ^ b)]; +} + +/*! 
+ @abstract Get the color error profile at the give position + @param b pointer to an alignment + @return the original color if the color was an error, '-' (dash) otherwise + + @discussion Returns 0 no color information is found. + */ +char bam_aux_getCEi(bam1_t *b, int i) +{ + int cs_i; + uint8_t *c = bam_aux_get(b, "CS"); + char *cs = NULL; + char prev_b, cur_b; + char cur_color, cor_color; + + // return the base if the tag was not found + if(0 == c) return 0; + + cs = bam_aux2Z(c); + + // adjust for strandedness and leading adaptor + if(bam1_strand(b)) { //reverse strand + cs_i = strlen(cs) - 1 - i; + // adjust for leading hard clip + uint32_t cigar = bam1_cigar(b)[0]; + if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) { + cs_i -= cigar >> BAM_CIGAR_SHIFT; + } + // get current color + cur_color = cs[cs_i]; + // get previous base. Note: must rc adaptor + prev_b = (cs_i == 1) ? "TGCAN"[(int)bam_aux_nt2int(cs[0])] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i+1)]; + // get current base + cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]; + } + else { + cs_i=i+1; + // get current color + cur_color = cs[cs_i]; + // get previous base + prev_b = (0 == i) ? cs[0] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i-1)]; + // get current base + cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]; + } + + // corrected color + cor_color = bam_aux_ntnt2cs(prev_b, cur_b); + + if(cur_color == cor_color) { + return '-'; + } + else { + return cur_color; + } +}
/* File: PsiCLASS-1.0.2/samtools-0.1.19/bam_endian.h
 *
 * Byte-order detection and byte-swapping helpers for 2-, 4- and 8-byte
 * unsigned integers.
 */
#ifndef BAM_ENDIAN_H
#define BAM_ENDIAN_H

#include <stdint.h>
#include <string.h>

/* Return 1 when the lowest-addressed byte of an integer 1 is zero
 * (big-endian host), 0 otherwise. */
static inline int bam_is_big_endian(void)
{
    long one = 1;
    return !(*((char *)(&one)));
}

/* Return v with its two bytes exchanged. */
static inline uint16_t bam_swap_endian_2(uint16_t v)
{
    return (uint16_t)(((v & 0x00FFU) << 8) | ((v & 0xFF00U) >> 8));
}

/*
 * Byte-swap the 2-byte value stored at x in place and return x.
 * The value is moved through memcpy so x may be an arbitrary (possibly
 * unaligned) position inside a byte buffer.
 */
static inline void *bam_swap_endian_2p(void *x)
{
    uint16_t v;
    memcpy(&v, x, sizeof(v));
    v = bam_swap_endian_2(v);
    memcpy(x, &v, sizeof(v));
    return x;
}

/* Return v with its four bytes reversed. */
static inline uint32_t bam_swap_endian_4(uint32_t v)
{
    v = ((v & 0x0000FFFFU) << 16) | (v >> 16);              /* swap 16-bit halves */
    return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8);  /* swap bytes in each half */
}

/* Byte-swap the 4-byte value stored at x in place and return x
 * (alignment-safe, see bam_swap_endian_2p). */
static inline void *bam_swap_endian_4p(void *x)
{
    uint32_t v;
    memcpy(&v, x, sizeof(v));
    v = bam_swap_endian_4(v);
    memcpy(x, &v, sizeof(v));
    return x;
}

/* Return v with its eight bytes reversed. */
static inline uint64_t bam_swap_endian_8(uint64_t v)
{
    v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32);
    v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16);
    return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8);
}

/* Byte-swap the 8-byte value stored at x in place and return x
 * (alignment-safe, see bam_swap_endian_2p). */
static inline void *bam_swap_endian_8p(void *x)
{
    uint64_t v;
    memcpy(&v, x, sizeof(v));
    v = bam_swap_endian_8(v);
    memcpy(x, &v, sizeof(v));
    return x;
}

#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bam_import.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,489 @@ +#include <zlib.h> +#include <stdio.h> +#include <ctype.h> +#include <string.h> +#include <stdlib.h> +#include <unistd.h> +#include <assert.h> +#ifdef _WIN32 +#include <fcntl.h> +#endif +#include "kstring.h" +#include "bam.h" +#include "sam_header.h" +#include "kseq.h" +#include "khash.h" + +KSTREAM_INIT(gzFile, gzread, 16384) +KHASH_MAP_INIT_STR(ref, uint64_t) + +void bam_init_header_hash(bam_header_t *header); +void bam_destroy_header_hash(bam_header_t *header); +int32_t bam_get_tid(const bam_header_t *header, const char *seq_name); + +unsigned char bam_nt16_table[256] = { + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15, + 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, + 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15, + 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, + 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 +}; + +unsigned short bam_char2flag_table[256] = { + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, + 0,BAM_FREAD1,BAM_FREAD2,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, + BAM_FPROPER_PAIR,0,BAM_FMREVERSE,0, 0,BAM_FMUNMAP,0,0, 0,0,0,0, 0,0,0,0, + 0,0,0,0, BAM_FDUP,0,BAM_FQCFAIL,0, 0,0,0,0, 0,0,0,0, + 
BAM_FPAIRED,0,BAM_FREVERSE,BAM_FSECONDARY, 0,BAM_FUNMAP,0,0, 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0 +}; + +char *bam_nt16_rev_table = "=ACMGRSVTWYHKDBN"; + +struct __tamFile_t { + gzFile fp; + kstream_t *ks; + kstring_t *str; + uint64_t n_lines; + int is_first; +}; + +char **__bam_get_lines(const char *fn, int *_n) // for bam_plcmd.c only +{ + char **list = 0, *s; + int n = 0, dret, m = 0; + gzFile fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); + kstream_t *ks; + kstring_t *str; + str = (kstring_t*)calloc(1, sizeof(kstring_t)); + ks = ks_init(fp); + while (ks_getuntil(ks, '\n', str, &dret) > 0) { + if (n == m) { + m = m? m << 1 : 16; + list = (char**)realloc(list, m * sizeof(char*)); + } + if (str->s[str->l-1] == '\r') + str->s[--str->l] = '\0'; + s = list[n++] = (char*)calloc(str->l + 1, 1); + strcpy(s, str->s); + } + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + *_n = n; + return list; +} + +static bam_header_t *hash2header(const kh_ref_t *hash) +{ + bam_header_t *header; + khiter_t k; + header = bam_header_init(); + header->n_targets = kh_size(hash); + header->target_name = (char**)calloc(kh_size(hash), sizeof(char*)); + header->target_len = (uint32_t*)calloc(kh_size(hash), 4); + for (k = kh_begin(hash); k != kh_end(hash); ++k) { + if (kh_exist(hash, k)) { + int i = (int)kh_value(hash, k); + header->target_name[i] = (char*)kh_key(hash, k); + header->target_len[i] = kh_value(hash, k)>>32; + } + } + bam_init_header_hash(header); + return header; +} +bam_header_t *sam_header_read2(const char *fn) +{ + bam_header_t *header; + int c, dret, ret, error = 0; + gzFile fp; + kstream_t *ks; + kstring_t *str; + kh_ref_t *hash; + khiter_t k; + if (fn == 0) 
return 0; + fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); + if (fp == 0) return 0; + hash = kh_init(ref); + ks = ks_init(fp); + str = (kstring_t*)calloc(1, sizeof(kstring_t)); + while (ks_getuntil(ks, 0, str, &dret) > 0) { + char *s = strdup(str->s); + int len, i; + i = kh_size(hash); + ks_getuntil(ks, 0, str, &dret); + len = atoi(str->s); + k = kh_put(ref, hash, s, &ret); + if (ret == 0) { + fprintf(stderr, "[sam_header_read2] duplicated sequence name: %s\n", s); + error = 1; + } + kh_value(hash, k) = (uint64_t)len<<32 | i; + if (dret != '\n') + while ((c = ks_getc(ks)) != '\n' && c != -1); + } + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + fprintf(stderr, "[sam_header_read2] %d sequences loaded.\n", kh_size(hash)); + if (error) return 0; + header = hash2header(hash); + kh_destroy(ref, hash); + return header; +} +static inline uint8_t *alloc_data(bam1_t *b, int size) +{ + if (b->m_data < size) { + b->m_data = size; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } + return b->data; +} +static inline void parse_error(int64_t n_lines, const char * __restrict msg) +{ + fprintf(stderr, "Parse error at line %lld: %s\n", (long long)n_lines, msg); + abort(); +} +static inline void append_text(bam_header_t *header, kstring_t *str) +{ + size_t x = header->l_text, y = header->l_text + str->l + 2; // 2 = 1 byte dret + 1 byte null + kroundup32(x); kroundup32(y); + if (x < y) + { + header->n_text = y; + header->text = (char*)realloc(header->text, y); + if ( !header->text ) + { + fprintf(stderr,"realloc failed to alloc %ld bytes\n", y); + abort(); + } + } + // Sanity check + if ( header->l_text+str->l+1 >= header->n_text ) + { + fprintf(stderr,"append_text FIXME: %ld>=%ld, x=%ld,y=%ld\n", header->l_text+str->l+1,(long)header->n_text,x,y); + abort(); + } + strncpy(header->text + header->l_text, str->s, str->l+1); // we cannot use strcpy() here. 
+ header->l_text += str->l + 1; + header->text[header->l_text] = 0; +} + +int sam_header_parse(bam_header_t *h) +{ + char **tmp; + int i; + free(h->target_len); free(h->target_name); + h->n_targets = 0; h->target_len = 0; h->target_name = 0; + if (h->l_text < 3) return 0; + if (h->dict == 0) h->dict = sam_header_parse2(h->text); + tmp = sam_header2list(h->dict, "SQ", "SN", &h->n_targets); + if (h->n_targets == 0) return 0; + h->target_name = calloc(h->n_targets, sizeof(void*)); + for (i = 0; i < h->n_targets; ++i) + h->target_name[i] = strdup(tmp[i]); + free(tmp); + tmp = sam_header2list(h->dict, "SQ", "LN", &h->n_targets); + h->target_len = calloc(h->n_targets, 4); + for (i = 0; i < h->n_targets; ++i) + h->target_len[i] = atoi(tmp[i]); + free(tmp); + return h->n_targets; +} + +bam_header_t *sam_header_read(tamFile fp) +{ + int ret, dret; + bam_header_t *header = bam_header_init(); + kstring_t *str = fp->str; + while ((ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret)) >= 0 && str->s[0] == '@') { // skip header + str->s[str->l] = dret; // note that str->s is NOT null terminated!! + append_text(header, str); + if (dret != '\n') { + ret = ks_getuntil(fp->ks, '\n', str, &dret); + str->s[str->l] = '\n'; // NOT null terminated!! 
+ append_text(header, str); + } + ++fp->n_lines; + } + sam_header_parse(header); + bam_init_header_hash(header); + fp->is_first = 1; + return header; +} + +int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) +{ + int ret, doff, doff0, dret, z = 0; + bam1_core_t *c = &b->core; + kstring_t *str = fp->str; + kstream_t *ks = fp->ks; + + if (fp->is_first) { + fp->is_first = 0; + ret = str->l; + } else { + do { // special consideration for empty lines + ret = ks_getuntil(fp->ks, KS_SEP_TAB, str, &dret); + if (ret >= 0) z += str->l + 1; + } while (ret == 0); + } + if (ret < 0) return -1; + ++fp->n_lines; + doff = 0; + + { // name + c->l_qname = strlen(str->s) + 1; + memcpy(alloc_data(b, doff + c->l_qname) + doff, str->s, c->l_qname); + doff += c->l_qname; + } + { // flag + long flag; + char *s; + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; + flag = strtol((char*)str->s, &s, 0); + if (*s) { // not the end of the string + flag = 0; + for (s = str->s; *s; ++s) + flag |= bam_char2flag_table[(int)*s]; + } + c->flag = flag; + } + { // tid, pos, qual + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->tid = bam_get_tid(header, str->s); + if (c->tid < 0 && strcmp(str->s, "*")) { + if (header->n_targets == 0) { + fprintf(stderr, "[sam_read1] missing header? Abort!\n"); + exit(1); + } else fprintf(stderr, "[sam_read1] reference '%s' is recognized as '*'.\n", str->s); + } + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->pos = isdigit(str->s[0])? atoi(str->s) - 1 : -1; + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; c->qual = isdigit(str->s[0])? 
atoi(str->s) : 0; + if (ret < 0) return -2; + } + { // cigar + char *s, *t; + int i, op; + long x; + c->n_cigar = 0; + if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -3; + z += str->l + 1; + if (str->s[0] != '*') { + uint32_t *cigar; + for (s = str->s; *s; ++s) { + if ((isalpha(*s)) || (*s=='=')) ++c->n_cigar; + else if (!isdigit(*s)) parse_error(fp->n_lines, "invalid CIGAR character"); + } + b->data = alloc_data(b, doff + c->n_cigar * 4); + cigar = bam1_cigar(b); + for (i = 0, s = str->s; i != c->n_cigar; ++i) { + x = strtol(s, &t, 10); + op = toupper(*t); + if (op == 'M') op = BAM_CMATCH; + else if (op == 'I') op = BAM_CINS; + else if (op == 'D') op = BAM_CDEL; + else if (op == 'N') op = BAM_CREF_SKIP; + else if (op == 'S') op = BAM_CSOFT_CLIP; + else if (op == 'H') op = BAM_CHARD_CLIP; + else if (op == 'P') op = BAM_CPAD; + else if (op == '=') op = BAM_CEQUAL; + else if (op == 'X') op = BAM_CDIFF; + else if (op == 'B') op = BAM_CBACK; + else parse_error(fp->n_lines, "invalid CIGAR operation"); + s = t + 1; + cigar[i] = bam_cigar_gen(x, op); + } + if (*s) parse_error(fp->n_lines, "unmatched CIGAR operation"); + c->bin = bam_reg2bin(c->pos, bam_calend(c, cigar)); + doff += c->n_cigar * 4; + } else { + if (!(c->flag&BAM_FUNMAP)) { + fprintf(stderr, "Parse warning at line %lld: mapped sequence without CIGAR\n", (long long)fp->n_lines); + c->flag |= BAM_FUNMAP; + } + c->bin = bam_reg2bin(c->pos, c->pos + 1); + } + } + { // mtid, mpos, isize + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; + c->mtid = strcmp(str->s, "=")? bam_get_tid(header, str->s) : c->tid; + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; + c->mpos = isdigit(str->s[0])? atoi(str->s) - 1 : -1; + ret = ks_getuntil(ks, KS_SEP_TAB, str, &dret); z += str->l + 1; + c->isize = (str->s[0] == '-' || isdigit(str->s[0]))? 
atoi(str->s) : 0; + if (ret < 0) return -4; + } + { // seq and qual + int i; + uint8_t *p = 0; + if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -5; // seq + z += str->l + 1; + if (strcmp(str->s, "*")) { + c->l_qseq = strlen(str->s); + if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) { + fprintf(stderr, "Line %ld, sequence length %i vs %i from CIGAR\n", + (long)fp->n_lines, c->l_qseq, (int32_t)bam_cigar2qlen(c, bam1_cigar(b))); + parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent"); + } + p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff; + memset(p, 0, (c->l_qseq+1)/2); + for (i = 0; i < c->l_qseq; ++i) + p[i/2] |= bam_nt16_table[(int)str->s[i]] << 4*(1-i%2); + } else c->l_qseq = 0; + if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -6; // qual + z += str->l + 1; + if (strcmp(str->s, "*") && c->l_qseq != strlen(str->s)) + parse_error(fp->n_lines, "sequence and quality are inconsistent"); + p += (c->l_qseq+1)/2; + if (strcmp(str->s, "*") == 0) for (i = 0; i < c->l_qseq; ++i) p[i] = 0xff; + else for (i = 0; i < c->l_qseq; ++i) p[i] = str->s[i] - 33; + doff += c->l_qseq + (c->l_qseq+1)/2; + } + doff0 = doff; + if (dret != '\n' && dret != '\r') { // aux + while (ks_getuntil(ks, KS_SEP_TAB, str, &dret) >= 0) { + uint8_t *s, type, key[2]; + z += str->l + 1; + if (str->l < 6 || str->s[2] != ':' || str->s[4] != ':') + parse_error(fp->n_lines, "missing colon in auxiliary data"); + key[0] = str->s[0]; key[1] = str->s[1]; + type = str->s[3]; + s = alloc_data(b, doff + 3) + doff; + s[0] = key[0]; s[1] = key[1]; s += 2; doff += 2; + if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { // c and C for backward compatibility + s = alloc_data(b, doff + 2) + doff; + *s++ = 'A'; *s = str->s[5]; + doff += 2; + } else if (type == 'I' || type == 'i') { + long long x; + s = alloc_data(b, doff + 5) + doff; + x = (long long)atoll(str->s + 5); + if (x < 0) { + if (x >= -127) { + *s++ = 'c'; 
*(int8_t*)s = (int8_t)x; + s += 1; doff += 2; + } else if (x >= -32767) { + *s++ = 's'; *(int16_t*)s = (int16_t)x; + s += 2; doff += 3; + } else { + *s++ = 'i'; *(int32_t*)s = (int32_t)x; + s += 4; doff += 5; + if (x < -2147483648ll) + fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.", + (long long)fp->n_lines, x); + } + } else { + if (x <= 255) { + *s++ = 'C'; *s++ = (uint8_t)x; + doff += 2; + } else if (x <= 65535) { + *s++ = 'S'; *(uint16_t*)s = (uint16_t)x; + s += 2; doff += 3; + } else { + *s++ = 'I'; *(uint32_t*)s = (uint32_t)x; + s += 4; doff += 5; + if (x > 4294967295ll) + fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.", + (long long)fp->n_lines, x); + } + } + } else if (type == 'f') { + s = alloc_data(b, doff + 5) + doff; + *s++ = 'f'; + *(float*)s = (float)atof(str->s + 5); + s += 4; doff += 5; + } else if (type == 'd') { + s = alloc_data(b, doff + 9) + doff; + *s++ = 'd'; + *(float*)s = (float)atof(str->s + 9); + s += 8; doff += 9; + } else if (type == 'Z' || type == 'H') { + int size = 1 + (str->l - 5) + 1; + if (type == 'H') { // check whether the hex string is valid + int i; + if ((str->l - 5) % 2 == 1) parse_error(fp->n_lines, "length of the hex string not even"); + for (i = 0; i < str->l - 5; ++i) { + int c = toupper(str->s[5 + i]); + if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F'))) + parse_error(fp->n_lines, "invalid hex character"); + } + } + s = alloc_data(b, doff + size) + doff; + *s++ = type; + memcpy(s, str->s + 5, str->l - 5); + s[str->l - 5] = 0; + doff += size; + } else if (type == 'B') { + int32_t n = 0, Bsize, k = 0, size; + char *p; + if (str->l < 8) parse_error(fp->n_lines, "too few values in aux type B"); + Bsize = bam_aux_type2size(str->s[5]); // the size of each element + for (p = (char*)str->s + 6; *p; ++p) // count the number of elements in the array + if (*p == ',') ++n; + p = str->s + 7; // now p points to the first number in the array + size = 6 + Bsize * n; // total 
number of bytes allocated to this tag + s = alloc_data(b, doff + 6 * Bsize * n) + doff; // allocate memory + *s++ = 'B'; *s++ = str->s[5]; + memcpy(s, &n, 4); s += 4; // write the number of elements + if (str->s[5] == 'c') while (p < str->s + str->l) ((int8_t*)s)[k++] = (int8_t)strtol(p, &p, 0), ++p; + else if (str->s[5] == 'C') while (p < str->s + str->l) ((uint8_t*)s)[k++] = (uint8_t)strtol(p, &p, 0), ++p; + else if (str->s[5] == 's') while (p < str->s + str->l) ((int16_t*)s)[k++] = (int16_t)strtol(p, &p, 0), ++p; // FIXME: avoid unaligned memory + else if (str->s[5] == 'S') while (p < str->s + str->l) ((uint16_t*)s)[k++] = (uint16_t)strtol(p, &p, 0), ++p; + else if (str->s[5] == 'i') while (p < str->s + str->l) ((int32_t*)s)[k++] = (int32_t)strtol(p, &p, 0), ++p; + else if (str->s[5] == 'I') while (p < str->s + str->l) ((uint32_t*)s)[k++] = (uint32_t)strtol(p, &p, 0), ++p; + else if (str->s[5] == 'f') while (p < str->s + str->l) ((float*)s)[k++] = (float)strtod(p, &p), ++p; + else parse_error(fp->n_lines, "unrecognized array type"); + s += Bsize * n; doff += size; + } else parse_error(fp->n_lines, "unrecognized type"); + if (dret == '\n' || dret == '\r') break; + } + } + b->l_aux = doff - doff0; + b->data_len = doff; + if (bam_no_B) bam_remove_B(b); + return z; +} + +tamFile sam_open(const char *fn) +{ + tamFile fp; + gzFile gzfp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "rb") : gzopen(fn, "rb"); + if (gzfp == 0) return 0; + fp = (tamFile)calloc(1, sizeof(struct __tamFile_t)); + fp->str = (kstring_t*)calloc(1, sizeof(kstring_t)); + fp->fp = gzfp; + fp->ks = ks_init(fp->fp); + return fp; +} + +void sam_close(tamFile fp) +{ + if (fp) { + ks_destroy(fp->ks); + gzclose(fp->fp); + free(fp->str->s); free(fp->str); + free(fp); + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bam_index.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,726 @@ +#include <ctype.h> +#include <assert.h> +#include "bam.h" +#include "khash.h" +#include "ksort.h" +#include "bam_endian.h" +#ifdef _USE_KNETFILE +#include "knetfile.h" +#endif + +/*! + @header + + Alignment indexing. Before indexing, BAM must be sorted based on the + leftmost coordinate of alignments. In indexing, BAM uses two indices: + a UCSC binning index and a simple linear index. The binning index is + efficient for alignments spanning long distance, while the auxiliary + linear index helps to reduce unnecessary seek calls especially for + short alignments. + + The UCSC binning scheme was suggested by Richard Durbin and Lincoln + Stein and is explained by Kent et al. (2002). In this scheme, each bin + represents a contiguous genomic region which can be fully contained in + another bin; each alignment is associated with a bin which represents + the smallest region containing the entire alignment. The binning + scheme is essentially another representation of R-tree. A distinct bin + uniquely corresponds to a distinct internal node in a R-tree. Bin A is + a child of Bin B if region A is contained in B. + + In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin + 0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp, + 585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to + find the alignments overlapped with a region [rbeg,rend), we need to + calculate the list of bins that may be overlapped the region and test + the alignments in the bins to confirm the overlaps. If the specified + region is short, typically only a few alignments in six bins need to + be retrieved. The overlapping alignments can be quickly fetched. + + */ + +#define BAM_MIN_CHUNK_GAP 32768 +// 1<<14 is the size of minimum bin. 
+#define BAM_LIDX_SHIFT 14 + +#define BAM_MAX_BIN 37450 // =(8^6-1)/7+1 + +typedef struct { + uint64_t u, v; +} pair64_t; + +#define pair64_lt(a,b) ((a).u < (b).u) +KSORT_INIT(off, pair64_t, pair64_lt) + +typedef struct { + uint32_t m, n; + pair64_t *list; +} bam_binlist_t; + +typedef struct { + int32_t n, m; + uint64_t *offset; +} bam_lidx_t; + +KHASH_MAP_INIT_INT(i, bam_binlist_t) + +struct __bam_index_t { + int32_t n; + uint64_t n_no_coor; // unmapped reads without coordinate + khash_t(i) **index; + bam_lidx_t *index2; +}; + +// requirement: len <= LEN_MASK +static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end) +{ + khint_t k; + bam_binlist_t *l; + int ret; + k = kh_put(i, h, bin, &ret); + l = &kh_value(h, k); + if (ret) { // not present + l->m = 1; l->n = 0; + l->list = (pair64_t*)calloc(l->m, 16); + } + if (l->n == l->m) { + l->m <<= 1; + l->list = (pair64_t*)realloc(l->list, l->m * 16); + } + l->list[l->n].u = beg; l->list[l->n++].v = end; +} + +static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset) +{ + int i, beg, end; + beg = b->core.pos >> BAM_LIDX_SHIFT; + end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT; + if (index2->m < end + 1) { + int old_m = index2->m; + index2->m = end + 1; + kroundup32(index2->m); + index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8); + memset(index2->offset + old_m, 0, 8 * (index2->m - old_m)); + } + if (beg == end) { + if (index2->offset[beg] == 0) index2->offset[beg] = offset; + } else { + for (i = beg; i <= end; ++i) + if (index2->offset[i] == 0) index2->offset[i] = offset; + } + index2->n = end + 1; +} + +static void merge_chunks(bam_index_t *idx) +{ +#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16) + khash_t(i) *index; + int i, l, m; + khint_t k; + for (i = 0; i < idx->n; ++i) { + index = idx->index[i]; + for (k = kh_begin(index); k != kh_end(index); ++k) { + bam_binlist_t *p; + if (!kh_exist(index, k) || kh_key(index, k) 
== BAM_MAX_BIN) continue; + p = &kh_value(index, k); + m = 0; + for (l = 1; l < p->n; ++l) { +#ifdef BAM_TRUE_OFFSET + if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v; +#else + if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v; +#endif + else p->list[++m] = p->list[l]; + } // ~for(l) + p->n = m + 1; + } // ~for(k) + } // ~for(i) +#endif // defined(BAM_TRUE_OFFSET) || defined(BAM_BGZF) +} + +static void fill_missing(bam_index_t *idx) +{ + int i, j; + for (i = 0; i < idx->n; ++i) { + bam_lidx_t *idx2 = &idx->index2[i]; + for (j = 1; j < idx2->n; ++j) + if (idx2->offset[j] == 0) + idx2->offset[j] = idx2->offset[j-1]; + } +} + +bam_index_t *bam_index_core(bamFile fp) +{ + bam1_t *b; + bam_header_t *h; + int i, ret; + bam_index_t *idx; + uint32_t last_bin, save_bin; + int32_t last_coor, last_tid, save_tid; + bam1_core_t *c; + uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor; + + h = bam_header_read(fp); + if(h == 0) { + fprintf(stderr, "[bam_index_core] Invalid BAM header."); + return NULL; + } + + idx = (bam_index_t*)calloc(1, sizeof(bam_index_t)); + b = (bam1_t*)calloc(1, sizeof(bam1_t)); + c = &b->core; + + idx->n = h->n_targets; + bam_header_destroy(h); + idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*)); + for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i); + idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t)); + + save_bin = save_tid = last_tid = last_bin = 0xffffffffu; + save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu; + n_mapped = n_unmapped = n_no_coor = off_end = 0; + off_beg = off_end = bam_tell(fp); + while ((ret = bam_read1(fp, b)) >= 0) { + if (c->tid < 0) ++n_no_coor; + if (last_tid < c->tid || (last_tid >= 0 && c->tid < 0)) { // change of chromosomes + last_tid = c->tid; + last_bin = 0xffffffffu; + } else if ((uint32_t)last_tid > (uint32_t)c->tid) { + fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %d-th chr > %d-th 
chr\n", + bam1_qname(b), last_tid+1, c->tid+1); + return NULL; + } else if ((int32_t)c->tid >= 0 && last_coor > c->pos) { + fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n", + bam1_qname(b), last_coor, c->pos, c->tid+1); + return NULL; + } + if (c->tid >= 0 && !(c->flag & BAM_FUNMAP)) insert_offset2(&idx->index2[b->core.tid], b, last_off); + if (c->bin != last_bin) { // then possibly write the binning index + if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record + insert_offset(idx->index[save_tid], save_bin, save_off, last_off); + if (last_bin == 0xffffffffu && save_tid != 0xffffffffu) { // write the meta element + off_end = last_off; + insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end); + insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped); + n_mapped = n_unmapped = 0; + off_beg = off_end; + } + save_off = last_off; + save_bin = last_bin = c->bin; + save_tid = c->tid; + if (save_tid < 0) break; + } + if (bam_tell(fp) <= last_off) { + fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n", + (unsigned long long)bam_tell(fp), (unsigned long long)last_off); + return NULL; + } + if (c->flag & BAM_FUNMAP) ++n_unmapped; + else ++n_mapped; + last_off = bam_tell(fp); + last_coor = b->core.pos; + } + if (save_tid >= 0) { + insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp)); + insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, bam_tell(fp)); + insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped); + } + merge_chunks(idx); + fill_missing(idx); + if (ret >= 0) { + while ((ret = bam_read1(fp, b)) >= 0) { + ++n_no_coor; + if (c->tid >= 0 && n_no_coor) { + fprintf(stderr, "[bam_index_core] the alignment is not sorted: reads without coordinates prior to reads with coordinates.\n"); + return NULL; + } + } + } + if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. 
(%d)\n", ret); + free(b->data); free(b); + idx->n_no_coor = n_no_coor; + return idx; +} + +void bam_index_destroy(bam_index_t *idx) +{ + khint_t k; + int i; + if (idx == 0) return; + for (i = 0; i < idx->n; ++i) { + khash_t(i) *index = idx->index[i]; + bam_lidx_t *index2 = idx->index2 + i; + for (k = kh_begin(index); k != kh_end(index); ++k) { + if (kh_exist(index, k)) + free(kh_value(index, k).list); + } + kh_destroy(i, index); + free(index2->offset); + } + free(idx->index); free(idx->index2); + free(idx); +} + +void bam_index_save(const bam_index_t *idx, FILE *fp) +{ + int32_t i, size; + khint_t k; + fwrite("BAI\1", 1, 4, fp); + if (bam_is_be) { + uint32_t x = idx->n; + fwrite(bam_swap_endian_4p(&x), 4, 1, fp); + } else fwrite(&idx->n, 4, 1, fp); + for (i = 0; i < idx->n; ++i) { + khash_t(i) *index = idx->index[i]; + bam_lidx_t *index2 = idx->index2 + i; + // write binning index + size = kh_size(index); + if (bam_is_be) { // big endian + uint32_t x = size; + fwrite(bam_swap_endian_4p(&x), 4, 1, fp); + } else fwrite(&size, 4, 1, fp); + for (k = kh_begin(index); k != kh_end(index); ++k) { + if (kh_exist(index, k)) { + bam_binlist_t *p = &kh_value(index, k); + if (bam_is_be) { // big endian + uint32_t x; + x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp); + x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp); + for (x = 0; (int)x < p->n; ++x) { + bam_swap_endian_8p(&p->list[x].u); + bam_swap_endian_8p(&p->list[x].v); + } + fwrite(p->list, 16, p->n, fp); + for (x = 0; (int)x < p->n; ++x) { + bam_swap_endian_8p(&p->list[x].u); + bam_swap_endian_8p(&p->list[x].v); + } + } else { + fwrite(&kh_key(index, k), 4, 1, fp); + fwrite(&p->n, 4, 1, fp); + fwrite(p->list, 16, p->n, fp); + } + } + } + // write linear index (index2) + if (bam_is_be) { + int x = index2->n; + fwrite(bam_swap_endian_4p(&x), 4, 1, fp); + } else fwrite(&index2->n, 4, 1, fp); + if (bam_is_be) { // big endian + int x; + for (x = 0; (int)x < index2->n; ++x) + 
bam_swap_endian_8p(&index2->offset[x]); + fwrite(index2->offset, 8, index2->n, fp); + for (x = 0; (int)x < index2->n; ++x) + bam_swap_endian_8p(&index2->offset[x]); + } else fwrite(index2->offset, 8, index2->n, fp); + } + { // write the number of reads coor-less records. + uint64_t x = idx->n_no_coor; + if (bam_is_be) bam_swap_endian_8p(&x); + fwrite(&x, 8, 1, fp); + } + fflush(fp); +} + +static bam_index_t *bam_index_load_core(FILE *fp) +{ + int i; + char magic[4]; + bam_index_t *idx; + if (fp == 0) { + fprintf(stderr, "[bam_index_load_core] fail to load index.\n"); + return 0; + } + fread(magic, 1, 4, fp); + if (strncmp(magic, "BAI\1", 4)) { + fprintf(stderr, "[bam_index_load] wrong magic number.\n"); + fclose(fp); + return 0; + } + idx = (bam_index_t*)calloc(1, sizeof(bam_index_t)); + fread(&idx->n, 4, 1, fp); + if (bam_is_be) bam_swap_endian_4p(&idx->n); + idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*)); + idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t)); + for (i = 0; i < idx->n; ++i) { + khash_t(i) *index; + bam_lidx_t *index2 = idx->index2 + i; + uint32_t key, size; + khint_t k; + int j, ret; + bam_binlist_t *p; + index = idx->index[i] = kh_init(i); + // load binning index + fread(&size, 4, 1, fp); + if (bam_is_be) bam_swap_endian_4p(&size); + for (j = 0; j < (int)size; ++j) { + fread(&key, 4, 1, fp); + if (bam_is_be) bam_swap_endian_4p(&key); + k = kh_put(i, index, key, &ret); + p = &kh_value(index, k); + fread(&p->n, 4, 1, fp); + if (bam_is_be) bam_swap_endian_4p(&p->n); + p->m = p->n; + p->list = (pair64_t*)malloc(p->m * 16); + fread(p->list, 16, p->n, fp); + if (bam_is_be) { + int x; + for (x = 0; x < p->n; ++x) { + bam_swap_endian_8p(&p->list[x].u); + bam_swap_endian_8p(&p->list[x].v); + } + } + } + // load linear index + fread(&index2->n, 4, 1, fp); + if (bam_is_be) bam_swap_endian_4p(&index2->n); + index2->m = index2->n; + index2->offset = (uint64_t*)calloc(index2->m, 8); + fread(index2->offset, index2->n, 8, fp); + if 
(bam_is_be) + for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]); + } + if (fread(&idx->n_no_coor, 8, 1, fp) == 0) idx->n_no_coor = 0; + if (bam_is_be) bam_swap_endian_8p(&idx->n_no_coor); + return idx; +} + +bam_index_t *bam_index_load_local(const char *_fn) +{ + FILE *fp; + char *fnidx, *fn; + + if (strstr(_fn, "ftp://") == _fn || strstr(_fn, "http://") == _fn) { + const char *p; + int l = strlen(_fn); + for (p = _fn + l - 1; p >= _fn; --p) + if (*p == '/') break; + fn = strdup(p + 1); + } else fn = strdup(_fn); + fnidx = (char*)calloc(strlen(fn) + 5, 1); + strcpy(fnidx, fn); strcat(fnidx, ".bai"); + fp = fopen(fnidx, "rb"); + if (fp == 0) { // try "{base}.bai" + char *s = strstr(fn, "bam"); + if (s == fn + strlen(fn) - 3) { + strcpy(fnidx, fn); + fnidx[strlen(fn)-1] = 'i'; + fp = fopen(fnidx, "rb"); + } + } + free(fnidx); free(fn); + if (fp) { + bam_index_t *idx = bam_index_load_core(fp); + fclose(fp); + return idx; + } else return 0; +} + +#ifdef _USE_KNETFILE +static void download_from_remote(const char *url) +{ + const int buf_size = 1 * 1024 * 1024; + char *fn; + FILE *fp; + uint8_t *buf; + knetFile *fp_remote; + int l; + if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return; + l = strlen(url); + for (fn = (char*)url + l - 1; fn >= url; --fn) + if (*fn == '/') break; + ++fn; // fn now points to the file name + fp_remote = knet_open(url, "r"); + if (fp_remote == 0) { + fprintf(stderr, "[download_from_remote] fail to open remote file.\n"); + return; + } + if ((fp = fopen(fn, "wb")) == 0) { + fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n"); + knet_close(fp_remote); + return; + } + buf = (uint8_t*)calloc(buf_size, 1); + while ((l = knet_read(fp_remote, buf, buf_size)) != 0) + fwrite(buf, 1, l, fp); + free(buf); + fclose(fp); + knet_close(fp_remote); +} +#else +static void download_from_remote(const char *url) +{ + return; +} +#endif + +bam_index_t *bam_index_load(const char *fn) +{ 
+ bam_index_t *idx; + idx = bam_index_load_local(fn); + if (idx == 0 && (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn)) { + char *fnidx = calloc(strlen(fn) + 5, 1); + strcat(strcpy(fnidx, fn), ".bai"); + fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n"); + download_from_remote(fnidx); + free(fnidx); + idx = bam_index_load_local(fn); + } + if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n"); + return idx; +} + +int bam_index_build2(const char *fn, const char *_fnidx) +{ + char *fnidx; + FILE *fpidx; + bamFile fp; + bam_index_t *idx; + if ((fp = bam_open(fn, "r")) == 0) { + fprintf(stderr, "[bam_index_build2] fail to open the BAM file.\n"); + return -1; + } + idx = bam_index_core(fp); + bam_close(fp); + if(idx == 0) { + fprintf(stderr, "[bam_index_build2] fail to index the BAM file.\n"); + return -1; + } + if (_fnidx == 0) { + fnidx = (char*)calloc(strlen(fn) + 5, 1); + strcpy(fnidx, fn); strcat(fnidx, ".bai"); + } else fnidx = strdup(_fnidx); + fpidx = fopen(fnidx, "wb"); + if (fpidx == 0) { + fprintf(stderr, "[bam_index_build2] fail to create the index file.\n"); + free(fnidx); + bam_index_destroy(idx); + return -1; + } + bam_index_save(idx, fpidx); + bam_index_destroy(idx); + fclose(fpidx); + free(fnidx); + return 0; +} + +int bam_index_build(const char *fn) +{ + return bam_index_build2(fn, 0); +} + +int bam_index(int argc, char *argv[]) +{ + if (argc < 2) { + fprintf(stderr, "Usage: samtools index <in.bam> [out.index]\n"); + return 1; + } + if (argc >= 3) bam_index_build2(argv[1], argv[2]); + else bam_index_build(argv[1]); + return 0; +} + +int bam_idxstats(int argc, char *argv[]) +{ + bam_index_t *idx; + bam_header_t *header; + bamFile fp; + int i; + if (argc < 2) { + fprintf(stderr, "Usage: samtools idxstats <in.bam>\n"); + return 1; + } + fp = bam_open(argv[1], "r"); + if (fp == 0) { fprintf(stderr, "[%s] fail to open BAM.\n", __func__); return 1; } + header = bam_header_read(fp); + 
bam_close(fp); + idx = bam_index_load(argv[1]); + if (idx == 0) { fprintf(stderr, "[%s] fail to load the index.\n", __func__); return 1; } + for (i = 0; i < idx->n; ++i) { + khint_t k; + khash_t(i) *h = idx->index[i]; + printf("%s\t%d", header->target_name[i], header->target_len[i]); + k = kh_get(i, h, BAM_MAX_BIN); + if (k != kh_end(h)) + printf("\t%llu\t%llu", (long long)kh_val(h, k).list[1].u, (long long)kh_val(h, k).list[1].v); + else printf("\t0\t0"); + putchar('\n'); + } + printf("*\t0\t0\t%llu\n", (long long)idx->n_no_coor); + bam_header_destroy(header); + bam_index_destroy(idx); + return 0; +} + +static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[BAM_MAX_BIN]) +{ + int i = 0, k; + if (beg >= end) return 0; + if (end >= 1u<<29) end = 1u<<29; + --end; + list[i++] = 0; + for (k = 1 + (beg>>26); k <= 1 + (end>>26); ++k) list[i++] = k; + for (k = 9 + (beg>>23); k <= 9 + (end>>23); ++k) list[i++] = k; + for (k = 73 + (beg>>20); k <= 73 + (end>>20); ++k) list[i++] = k; + for (k = 585 + (beg>>17); k <= 585 + (end>>17); ++k) list[i++] = k; + for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k; + return i; +} + +static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b) +{ + uint32_t rbeg = b->core.pos; + uint32_t rend = b->core.n_cigar? 
bam_calend(&b->core, bam1_cigar(b)) : b->core.pos + 1; + return (rend > beg && rbeg < end); +} + +struct __bam_iter_t { + int from_first; // read from the first record; no random access + int tid, beg, end, n_off, i, finished; + uint64_t curr_off; + pair64_t *off; +}; + +// bam_fetch helper function retrieves +bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end) +{ + uint16_t *bins; + int i, n_bins, n_off; + pair64_t *off; + khint_t k; + khash_t(i) *index; + uint64_t min_off; + bam_iter_t iter = 0; + + if (beg < 0) beg = 0; + if (end < beg) return 0; + // initialize iter + iter = calloc(1, sizeof(struct __bam_iter_t)); + iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1; + // + bins = (uint16_t*)calloc(BAM_MAX_BIN, 2); + n_bins = reg2bins(beg, end, bins); + index = idx->index[tid]; + if (idx->index2[tid].n > 0) { + min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1] + : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT]; + if (min_off == 0) { // improvement for index files built by tabix prior to 0.1.4 + int n = beg>>BAM_LIDX_SHIFT; + if (n > idx->index2[tid].n) n = idx->index2[tid].n; + for (i = n - 1; i >= 0; --i) + if (idx->index2[tid].offset[i] != 0) break; + if (i >= 0) min_off = idx->index2[tid].offset[i]; + } + } else min_off = 0; // tabix 0.1.2 may produce such index files + for (i = n_off = 0; i < n_bins; ++i) { + if ((k = kh_get(i, index, bins[i])) != kh_end(index)) + n_off += kh_value(index, k).n; + } + if (n_off == 0) { + free(bins); return iter; + } + off = (pair64_t*)calloc(n_off, 16); + for (i = n_off = 0; i < n_bins; ++i) { + if ((k = kh_get(i, index, bins[i])) != kh_end(index)) { + int j; + bam_binlist_t *p = &kh_value(index, k); + for (j = 0; j < p->n; ++j) + if (p->list[j].v > min_off) off[n_off++] = p->list[j]; + } + } + free(bins); + if (n_off == 0) { + free(off); return iter; + } + { + bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t)); + int l; + ks_introsort(off, 
n_off, off); + // resolve completely contained adjacent blocks + for (i = 1, l = 0; i < n_off; ++i) + if (off[l].v < off[i].v) + off[++l] = off[i]; + n_off = l + 1; + // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing + for (i = 1; i < n_off; ++i) + if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u; + { // merge adjacent blocks +#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16) + for (i = 1, l = 0; i < n_off; ++i) { +#ifdef BAM_TRUE_OFFSET + if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v; +#else + if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v; +#endif + else off[++l] = off[i]; + } + n_off = l + 1; +#endif + } + bam_destroy1(b); + } + iter->n_off = n_off; iter->off = off; + return iter; +} + +pair64_t *get_chunk_coordinates(const bam_index_t *idx, int tid, int beg, int end, int *cnt_off) +{ // for pysam compatibility + bam_iter_t iter; + pair64_t *off; + iter = bam_iter_query(idx, tid, beg, end); + off = iter->off; *cnt_off = iter->n_off; + free(iter); + return off; +} + +void bam_iter_destroy(bam_iter_t iter) +{ + if (iter) { free(iter->off); free(iter); } +} + +int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b) +{ + int ret; + if (iter && iter->finished) return -1; + if (iter == 0 || iter->from_first) { + ret = bam_read1(fp, b); + if (ret < 0 && iter) iter->finished = 1; + return ret; + } + if (iter->off == 0) return -1; + for (;;) { + if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk + if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks + if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug + if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek + bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET); + iter->curr_off = bam_tell(fp); + } + ++iter->i; + } + if ((ret = bam_read1(fp, b)) >= 0) { + iter->curr_off = bam_tell(fp); + if (b->core.tid 
!= iter->tid || b->core.pos >= iter->end) { // no need to proceed + ret = bam_validate1(NULL, b)? -1 : -5; // determine whether end of region or error + break; + } + else if (is_overlap(iter->beg, iter->end, b)) return ret; + } else break; // end of file or error + } + iter->finished = 1; + return ret; +} + +int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func) +{ + int ret; + bam_iter_t iter; + bam1_t *b; + b = bam_init1(); + iter = bam_iter_query(idx, tid, beg, end); + while ((ret = bam_iter_read(fp, iter, b)) >= 0) func(b, data); + bam_iter_destroy(iter); + bam_destroy1(b); + return (ret == -1)? 0 : ret; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bam_lpileup.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,198 @@ +#include <stdlib.h> +#include <stdio.h> +#include <assert.h> +#include "bam.h" +#include "ksort.h" + +#define TV_GAP 2 + +typedef struct __freenode_t { + uint32_t level:28, cnt:4; + struct __freenode_t *next; +} freenode_t, *freenode_p; + +#define freenode_lt(a,b) ((a)->cnt < (b)->cnt || ((a)->cnt == (b)->cnt && (a)->level < (b)->level)) +KSORT_INIT(node, freenode_p, freenode_lt) + +/* Memory pool, similar to the one in bam_pileup.c */ +typedef struct { + int cnt, n, max; + freenode_t **buf; +} mempool_t; + +static mempool_t *mp_init() +{ + return (mempool_t*)calloc(1, sizeof(mempool_t)); +} +static void mp_destroy(mempool_t *mp) +{ + int k; + for (k = 0; k < mp->n; ++k) free(mp->buf[k]); + free(mp->buf); free(mp); +} +static inline freenode_t *mp_alloc(mempool_t *mp) +{ + ++mp->cnt; + if (mp->n == 0) return (freenode_t*)calloc(1, sizeof(freenode_t)); + else return mp->buf[--mp->n]; +} +static inline void mp_free(mempool_t *mp, freenode_t *p) +{ + --mp->cnt; p->next = 0; p->cnt = TV_GAP; + if (mp->n == mp->max) { + mp->max = mp->max? 
mp->max<<1 : 256; + mp->buf = (freenode_t**)realloc(mp->buf, sizeof(freenode_t*) * mp->max); + } + mp->buf[mp->n++] = p; +} + +/* core part */ +struct __bam_lplbuf_t { + int max, n_cur, n_pre; + int max_level, *cur_level, *pre_level; + mempool_t *mp; + freenode_t **aux, *head, *tail; + int n_nodes, m_aux; + bam_pileup_f func; + void *user_data; + bam_plbuf_t *plbuf; +}; + +void bam_lplbuf_reset(bam_lplbuf_t *buf) +{ + freenode_t *p, *q; + bam_plbuf_reset(buf->plbuf); + for (p = buf->head; p->next;) { + q = p->next; + mp_free(buf->mp, p); + p = q; + } + buf->head = buf->tail; + buf->max_level = 0; + buf->n_cur = buf->n_pre = 0; + buf->n_nodes = 0; +} + +static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) +{ + bam_lplbuf_t *tv = (bam_lplbuf_t*)data; + freenode_t *p; + int i, l, max_level; + // allocate memory if necessary + if (tv->max < n) { // enlarge + tv->max = n; + kroundup32(tv->max); + tv->cur_level = (int*)realloc(tv->cur_level, sizeof(int) * tv->max); + tv->pre_level = (int*)realloc(tv->pre_level, sizeof(int) * tv->max); + } + tv->n_cur = n; + // update cnt + for (p = tv->head; p->next; p = p->next) + if (p->cnt > 0) --p->cnt; + // calculate cur_level[] + max_level = 0; + for (i = l = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (p->is_head) { + if (tv->head->next && tv->head->cnt == 0) { // then take a free slot + freenode_t *p = tv->head->next; + tv->cur_level[i] = tv->head->level; + mp_free(tv->mp, tv->head); + tv->head = p; + --tv->n_nodes; + } else tv->cur_level[i] = ++tv->max_level; + } else { + tv->cur_level[i] = tv->pre_level[l++]; + if (p->is_tail) { // then return a free slot + tv->tail->level = tv->cur_level[i]; + tv->tail->next = mp_alloc(tv->mp); + tv->tail = tv->tail->next; + ++tv->n_nodes; + } + } + if (tv->cur_level[i] > max_level) max_level = tv->cur_level[i]; + ((bam_pileup1_t*)p)->level = tv->cur_level[i]; + } + assert(l == tv->n_pre); + tv->func(tid, pos, n, pl, tv->user_data); + // 
sort the linked list + if (tv->n_nodes) { + freenode_t *q; + if (tv->n_nodes + 1 > tv->m_aux) { // enlarge + tv->m_aux = tv->n_nodes + 1; + kroundup32(tv->m_aux); + tv->aux = (freenode_t**)realloc(tv->aux, sizeof(void*) * tv->m_aux); + } + for (p = tv->head, i = l = 0; p->next;) { + if (p->level > max_level) { // then discard this entry + q = p->next; + mp_free(tv->mp, p); + p = q; + } else { + tv->aux[i++] = p; + p = p->next; + } + } + tv->aux[i] = tv->tail; // add a proper tail for the loop below + tv->n_nodes = i; + if (tv->n_nodes) { + ks_introsort(node, tv->n_nodes, tv->aux); + for (i = 0; i < tv->n_nodes; ++i) tv->aux[i]->next = tv->aux[i+1]; + tv->head = tv->aux[0]; + } else tv->head = tv->tail; + } + // clean up + tv->max_level = max_level; + memcpy(tv->pre_level, tv->cur_level, tv->n_cur * 4); + // squeeze out terminated levels + for (i = l = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (!p->is_tail) + tv->pre_level[l++] = tv->pre_level[i]; + } + tv->n_pre = l; +/* + fprintf(stderr, "%d\t", pos+1); + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (p->is_head) fprintf(stderr, "^"); + if (p->is_tail) fprintf(stderr, "$"); + fprintf(stderr, "%d,", p->level); + } + fprintf(stderr, "\n"); +*/ + return 0; +} + +bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data) +{ + bam_lplbuf_t *tv; + tv = (bam_lplbuf_t*)calloc(1, sizeof(bam_lplbuf_t)); + tv->mp = mp_init(); + tv->head = tv->tail = mp_alloc(tv->mp); + tv->func = func; + tv->user_data = data; + tv->plbuf = bam_plbuf_init(tview_func, tv); + return (bam_lplbuf_t*)tv; +} + +void bam_lplbuf_destroy(bam_lplbuf_t *tv) +{ + freenode_t *p, *q; + free(tv->cur_level); free(tv->pre_level); + bam_plbuf_destroy(tv->plbuf); + free(tv->aux); + for (p = tv->head; p->next;) { + q = p->next; + mp_free(tv->mp, p); p = q; + } + mp_free(tv->mp, p); + assert(tv->mp->cnt == 0); + mp_destroy(tv->mp); + free(tv); +} + +int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *tv) +{ + return 
bam_plbuf_push(b, tv->plbuf); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bam_mate.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,128 @@ +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include "kstring.h" +#include "bam.h" + +void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str) +{ + bam1_t *swap; + int i, end; + uint32_t *cigar; + str->l = 0; + if (b1->core.tid != b2->core.tid || b1->core.tid < 0) return; // coordinateless or not on the same chr; skip + if (b1->core.pos > b2->core.pos) swap = b1, b1 = b2, b2 = swap; // make sure b1 has a smaller coordinate + kputc((b1->core.flag & BAM_FREAD1)? '1' : '2', str); // segment index + kputc((b1->core.flag & BAM_FREVERSE)? 'R' : 'F', str); // strand + for (i = 0, cigar = bam1_cigar(b1); i < b1->core.n_cigar; ++i) { + kputw(bam_cigar_oplen(cigar[i]), str); + kputc(bam_cigar_opchr(cigar[i]), str); + } + end = bam_calend(&b1->core, cigar); + kputw(b2->core.pos - end, str); + kputc('T', str); + kputc((b2->core.flag & BAM_FREAD1)? '1' : '2', str); // segment index + kputc((b2->core.flag & BAM_FREVERSE)? 
'R' : 'F', str); // strand + for (i = 0, cigar = bam1_cigar(b2); i < b2->core.n_cigar; ++i) { + kputw(bam_cigar_oplen(cigar[i]), str); + kputc(bam_cigar_opchr(cigar[i]), str); + } + bam_aux_append(b1, "CT", 'Z', str->l+1, (uint8_t*)str->s); +} + +// currently, this function ONLY works if each read has one hit +void bam_mating_core(bamFile in, bamFile out, int remove_reads) +{ + bam_header_t *header; + bam1_t *b[2]; + int curr, has_prev, pre_end = 0, cur_end; + kstring_t str; + + str.l = str.m = 0; str.s = 0; + header = bam_header_read(in); + bam_header_write(out, header); + + b[0] = bam_init1(); + b[1] = bam_init1(); + curr = 0; has_prev = 0; + while (bam_read1(in, b[curr]) >= 0) { + bam1_t *cur = b[curr], *pre = b[1-curr]; + if (cur->core.tid < 0) + { + if ( !remove_reads ) bam_write1(out, cur); + continue; + } + cur_end = bam_calend(&cur->core, bam1_cigar(cur)); + if (cur_end > (int)header->target_len[cur->core.tid]) cur->core.flag |= BAM_FUNMAP; + if (cur->core.flag & BAM_FSECONDARY) + { + if ( !remove_reads ) bam_write1(out, cur); + continue; // skip secondary alignments + } + if (has_prev) { + if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name + cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos; + pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos; + if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) + && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // set TLEN/ISIZE + { + uint32_t cur5, pre5; + cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos; + pre5 = (pre->core.flag&BAM_FREVERSE)? 
pre_end : pre->core.pos; + cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5; + } else cur->core.isize = pre->core.isize = 0; + if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FMREVERSE; + else cur->core.flag &= ~BAM_FMREVERSE; + if (cur->core.flag&BAM_FREVERSE) pre->core.flag |= BAM_FMREVERSE; + else pre->core.flag &= ~BAM_FMREVERSE; + if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; } + if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; } + bam_template_cigar(pre, cur, &str); + bam_write1(out, pre); + bam_write1(out, cur); + has_prev = 0; + } else { // unpaired or singleton + pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; + if (pre->core.flag & BAM_FPAIRED) { + pre->core.flag |= BAM_FMUNMAP; + pre->core.flag &= ~BAM_FMREVERSE & ~BAM_FPROPER_PAIR; + } + bam_write1(out, pre); + } + } else has_prev = 1; + curr = 1 - curr; + pre_end = cur_end; + } + if (has_prev) bam_write1(out, b[1-curr]); + bam_header_destroy(header); + bam_destroy1(b[0]); + bam_destroy1(b[1]); + free(str.s); +} + +void usage() +{ + fprintf(stderr,"Usage: samtools fixmate <in.nameSrt.bam> <out.nameSrt.bam>\n"); + fprintf(stderr,"Options:\n"); + fprintf(stderr," -r remove unmapped reads and secondary alignments\n"); + exit(1); +} + +int bam_mating(int argc, char *argv[]) +{ + bamFile in, out; + int c, remove_reads=0; + while ((c = getopt(argc, argv, "r")) >= 0) { + switch (c) { + case 'r': remove_reads=1; break; + } + } + if (optind+1 >= argc) usage(); + in = (strcmp(argv[optind], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[optind], "r"); + out = (strcmp(argv[optind+1], "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(argv[optind+1], "w"); + bam_mating_core(in, out, remove_reads); + bam_close(in); bam_close(out); + return 0; +} + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bam_md.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,389 @@ +#include <unistd.h> +#include <assert.h> +#include <string.h> +#include <ctype.h> +#include <math.h> +#include "faidx.h" +#include "sam.h" +#include "kstring.h" +#include "kaln.h" +#include "kprobaln.h" + +#define USE_EQUAL 1 +#define DROP_TAG 2 +#define BIN_QUAL 4 +#define UPDATE_NM 8 +#define UPDATE_MD 16 +#define HASH_QNM 32 + +char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; + +int bam_aux_drop_other(bam1_t *b, uint8_t *s); + +void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm) +{ + uint8_t *seq = bam1_seq(b); + uint32_t *cigar = bam1_cigar(b); + bam1_core_t *c = &b->core; + int i, x, y, u = 0; + kstring_t *str; + int32_t old_nm_i = -1, nm = 0; + + str = (kstring_t*)calloc(1, sizeof(kstring_t)); + for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { + int j, l = cigar[i]>>4, op = cigar[i]&0xf; + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + for (j = 0; j < l; ++j) { + int z = y + j; + int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; + if (ref[x+j] == 0) break; // out of boundary + if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match + if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 
0xf0 : 0x0f; + ++u; + } else { + kputw(u, str); kputc(ref[x+j], str); + u = 0; ++nm; + } + } + if (j < l) break; + x += l; y += l; + } else if (op == BAM_CDEL) { + kputw(u, str); kputc('^', str); + for (j = 0; j < l; ++j) { + if (ref[x+j] == 0) break; + kputc(ref[x+j], str); + } + u = 0; + if (j < l) break; + x += l; nm += l; + } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { + y += l; + if (op == BAM_CINS) nm += l; + } else if (op == BAM_CREF_SKIP) { + x += l; + } + } + kputw(u, str); + // apply max_nm + if (max_nm > 0 && nm >= max_nm) { + for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { + int j, l = cigar[i]>>4, op = cigar[i]&0xf; + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + for (j = 0; j < l; ++j) { + int z = y + j; + int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; + if (ref[x+j] == 0) break; // out of boundary + if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match + seq[z/2] |= (z&1)? 0x0f : 0xf0; + bam1_qual(b)[z] = 0; + } + } + if (j < l) break; + x += l; y += l; + } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l; + else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; + } + } + // update NM + if (flag & UPDATE_NM) { + uint8_t *old_nm = bam_aux_get(b, "NM"); + if (c->flag & BAM_FUNMAP) return; + if (old_nm) old_nm_i = bam_aux2i(old_nm); + if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); + else if (nm != old_nm_i) { + fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm); + bam_aux_del(b, old_nm); + bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); + } + } + // update MD + if (flag & UPDATE_MD) { + uint8_t *old_md = bam_aux_get(b, "MD"); + if (c->flag & BAM_FUNMAP) return; + if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); + else { + int is_diff = 0; + if (strlen((char*)old_md+1) == str->l) { + for (i = 0; i < str->l; ++i) + if (toupper(old_md[i+1]) != toupper(str->s[i])) + break; + if (i < str->l) is_diff = 1; + } 
else is_diff = 1; + if (is_diff) { + fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam1_qname(b), old_md+1, str->s); + bam_aux_del(b, old_md); + bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); + } + } + } + // drop all tags but RG + if (flag&DROP_TAG) { + uint8_t *q = bam_aux_get(b, "RG"); + bam_aux_drop_other(b, q); + } + // reduce the resolution of base quality + if (flag&BIN_QUAL) { + uint8_t *qual = bam1_qual(b); + for (i = 0; i < b->core.l_qseq; ++i) + if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7; + } + free(str->s); free(str); +} + +void bam_fillmd1(bam1_t *b, char *ref, int flag) +{ + bam_fillmd1_core(b, ref, flag, 0); +} + +int bam_cap_mapQ(bam1_t *b, char *ref, int thres) +{ + uint8_t *seq = bam1_seq(b), *qual = bam1_qual(b); + uint32_t *cigar = bam1_cigar(b); + bam1_core_t *c = &b->core; + int i, x, y, mm, q, len, clip_l, clip_q; + double t; + if (thres < 0) thres = 40; // set the default + mm = q = len = clip_l = clip_q = 0; + for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { + int j, l = cigar[i]>>4, op = cigar[i]&0xf; + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + for (j = 0; j < l; ++j) { + int z = y + j; + int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; + if (ref[x+j] == 0) break; // out of boundary + if (c2 != 15 && c1 != 15 && qual[z] >= 13) { // not ambiguous + ++len; + if (c1 && c1 != c2 && qual[z] >= 13) { // mismatch + ++mm; + q += qual[z] > 33? 
33 : qual[z]; + } + } + } + if (j < l) break; + x += l; y += l; len += l; + } else if (op == BAM_CDEL) { + for (j = 0; j < l; ++j) + if (ref[x+j] == 0) break; + if (j < l) break; + x += l; + } else if (op == BAM_CSOFT_CLIP) { + for (j = 0; j < l; ++j) clip_q += qual[y+j]; + clip_l += l; + y += l; + } else if (op == BAM_CHARD_CLIP) { + clip_q += 13 * l; + clip_l += l; + } else if (op == BAM_CINS) y += l; + else if (op == BAM_CREF_SKIP) x += l; + } + for (i = 0, t = 1; i < mm; ++i) + t *= (double)len / (i+1); + t = q - 4.343 * log(t) + clip_q / 5.; + if (t > thres) return -1; + if (t < 0) t = 0; + t = sqrt((thres - t) / thres) * thres; +// fprintf(stderr, "%s %lf %d\n", bam1_qname(b), t, q); + return (int)(t + .499); +} + +int bam_prob_realn_core(bam1_t *b, const char *ref, int flag) +{ + int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4; + uint32_t *cigar = bam1_cigar(b); + bam1_core_t *c = &b->core; + kpa_par_t conf = kpa_par_def; + uint8_t *bq = 0, *zq = 0, *qual = bam1_qual(b); + if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0) return -1; // do nothing + // test if BQ or ZQ is present + if ((bq = bam_aux_get(b, "BQ")) != 0) ++bq; + if ((zq = bam_aux_get(b, "ZQ")) != 0 && *zq == 'Z') ++zq; + if (bq && redo_baq) + { + bam_aux_del(b, bq-1); + bq = 0; + } + if (bq && zq) { // remove the ZQ tag + bam_aux_del(b, zq-1); + zq = 0; + } + if (bq || zq) { + if ((apply_baq && zq) || (!apply_baq && bq)) return -3; // in both cases, do nothing + if (bq && apply_baq) { // then convert BQ to ZQ + for (i = 0; i < c->l_qseq; ++i) + qual[i] = qual[i] + 64 < bq[i]? 
0 : qual[i] - ((int)bq[i] - 64); + *(bq - 3) = 'Z'; + } else if (zq && !apply_baq) { // then convert ZQ to BQ + for (i = 0; i < c->l_qseq; ++i) + qual[i] += (int)zq[i] - 64; + *(zq - 3) = 'B'; + } + return 0; + } + // find the start and end of the alignment + x = c->pos, y = 0, yb = ye = xb = xe = -1; + for (k = 0; k < c->n_cigar; ++k) { + int op, l; + op = cigar[k]&0xf; l = cigar[k]>>4; + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + if (yb < 0) yb = y; + if (xb < 0) xb = x; + ye = y + l; xe = x + l; + x += l; y += l; + } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l; + else if (op == BAM_CDEL) x += l; + else if (op == BAM_CREF_SKIP) return -1; // do nothing if there is a reference skip + } + // set bandwidth and the start and the end + bw = 7; + if (abs((xe - xb) - (ye - yb)) > bw) + bw = abs((xe - xb) - (ye - yb)) + 3; + conf.bw = bw; + xb -= yb + bw/2; if (xb < 0) xb = 0; + xe += c->l_qseq - ye + bw/2; + if (xe - xb - c->l_qseq > bw) + xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2; + { // glocal + uint8_t *s, *r, *q, *seq = bam1_seq(b), *bq; + int *state; + bq = calloc(c->l_qseq + 1, 1); + memcpy(bq, qual, c->l_qseq); + s = calloc(c->l_qseq, 1); + for (i = 0; i < c->l_qseq; ++i) s[i] = bam_nt16_nt4_table[bam1_seqi(seq, i)]; + r = calloc(xe - xb, 1); + for (i = xb; i < xe; ++i) { + if (ref[i] == 0) { xe = i; break; } + r[i-xb] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[i]]]; + } + state = calloc(c->l_qseq, sizeof(int)); + q = calloc(c->l_qseq, 1); + kpa_glocal(r, xe-xb, s, c->l_qseq, qual, &conf, state, q); + if (!extend_baq) { // in this block, bq[] is capped by base quality qual[] + for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { + int op = cigar[k]&0xf, l = cigar[k]>>4; + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + for (i = y; i < y + l; ++i) { + if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0; + else bq[i] = bq[i] < q[i]? 
bq[i] : q[i]; + } + x += l; y += l; + } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l; + else if (op == BAM_CDEL) x += l; + } + for (i = 0; i < c->l_qseq; ++i) bq[i] = qual[i] - bq[i] + 64; // finalize BQ + } else { // in this block, bq[] is BAQ that can be larger than qual[] (different from the above!) + uint8_t *left, *rght; + left = calloc(c->l_qseq, 1); rght = calloc(c->l_qseq, 1); + for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { + int op = cigar[k]&0xf, l = cigar[k]>>4; + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + for (i = y; i < y + l; ++i) + bq[i] = ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y))? 0 : q[i]; + for (left[y] = bq[y], i = y + 1; i < y + l; ++i) + left[i] = bq[i] > left[i-1]? bq[i] : left[i-1]; + for (rght[y+l-1] = bq[y+l-1], i = y + l - 2; i >= y; --i) + rght[i] = bq[i] > rght[i+1]? bq[i] : rght[i+1]; + for (i = y; i < y + l; ++i) + bq[i] = left[i] < rght[i]? left[i] : rght[i]; + x += l; y += l; + } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l; + else if (op == BAM_CDEL) x += l; + } + for (i = 0; i < c->l_qseq; ++i) bq[i] = 64 + (qual[i] <= bq[i]? 
0 : qual[i] - bq[i]); // finalize BQ + free(left); free(rght); + } + if (apply_baq) { + for (i = 0; i < c->l_qseq; ++i) qual[i] -= bq[i] - 64; // modify qual + bam_aux_append(b, "ZQ", 'Z', c->l_qseq + 1, bq); + } else bam_aux_append(b, "BQ", 'Z', c->l_qseq + 1, bq); + free(bq); free(s); free(r); free(q); free(state); + } + return 0; +} + +int bam_prob_realn(bam1_t *b, const char *ref) +{ + return bam_prob_realn_core(b, ref, 1); +} + +int bam_fillmd(int argc, char *argv[]) +{ + int c, flt_flag, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm, is_realn, capQ, baq_flag; + samfile_t *fp, *fpout = 0; + faidx_t *fai; + char *ref = 0, mode_w[8], mode_r[8]; + bam1_t *b; + + flt_flag = UPDATE_NM | UPDATE_MD; + is_bam_out = is_sam_in = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0; + mode_w[0] = mode_r[0] = 0; + strcpy(mode_r, "r"); strcpy(mode_w, "w"); + while ((c = getopt(argc, argv, "EqreuNhbSC:n:Ad")) >= 0) { + switch (c) { + case 'r': is_realn = 1; break; + case 'e': flt_flag |= USE_EQUAL; break; + case 'd': flt_flag |= DROP_TAG; break; + case 'q': flt_flag |= BIN_QUAL; break; + case 'h': flt_flag |= HASH_QNM; break; + case 'N': flt_flag &= ~(UPDATE_MD|UPDATE_NM); break; + case 'b': is_bam_out = 1; break; + case 'u': is_uncompressed = is_bam_out = 1; break; + case 'S': is_sam_in = 1; break; + case 'n': max_nm = atoi(optarg); break; + case 'C': capQ = atoi(optarg); break; + case 'A': baq_flag |= 1; break; + case 'E': baq_flag |= 2; break; + default: fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n", c); return 1; + } + } + if (!is_sam_in) strcat(mode_r, "b"); + if (is_bam_out) strcat(mode_w, "b"); + else strcat(mode_w, "h"); + if (is_uncompressed) strcat(mode_w, "u"); + if (optind + 1 >= argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools fillmd [-eubrS] <aln.bam> <ref.fasta>\n\n"); + fprintf(stderr, "Options: -e change identical bases to '='\n"); + fprintf(stderr, " -u uncompressed BAM output (for piping)\n"); + 
fprintf(stderr, " -b compressed BAM output\n"); + fprintf(stderr, " -S the input is SAM with header\n"); + fprintf(stderr, " -A modify the quality string\n"); + fprintf(stderr, " -r compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n"); + fprintf(stderr, " -E extended BAQ for better sensitivity but lower specificity\n\n"); + return 1; + } + fp = samopen(argv[optind], mode_r, 0); + if (fp == 0) return 1; + if (is_sam_in && (fp->header == 0 || fp->header->n_targets == 0)) { + fprintf(stderr, "[bam_fillmd] input SAM does not have header. Abort!\n"); + return 1; + } + fpout = samopen("-", mode_w, fp->header); + fai = fai_load(argv[optind+1]); + + b = bam_init1(); + while ((ret = samread(fp, b)) >= 0) { + if (b->core.tid >= 0) { + if (tid != b->core.tid) { + free(ref); + ref = fai_fetch(fai, fp->header->target_name[b->core.tid], &len); + tid = b->core.tid; + if (ref == 0) + fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", + fp->header->target_name[tid]); + } + if (is_realn) bam_prob_realn_core(b, ref, baq_flag); + if (capQ > 10) { + int q = bam_cap_mapQ(b, ref, capQ); + if (b->core.qual > q) b->core.qual = q; + } + if (ref) bam_fillmd1_core(b, ref, flt_flag, max_nm); + } + samwrite(fpout, b); + } + bam_destroy1(b); + + free(ref); + fai_destroy(fai); + samclose(fp); samclose(fpout); + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bam_pileup.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,437 @@ +#include <stdio.h> +#include <stdlib.h> +#include <ctype.h> +#include <assert.h> +#include "sam.h" + +typedef struct { + int k, x, y, end; +} cstate_t; + +static cstate_t g_cstate_null = { -1, 0, 0, 0 }; + +typedef struct __linkbuf_t { + bam1_t b; + uint32_t beg, end; + cstate_t s; + struct __linkbuf_t *next; +} lbnode_t; + +/* --- BEGIN: Memory pool */ + +typedef struct { + int cnt, n, max; + lbnode_t **buf; +} mempool_t; + +static mempool_t *mp_init() +{ + mempool_t *mp; + mp = (mempool_t*)calloc(1, sizeof(mempool_t)); + return mp; +} +static void mp_destroy(mempool_t *mp) +{ + int k; + for (k = 0; k < mp->n; ++k) { + free(mp->buf[k]->b.data); + free(mp->buf[k]); + } + free(mp->buf); + free(mp); +} +static inline lbnode_t *mp_alloc(mempool_t *mp) +{ + ++mp->cnt; + if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t)); + else return mp->buf[--mp->n]; +} +static inline void mp_free(mempool_t *mp, lbnode_t *p) +{ + --mp->cnt; p->next = 0; // clear lbnode_t::next here + if (mp->n == mp->max) { + mp->max = mp->max? mp->max<<1 : 256; + mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max); + } + mp->buf[mp->n++] = p; +} + +/* --- END: Memory pool */ + +/* --- BEGIN: Auxiliary functions */ + +/* s->k: the index of the CIGAR operator that has just been processed. 
+ s->x: the reference coordinate of the start of s->k + s->y: the query coordiante of the start of s->k + */ +static inline int resolve_cigar2(bam_pileup1_t *p, uint32_t pos, cstate_t *s) +{ +#define _cop(c) ((c)&BAM_CIGAR_MASK) +#define _cln(c) ((c)>>BAM_CIGAR_SHIFT) + + bam1_t *b = p->b; + bam1_core_t *c = &b->core; + uint32_t *cigar = bam1_cigar(b); + int k, is_head = 0; + // determine the current CIGAR operation +// fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam1_qname(b), pos, s->end, s->k, s->x, s->y); + if (s->k == -1) { // never processed + is_head = 1; + if (c->n_cigar == 1) { // just one operation, save a loop + if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0; + } else { // find the first match or deletion + for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) { + int op = _cop(cigar[k]); + int l = _cln(cigar[k]); + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break; + else if (op == BAM_CREF_SKIP) s->x += l; + else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; + } + assert(k < c->n_cigar); + s->k = k; + } + } else { // the read has been processed before + int op, l = _cln(cigar[s->k]); + if (pos - s->x >= l) { // jump to the next operation + assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case + op = _cop(cigar[s->k+1]); + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop + if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l; + s->x += l; + ++s->k; + } else { // find the next M/D/N/=/X + if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l; + s->x += l; + for (k = s->k + 1; k < c->n_cigar; ++k) { + op = _cop(cigar[k]), l = _cln(cigar[k]); + if (op == BAM_CMATCH || 
op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break; + else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; + } + s->k = k; + } + assert(s->k < c->n_cigar); // otherwise a bug + } // else, do nothing + } + { // collect pileup information + int op, l; + op = _cop(cigar[s->k]); l = _cln(cigar[s->k]); + p->is_del = p->indel = p->is_refskip = 0; + if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation + int op2 = _cop(cigar[s->k+1]); + int l2 = _cln(cigar[s->k+1]); + if (op2 == BAM_CDEL) p->indel = -(int)l2; + else if (op2 == BAM_CINS) p->indel = l2; + else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding + int l3 = 0; + for (k = s->k + 2; k < c->n_cigar; ++k) { + op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); + if (op2 == BAM_CINS) l3 += l2; + else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break; + } + if (l3 > 0) p->indel = l3; + } + } + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + p->qpos = s->y + (pos - s->x); + } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { + p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!! 
+ p->is_refskip = (op == BAM_CREF_SKIP); + } // cannot be other operations; otherwise a bug + p->is_head = (pos == c->pos); p->is_tail = (pos == s->end); + } + return 1; +} + +/* --- END: Auxiliary functions */ + +/******************* + * pileup iterator * + *******************/ + +struct __bam_plp_t { + mempool_t *mp; + lbnode_t *head, *tail, *dummy; + int32_t tid, pos, max_tid, max_pos; + int is_eof, flag_mask, max_plp, error, maxcnt; + bam_pileup1_t *plp; + // for the "auto" interface only + bam1_t *b; + bam_plp_auto_f func; + void *data; +}; + +bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data) +{ + bam_plp_t iter; + iter = calloc(1, sizeof(struct __bam_plp_t)); + iter->mp = mp_init(); + iter->head = iter->tail = mp_alloc(iter->mp); + iter->dummy = mp_alloc(iter->mp); + iter->max_tid = iter->max_pos = -1; + iter->flag_mask = BAM_DEF_MASK; + iter->maxcnt = 8000; + if (func) { + iter->func = func; + iter->data = data; + iter->b = bam_init1(); + } + return iter; +} + +void bam_plp_destroy(bam_plp_t iter) +{ + mp_free(iter->mp, iter->dummy); + mp_free(iter->mp, iter->head); + if (iter->mp->cnt != 0) + fprintf(stderr, "[bam_plp_destroy] memory leak: %d. 
Continue anyway.\n", iter->mp->cnt); + mp_destroy(iter->mp); + if (iter->b) bam_destroy1(iter->b); + free(iter->plp); + free(iter); +} + +const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) +{ + if (iter->error) { *_n_plp = -1; return 0; } + *_n_plp = 0; + if (iter->is_eof && iter->head->next == 0) return 0; + while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) { + int n_plp = 0; + lbnode_t *p, *q; + // write iter->plp at iter->pos + iter->dummy->next = iter->head; + for (p = iter->head, q = iter->dummy; p->next; q = p, p = p->next) { + if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove + q->next = p->next; mp_free(iter->mp, p); p = q; + } else if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup + if (n_plp == iter->max_plp) { // then double the capacity + iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256; + iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp); + } + iter->plp[n_plp].b = &p->b; + if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true... + } + } + iter->head = iter->dummy->next; // dummy->next may be changed + *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos; + // update iter->tid and iter->pos + if (iter->head->next) { + if (iter->tid > iter->head->b.core.tid) { + fprintf(stderr, "[%s] unsorted input. 
Pileup aborts.\n", __func__); + iter->error = 1; + *_n_plp = -1; + return 0; + } + } + if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence + iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference + } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid + iter->pos = iter->head->beg; // jump to the next position + } else ++iter->pos; // scan contiguously + // return + if (n_plp) return iter->plp; + if (iter->is_eof && iter->head->next == 0) break; + } + return 0; +} + +int bam_plp_push(bam_plp_t iter, const bam1_t *b) +{ + if (iter->error) return -1; + if (b) { + if (b->core.tid < 0) return 0; + if (b->core.flag & iter->flag_mask) return 0; + if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt) return 0; + bam_copy1(&iter->tail->b, b); + iter->tail->beg = b->core.pos; iter->tail->end = bam_calend(&b->core, bam1_cigar(b)); + iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t + if (b->core.tid < iter->max_tid) { + fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n"); + iter->error = 1; + return -1; + } + if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) { + fprintf(stderr, "[bam_pileup_core] the input is not sorted (reads out of order)\n"); + iter->error = 1; + return -1; + } + iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg; + if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) { + iter->tail->next = mp_alloc(iter->mp); + iter->tail = iter->tail->next; + } + } else iter->is_eof = 1; + return 0; +} + +const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) +{ + const bam_pileup1_t *plp; + if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; } + if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; + else { // no pileup line can be obtained; read alignments + 
*_n_plp = 0; + if (iter->is_eof) return 0; + while (iter->func(iter->data, iter->b) >= 0) { + if (bam_plp_push(iter, iter->b) < 0) { + *_n_plp = -1; + return 0; + } + if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; + // otherwise no pileup line can be returned; read the next alignment. + } + bam_plp_push(iter, 0); + if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; + return 0; + } +} + +void bam_plp_reset(bam_plp_t iter) +{ + lbnode_t *p, *q; + iter->max_tid = iter->max_pos = -1; + iter->tid = iter->pos = 0; + iter->is_eof = 0; + for (p = iter->head; p->next;) { + q = p->next; + mp_free(iter->mp, p); + p = q; + } + iter->head = iter->tail; +} + +void bam_plp_set_mask(bam_plp_t iter, int mask) +{ + iter->flag_mask = mask < 0? BAM_DEF_MASK : (BAM_FUNMAP | mask); +} + +void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt) +{ + iter->maxcnt = maxcnt; +} + +/***************** + * callback APIs * + *****************/ + +int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data) +{ + bam_plbuf_t *buf; + int ret; + bam1_t *b; + b = bam_init1(); + buf = bam_plbuf_init(func, func_data); + bam_plbuf_set_mask(buf, mask); + while ((ret = bam_read1(fp, b)) >= 0) + bam_plbuf_push(b, buf); + bam_plbuf_push(0, buf); + bam_plbuf_destroy(buf); + bam_destroy1(b); + return 0; +} + +void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask) +{ + bam_plp_set_mask(buf->iter, mask); +} + +void bam_plbuf_reset(bam_plbuf_t *buf) +{ + bam_plp_reset(buf->iter); +} + +bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data) +{ + bam_plbuf_t *buf; + buf = calloc(1, sizeof(bam_plbuf_t)); + buf->iter = bam_plp_init(0, 0); + buf->func = func; + buf->data = data; + return buf; +} + +void bam_plbuf_destroy(bam_plbuf_t *buf) +{ + bam_plp_destroy(buf->iter); + free(buf); +} + +int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf) +{ + int ret, n_plp, tid, pos; + const bam_pileup1_t *plp; + ret = bam_plp_push(buf->iter, b); + if (ret < 0) return ret; + 
while ((plp = bam_plp_next(buf->iter, &tid, &pos, &n_plp)) != 0) + buf->func(tid, pos, n_plp, plp, buf->data); + return 0; +} + +/*********** + * mpileup * + ***********/ + +struct __bam_mplp_t { + int n; + uint64_t min, *pos; + bam_plp_t *iter; + int *n_plp; + const bam_pileup1_t **plp; +}; + +bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data) +{ + int i; + bam_mplp_t iter; + iter = calloc(1, sizeof(struct __bam_mplp_t)); + iter->pos = calloc(n, 8); + iter->n_plp = calloc(n, sizeof(int)); + iter->plp = calloc(n, sizeof(void*)); + iter->iter = calloc(n, sizeof(void*)); + iter->n = n; + iter->min = (uint64_t)-1; + for (i = 0; i < n; ++i) { + iter->iter[i] = bam_plp_init(func, data[i]); + iter->pos[i] = iter->min; + } + return iter; +} + +void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt) +{ + int i; + for (i = 0; i < iter->n; ++i) + iter->iter[i]->maxcnt = maxcnt; +} + +void bam_mplp_destroy(bam_mplp_t iter) +{ + int i; + for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]); + free(iter->iter); free(iter->pos); free(iter->n_plp); free(iter->plp); + free(iter); +} + +int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp) +{ + int i, ret = 0; + uint64_t new_min = (uint64_t)-1; + for (i = 0; i < iter->n; ++i) { + if (iter->pos[i] == iter->min) { + int tid, pos; + iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]); + iter->pos[i] = (uint64_t)tid<<32 | pos; + } + if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i]; + } + iter->min = new_min; + if (new_min == (uint64_t)-1) return 0; + *_tid = new_min>>32; *_pos = (uint32_t)new_min; + for (i = 0; i < iter->n; ++i) { + if (iter->pos[i] == iter->min) { // FIXME: valgrind reports "uninitialised value(s) at this line" + n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i]; + ++ret; + } else n_plp[i] = 0, plp[i] = 0; + } + return ret; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bam_plcmd.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,606 @@ +#include <math.h> +#include <stdio.h> +#include <unistd.h> +#include <ctype.h> +#include <string.h> +#include <errno.h> +#include <sys/stat.h> +#include <getopt.h> +#include "sam.h" +#include "faidx.h" +#include "kstring.h" +#include "sam_header.h" + +static inline int printw(int c, FILE *fp) +{ + char buf[16]; + int l, x; + if (c == 0) return fputc('0', fp); + for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (c < 0) buf[l++] = '-'; + buf[l] = 0; + for (x = 0; x < l/2; ++x) { + int y = buf[x]; buf[x] = buf[l-1-x]; buf[l-1-x] = y; + } + fputs(buf, fp); + return 0; +} + +static inline void pileup_seq(const bam_pileup1_t *p, int pos, int ref_len, const char *ref) +{ + int j; + if (p->is_head) { + putchar('^'); + putchar(p->b->core.qual > 93? 126 : p->b->core.qual + 33); + } + if (!p->is_del) { + int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; + if (ref) { + int rb = pos < ref_len? ref[pos] : 'N'; + if (c == '=' || bam_nt16_table[c] == bam_nt16_table[rb]) c = bam1_strand(p->b)? ',' : '.'; + else c = bam1_strand(p->b)? tolower(c) : toupper(c); + } else { + if (c == '=') c = bam1_strand(p->b)? ',' : '.'; + else c = bam1_strand(p->b)? tolower(c) : toupper(c); + } + putchar(c); + } else putchar(p->is_refskip? (bam1_strand(p->b)? '<' : '>') : '*'); + if (p->indel > 0) { + putchar('+'); printw(p->indel, stdout); + for (j = 1; j <= p->indel; ++j) { + int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)]; + putchar(bam1_strand(p->b)? tolower(c) : toupper(c)); + } + } else if (p->indel < 0) { + printw(p->indel, stdout); + for (j = 1; j <= -p->indel; ++j) { + int c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N'; + putchar(bam1_strand(p->b)? 
tolower(c) : toupper(c)); + } + } + if (p->is_tail) putchar('$'); +} + +#include <assert.h> +#include "bam2bcf.h" +#include "sample.h" + +#define MPLP_GLF 0x10 +#define MPLP_NO_COMP 0x20 +#define MPLP_NO_ORPHAN 0x40 +#define MPLP_REALN 0x80 +#define MPLP_NO_INDEL 0x400 +#define MPLP_REDO_BAQ 0x800 +#define MPLP_ILLUMINA13 0x1000 +#define MPLP_IGNORE_RG 0x2000 +#define MPLP_PRINT_POS 0x4000 +#define MPLP_PRINT_MAPQ 0x8000 +#define MPLP_PER_SAMPLE 0x10000 + +void *bed_read(const char *fn); +void bed_destroy(void *_h); +int bed_overlap(const void *_h, const char *chr, int beg, int end); + +typedef struct { + int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag; + int rflag_require, rflag_filter; + int openQ, extQ, tandemQ, min_support; // for indels + double min_frac; // for indels + char *reg, *pl_list, *fai_fname; + faidx_t *fai; + void *bed, *rghash; +} mplp_conf_t; + +typedef struct { + bamFile fp; + bam_iter_t iter; + bam_header_t *h; + int ref_id; + char *ref; + const mplp_conf_t *conf; +} mplp_aux_t; + +typedef struct { + int n; + int *n_plp, *m_plp; + bam_pileup1_t **plp; +} mplp_pileup_t; + +static int mplp_func(void *data, bam1_t *b) +{ + extern int bam_realn(bam1_t *b, const char *ref); + extern int bam_prob_realn_core(bam1_t *b, const char *ref, int); + extern int bam_cap_mapQ(bam1_t *b, char *ref, int thres); + mplp_aux_t *ma = (mplp_aux_t*)data; + int ret, skip = 0; + do { + int has_ref; + ret = ma->iter? 
bam_iter_read(ma->fp, ma->iter, b) : bam_read1(ma->fp, b); + if (ret < 0) break; + if (b->core.tid < 0 || (b->core.flag&BAM_FUNMAP)) { // exclude unmapped reads + skip = 1; + continue; + } + if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) { skip = 1; continue; } + if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) { skip = 1; continue; } + if (ma->conf->bed) { // test overlap + skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b))); + if (skip) continue; + } + if (ma->conf->rghash) { // exclude read groups + uint8_t *rg = bam_aux_get(b, "RG"); + skip = (rg && bcf_str2id(ma->conf->rghash, (const char*)(rg+1)) >= 0); + if (skip) continue; + } + if (ma->conf->flag & MPLP_ILLUMINA13) { + int i; + uint8_t *qual = bam1_qual(b); + for (i = 0; i < b->core.l_qseq; ++i) + qual[i] = qual[i] > 31? qual[i] - 31 : 0; + } + has_ref = (ma->ref && ma->ref_id == b->core.tid)? 1 : 0; + skip = 0; + if (has_ref && (ma->conf->flag&MPLP_REALN)) bam_prob_realn_core(b, ma->ref, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3); + if (has_ref && ma->conf->capQ_thres > 10) { + int q = bam_cap_mapQ(b, ma->ref, ma->conf->capQ_thres); + if (q < 0) skip = 1; + else if (b->core.qual > q) b->core.qual = q; + } + else if (b->core.qual < ma->conf->min_mq) skip = 1; + else if ((ma->conf->flag&MPLP_NO_ORPHAN) && (b->core.flag&1) && !(b->core.flag&2)) skip = 1; + } while (skip); + return ret; +} + +static void group_smpl(mplp_pileup_t *m, bam_sample_t *sm, kstring_t *buf, + int n, char *const*fn, int *n_plp, const bam_pileup1_t **plp, int ignore_rg) +{ + int i, j; + memset(m->n_plp, 0, m->n * sizeof(int)); + for (i = 0; i < n; ++i) { + for (j = 0; j < n_plp[i]; ++j) { + const bam_pileup1_t *p = plp[i] + j; + uint8_t *q; + int id = -1; + q = ignore_rg? 
0 : bam_aux_get(p->b, "RG"); + if (q) id = bam_smpl_rg2smid(sm, fn[i], (char*)q+1, buf); + if (id < 0) id = bam_smpl_rg2smid(sm, fn[i], 0, buf); + if (id < 0 || id >= m->n) { + assert(q); // otherwise a bug + fprintf(stderr, "[%s] Read group %s used in file %s but absent from the header or an alignment missing read group.\n", __func__, (char*)q+1, fn[i]); + exit(1); + } + if (m->n_plp[id] == m->m_plp[id]) { + m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8; + m->plp[id] = realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]); + } + m->plp[id][m->n_plp[id]++] = *p; + } + } +} + +static int mpileup(mplp_conf_t *conf, int n, char **fn) +{ + extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list); + extern void bcf_call_del_rghash(void *rghash); + mplp_aux_t **data; + int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth; + const bam_pileup1_t **plp; + bam_mplp_t iter; + bam_header_t *h = 0; + char *ref; + void *rghash = 0; + + bcf_callaux_t *bca = 0; + bcf_callret1_t *bcr = 0; + bcf_call_t bc; + bcf_t *bp = 0; + bcf_hdr_t *bh = 0; + + bam_sample_t *sm = 0; + kstring_t buf; + mplp_pileup_t gplp; + + memset(&gplp, 0, sizeof(mplp_pileup_t)); + memset(&buf, 0, sizeof(kstring_t)); + memset(&bc, 0, sizeof(bcf_call_t)); + data = calloc(n, sizeof(void*)); + plp = calloc(n, sizeof(void*)); + n_plp = calloc(n, sizeof(int*)); + sm = bam_smpl_init(); + + // read the header and initialize data + for (i = 0; i < n; ++i) { + bam_header_t *h_tmp; + data[i] = calloc(1, sizeof(mplp_aux_t)); + data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r"); + if ( !data[i]->fp ) + { + fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno)); + exit(1); + } + data[i]->conf = conf; + h_tmp = bam_header_read(data[i]->fp); + if ( !h_tmp ) { + fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]); + exit(1); + } + data[i]->h = i? 
h : h_tmp; // for i==0, "h" has not been set yet + bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text); + rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); + if (conf->reg) { + int beg, end; + bam_index_t *idx; + idx = bam_index_load(fn[i]); + if (idx == 0) { + fprintf(stderr, "[%s] fail to load index for %s\n", __func__, fn[i]); + exit(1); + } + if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) { + fprintf(stderr, "[%s] malformatted region or wrong seqname for %s\n", __func__, fn[i]); + exit(1); + } + if (i == 0) tid0 = tid, beg0 = beg, end0 = end; + data[i]->iter = bam_iter_query(idx, tid, beg, end); + bam_index_destroy(idx); + } + if (i == 0) h = h_tmp; + else { + // FIXME: to check consistency + bam_header_destroy(h_tmp); + } + } + gplp.n = sm->n; + gplp.n_plp = calloc(sm->n, sizeof(int)); + gplp.m_plp = calloc(sm->n, sizeof(int)); + gplp.plp = calloc(sm->n, sizeof(void*)); + + fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n); + // write the VCF header + if (conf->flag & MPLP_GLF) { + kstring_t s; + bh = calloc(1, sizeof(bcf_hdr_t)); + s.l = s.m = 0; s.s = 0; + bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? 
"wu" : "w"); + for (i = 0; i < h->n_targets; ++i) { + kputs(h->target_name[i], &s); + kputc('\0', &s); + } + bh->l_nm = s.l; + bh->name = malloc(s.l); + memcpy(bh->name, s.s, s.l); + s.l = 0; + for (i = 0; i < sm->n; ++i) { + kputs(sm->smpl[i], &s); kputc('\0', &s); + } + bh->l_smpl = s.l; + bh->sname = malloc(s.l); + memcpy(bh->sname, s.s, s.l); + s.l = 0; + ksprintf(&s, "##samtoolsVersion=%s\n", BAM_VERSION); + if (conf->fai_fname) ksprintf(&s, "##reference=file://%s\n", conf->fai_fname); + h->dict = sam_header_parse2(h->text); + int nseq; + const char *tags[] = {"SN","LN","UR","M5",NULL}; + char **tbl = sam_header2tbl_n(h->dict, "SQ", tags, &nseq); + for (i=0; i<nseq; i++) + { + ksprintf(&s, "##contig=<ID=%s", tbl[4*i]); + if ( tbl[4*i+1] ) ksprintf(&s, ",length=%s", tbl[4*i+1]); + if ( tbl[4*i+2] ) ksprintf(&s, ",URL=%s", tbl[4*i+2]); + if ( tbl[4*i+3] ) ksprintf(&s, ",md5=%s", tbl[4*i+3]); + kputs(">\n", &s); + } + if (tbl) free(tbl); + bh->txt = s.s; + bh->l_txt = 1 + s.l; + bcf_hdr_sync(bh); + bcf_hdr_write(bp, bh); + bca = bcf_call_init(-1., conf->min_baseQ); + bcr = calloc(sm->n, sizeof(bcf_callret1_t)); + bca->rghash = rghash; + bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ; + bca->min_frac = conf->min_frac; + bca->min_support = conf->min_support; + bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; + } + if (tid0 >= 0 && conf->fai) { // region is set + ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len); + ref_tid = tid0; + for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0; + } else ref_tid = -1, ref = 0; + iter = bam_mplp_init(n, mplp_func, (void**)data); + max_depth = conf->max_depth; + if (max_depth * sm->n > 1<<20) + fprintf(stderr, "(%s) Max depth is above 1M. 
Potential memory hog!\n", __func__); + if (max_depth * sm->n < 8000) { + max_depth = 8000 / sm->n; + fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth); + } + max_indel_depth = conf->max_indel_depth * sm->n; + bam_mplp_set_maxcnt(iter, max_depth); + while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) { + if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested + if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; + if (tid != ref_tid) { + free(ref); ref = 0; + if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len); + for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid; + ref_tid = tid; + } + if (conf->flag & MPLP_GLF) { + int total_depth, _ref0, ref16; + bcf1_t *b = calloc(1, sizeof(bcf1_t)); + for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i]; + group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG); + _ref0 = (ref && pos < ref_len)? ref[pos] : 'N'; + ref16 = bam_nt16_table[_ref0]; + for (i = 0; i < gplp.n; ++i) + bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i); + bcf_call_combine(gplp.n, bcr, bca, ref16, &bc); + bcf_call2bcf(tid, pos, &bc, b, bcr, conf->fmt_flag, 0, 0); + bcf_write(bp, bh, b); + bcf_destroy(b); + // call indels + if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { + for (i = 0; i < gplp.n; ++i) + bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i); + if (bcf_call_combine(gplp.n, bcr, bca, -1, &bc) >= 0) { + b = calloc(1, sizeof(bcf1_t)); + bcf_call2bcf(tid, pos, &bc, b, bcr, conf->fmt_flag, bca, ref); + bcf_write(bp, bh, b); + bcf_destroy(b); + } + } + } else { + printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? 
ref[pos] : 'N'); + for (i = 0; i < n; ++i) { + int j, cnt; + for (j = cnt = 0; j < n_plp[i]; ++j) { + const bam_pileup1_t *p = plp[i] + j; + if (bam1_qual(p->b)[p->qpos] >= conf->min_baseQ) ++cnt; + } + printf("\t%d\t", cnt); + if (n_plp[i] == 0) { + printf("*\t*"); // FIXME: printf() is very slow... + if (conf->flag & MPLP_PRINT_POS) printf("\t*"); + } else { + for (j = 0; j < n_plp[i]; ++j) { + const bam_pileup1_t *p = plp[i] + j; + if (bam1_qual(p->b)[p->qpos] >= conf->min_baseQ) + pileup_seq(plp[i] + j, pos, ref_len, ref); + } + putchar('\t'); + for (j = 0; j < n_plp[i]; ++j) { + const bam_pileup1_t *p = plp[i] + j; + int c = bam1_qual(p->b)[p->qpos]; + if (c >= conf->min_baseQ) { + c = c + 33 < 126? c + 33 : 126; + putchar(c); + } + } + if (conf->flag & MPLP_PRINT_MAPQ) { + putchar('\t'); + for (j = 0; j < n_plp[i]; ++j) { + int c = plp[i][j].b->core.qual + 33; + if (c > 126) c = 126; + putchar(c); + } + } + if (conf->flag & MPLP_PRINT_POS) { + putchar('\t'); + for (j = 0; j < n_plp[i]; ++j) { + if (j > 0) putchar(','); + printf("%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow... 
+ } + } + } + } + putchar('\n'); + } + } + + bcf_close(bp); + bam_smpl_destroy(sm); free(buf.s); + for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]); + free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp); + bcf_call_del_rghash(rghash); + bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr); + bam_mplp_destroy(iter); + bam_header_destroy(h); + for (i = 0; i < n; ++i) { + bam_close(data[i]->fp); + if (data[i]->iter) bam_iter_destroy(data[i]->iter); + free(data[i]); + } + free(data); free(plp); free(ref); free(n_plp); + return 0; +} + +#define MAX_PATH_LEN 1024 +int read_file_list(const char *file_list,int *n,char **argv[]) +{ + char buf[MAX_PATH_LEN]; + int len, nfiles = 0; + char **files = NULL; + struct stat sb; + + *n = 0; + *argv = NULL; + + FILE *fh = fopen(file_list,"r"); + if ( !fh ) + { + fprintf(stderr,"%s: %s\n", file_list,strerror(errno)); + return 1; + } + + files = calloc(nfiles,sizeof(char*)); + nfiles = 0; + while ( fgets(buf,MAX_PATH_LEN,fh) ) + { + // allow empty lines and trailing spaces + len = strlen(buf); + while ( len>0 && isspace(buf[len-1]) ) len--; + if ( !len ) continue; + + // check sanity of the file list + buf[len] = 0; + if (stat(buf, &sb) != 0) + { + // no such file, check if it is safe to print its name + int i, safe_to_print = 1; + for (i=0; i<len; i++) + if (!isprint(buf[i])) { safe_to_print = 0; break; } + if ( safe_to_print ) + fprintf(stderr,"The file list \"%s\" appears broken, could not locate: %s\n", file_list,buf); + else + fprintf(stderr,"Does the file \"%s\" really contain a list of files and do all exist?\n", file_list); + return 1; + } + + nfiles++; + files = realloc(files,nfiles*sizeof(char*)); + files[nfiles-1] = strdup(buf); + } + fclose(fh); + if ( !nfiles ) + { + fprintf(stderr,"No files read from %s\n", file_list); + return 1; + } + *argv = files; + *n = nfiles; + return 0; +} +#undef MAX_PATH_LEN + +int bam_mpileup(int argc, char *argv[]) +{ + int c; + const char *file_list = NULL; + char **fn = NULL; + 
int nfiles = 0, use_orphan = 0; + mplp_conf_t mplp; + memset(&mplp, 0, sizeof(mplp_conf_t)); + mplp.max_mq = 60; + mplp.min_baseQ = 13; + mplp.capQ_thres = 0; + mplp.max_depth = 250; mplp.max_indel_depth = 250; + mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100; + mplp.min_frac = 0.002; mplp.min_support = 1; + mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN; + static struct option lopts[] = + { + {"rf",1,0,1}, // require flag + {"ff",1,0,2}, // filter flag + {0,0,0,0} + }; + while ((c = getopt_long(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsV1:2:",lopts,NULL)) >= 0) { + switch (c) { + case 1 : mplp.rflag_require = strtol(optarg,0,0); break; + case 2 : mplp.rflag_filter = strtol(optarg,0,0); break; + case 'f': + mplp.fai = fai_load(optarg); + if (mplp.fai == 0) return 1; + mplp.fai_fname = optarg; + break; + case 'd': mplp.max_depth = atoi(optarg); break; + case 'r': mplp.reg = strdup(optarg); break; + case 'l': mplp.bed = bed_read(optarg); break; + case 'P': mplp.pl_list = strdup(optarg); break; + case 'p': mplp.flag |= MPLP_PER_SAMPLE; break; + case 'g': mplp.flag |= MPLP_GLF; break; + case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_GLF; break; + case 'a': mplp.flag |= MPLP_NO_ORPHAN | MPLP_REALN; break; + case 'B': mplp.flag &= ~MPLP_REALN; break; + case 'D': mplp.fmt_flag |= B2B_FMT_DP; break; + case 'S': mplp.fmt_flag |= B2B_FMT_SP; break; + case 'V': mplp.fmt_flag |= B2B_FMT_DV; break; + case 'I': mplp.flag |= MPLP_NO_INDEL; break; + case 'E': mplp.flag |= MPLP_REDO_BAQ; break; + case '6': mplp.flag |= MPLP_ILLUMINA13; break; + case 'R': mplp.flag |= MPLP_IGNORE_RG; break; + case 's': mplp.flag |= MPLP_PRINT_MAPQ; break; + case 'O': mplp.flag |= MPLP_PRINT_POS; break; + case 'C': mplp.capQ_thres = atoi(optarg); break; + case 'M': mplp.max_mq = atoi(optarg); break; + case 'q': mplp.min_mq = atoi(optarg); break; + case 'Q': mplp.min_baseQ = atoi(optarg); break; + case 'b': file_list = optarg; break; + case 'o': mplp.openQ = atoi(optarg); break; + case 
'e': mplp.extQ = atoi(optarg); break; + case 'h': mplp.tandemQ = atoi(optarg); break; + case 'A': use_orphan = 1; break; + case 'F': mplp.min_frac = atof(optarg); break; + case 'm': mplp.min_support = atoi(optarg); break; + case 'L': mplp.max_indel_depth = atoi(optarg); break; + case 'G': { + FILE *fp_rg; + char buf[1024]; + mplp.rghash = bcf_str2id_init(); + if ((fp_rg = fopen(optarg, "r")) == 0) + fprintf(stderr, "(%s) Fail to open file %s. Continue anyway.\n", __func__, optarg); + while (!feof(fp_rg) && fscanf(fp_rg, "%s", buf) > 0) // this is not a good style, but forgive me... + bcf_str2id_add(mplp.rghash, strdup(buf)); + fclose(fp_rg); + } + break; + } + } + if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN; + if (argc == 1) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools mpileup [options] in1.bam [in2.bam [...]]\n\n"); + fprintf(stderr, "Input options:\n\n"); + fprintf(stderr, " -6 assume the quality is in the Illumina-1.3+ encoding\n"); + fprintf(stderr, " -A count anomalous read pairs\n"); + fprintf(stderr, " -B disable BAQ computation\n"); + fprintf(stderr, " -b FILE list of input BAM filenames, one per line [null]\n"); + fprintf(stderr, " -C INT parameter for adjusting mapQ; 0 to disable [0]\n"); + fprintf(stderr, " -d INT max per-BAM depth to avoid excessive memory usage [%d]\n", mplp.max_depth); + fprintf(stderr, " -E recalculate extended BAQ on the fly thus ignoring existing BQs\n"); + fprintf(stderr, " -f FILE faidx indexed reference sequence file [null]\n"); + fprintf(stderr, " -G FILE exclude read groups listed in FILE [null]\n"); + fprintf(stderr, " -l FILE list of positions (chr pos) or regions (BED) [null]\n"); + fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq); + fprintf(stderr, " -r STR region in which pileup is generated [null]\n"); + fprintf(stderr, " -R ignore RG tags\n"); + fprintf(stderr, " -q INT skip alignments with mapQ smaller than INT [%d]\n", mplp.min_mq); + fprintf(stderr, " -Q INT skip bases with 
baseQ/BAQ smaller than INT [%d]\n", mplp.min_baseQ); + fprintf(stderr, " --rf INT required flags: skip reads with mask bits unset []\n"); + fprintf(stderr, " --ff INT filter flags: skip reads with mask bits set []\n"); + fprintf(stderr, "\nOutput options:\n\n"); + fprintf(stderr, " -D output per-sample DP in BCF (require -g/-u)\n"); + fprintf(stderr, " -g generate BCF output (genotype likelihoods)\n"); + fprintf(stderr, " -O output base positions on reads (disabled by -g/-u)\n"); + fprintf(stderr, " -s output mapping quality (disabled by -g/-u)\n"); + fprintf(stderr, " -S output per-sample strand bias P-value in BCF (require -g/-u)\n"); + fprintf(stderr, " -u generate uncompress BCF output\n"); + fprintf(stderr, "\nSNP/INDEL genotype likelihoods options (effective with `-g' or `-u'):\n\n"); + fprintf(stderr, " -e INT Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ); + fprintf(stderr, " -F FLOAT minimum fraction of gapped reads for candidates [%g]\n", mplp.min_frac); + fprintf(stderr, " -h INT coefficient for homopolymer errors [%d]\n", mplp.tandemQ); + fprintf(stderr, " -I do not perform indel calling\n"); + fprintf(stderr, " -L INT max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth); + fprintf(stderr, " -m INT minimum gapped reads for indel candidates [%d]\n", mplp.min_support); + fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ); + fprintf(stderr, " -p apply -m and -F per-sample to increase sensitivity\n"); + fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Notes: Assuming diploid individuals.\n\n"); + return 1; + } + bam_no_B = 1; + if (file_list) { + if ( read_file_list(file_list,&nfiles,&fn) ) return 1; + mpileup(&mplp,nfiles,fn); + for (c=0; c<nfiles; c++) free(fn[c]); + free(fn); + } else mpileup(&mplp, argc - optind, argv + optind); + if (mplp.rghash) bcf_str2id_thorough_destroy(mplp.rghash); + 
free(mplp.reg); free(mplp.pl_list); + if (mplp.fai) fai_destroy(mplp.fai); + if (mplp.bed) bed_destroy(mplp.bed); + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bam_reheader.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,62 @@ +#include <stdio.h> +#include <stdlib.h> +#include "knetfile.h" +#include "bgzf.h" +#include "bam.h" + +#define BUF_SIZE 0x10000 + +int bam_reheader(BGZF *in, const bam_header_t *h, int fd) +{ + BGZF *fp; + bam_header_t *old; + int len; + uint8_t *buf; + if (in->is_write) return -1; + buf = malloc(BUF_SIZE); + old = bam_header_read(in); + fp = bgzf_fdopen(fd, "w"); + bam_header_write(fp, h); + if (in->block_offset < in->block_length) { + bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset); + bgzf_flush(fp); + } +#ifdef _USE_KNETFILE + while ((len = knet_read(in->fp, buf, BUF_SIZE)) > 0) + fwrite(buf, 1, len, fp->fp); +#else + while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) + fwrite(buf, 1, len, fp->file); +#endif + free(buf); + fp->block_offset = in->block_offset = 0; + bgzf_close(fp); + return 0; +} + +int main_reheader(int argc, char *argv[]) +{ + bam_header_t *h; + BGZF *in; + if (argc != 3) { + fprintf(stderr, "Usage: samtools reheader <in.header.sam> <in.bam>\n"); + return 1; + } + { // read the header + tamFile fph = sam_open(argv[1]); + if (fph == 0) { + fprintf(stderr, "[%s] fail to read the header from %s.\n", __func__, argv[1]); + return 1; + } + h = sam_header_read(fph); + sam_close(fph); + } + in = strcmp(argv[2], "-")? bam_open(argv[2], "r") : bam_dopen(fileno(stdin), "r"); + if (in == 0) { + fprintf(stderr, "[%s] fail to open file %s.\n", __func__, argv[2]); + return 1; + } + bam_reheader(in, h, fileno(stdout)); + bgzf_close(in); + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bam_rmdup.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,206 @@ +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <zlib.h> +#include <unistd.h> +#include "sam.h" + +typedef bam1_t *bam1_p; + +#include "khash.h" +KHASH_SET_INIT_STR(name) +KHASH_MAP_INIT_INT64(pos, bam1_p) + +#define BUFFER_SIZE 0x40000 + +typedef struct { + uint64_t n_checked, n_removed; + khash_t(pos) *best_hash; +} lib_aux_t; +KHASH_MAP_INIT_STR(lib, lib_aux_t) + +typedef struct { + int n, max; + bam1_t **a; +} tmp_stack_t; + +static inline void stack_insert(tmp_stack_t *stack, bam1_t *b) +{ + if (stack->n == stack->max) { + stack->max = stack->max? stack->max<<1 : 0x10000; + stack->a = (bam1_t**)realloc(stack->a, sizeof(bam1_t*) * stack->max); + } + stack->a[stack->n++] = b; +} + +static inline void dump_best(tmp_stack_t *stack, samfile_t *out) +{ + int i; + for (i = 0; i != stack->n; ++i) { + samwrite(out, stack->a[i]); + bam_destroy1(stack->a[i]); + } + stack->n = 0; +} + +static void clear_del_set(khash_t(name) *del_set) +{ + khint_t k; + for (k = kh_begin(del_set); k < kh_end(del_set); ++k) + if (kh_exist(del_set, k)) + free((char*)kh_key(del_set, k)); + kh_clear(name, del_set); +} + +static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib) +{ + khint_t k = kh_get(lib, aux, lib); + if (k == kh_end(aux)) { + int ret; + char *p = strdup(lib); + lib_aux_t *q; + k = kh_put(lib, aux, p, &ret); + q = &kh_val(aux, k); + q->n_checked = q->n_removed = 0; + q->best_hash = kh_init(pos); + return q; + } else return &kh_val(aux, k); +} + +static void clear_best(khash_t(lib) *aux, int max) +{ + khint_t k; + for (k = kh_begin(aux); k != kh_end(aux); ++k) { + if (kh_exist(aux, k)) { + lib_aux_t *q = &kh_val(aux, k); + if (kh_size(q->best_hash) >= max) + kh_clear(pos, q->best_hash); + } + } +} + +static inline int sum_qual(const bam1_t *b) +{ + int i, q; + uint8_t *qual = bam1_qual(b); + for (i = q = 0; i 
< b->core.l_qseq; ++i) q += qual[i]; + return q; +} + +void bam_rmdup_core(samfile_t *in, samfile_t *out) +{ + bam1_t *b; + int last_tid = -1, last_pos = -1; + tmp_stack_t stack; + khint_t k; + khash_t(lib) *aux; + khash_t(name) *del_set; + + aux = kh_init(lib); + del_set = kh_init(name); + b = bam_init1(); + memset(&stack, 0, sizeof(tmp_stack_t)); + + kh_resize(name, del_set, 4 * BUFFER_SIZE); + while (samread(in, b) >= 0) { + bam1_core_t *c = &b->core; + if (c->tid != last_tid || last_pos != c->pos) { + dump_best(&stack, out); // write the result + clear_best(aux, BUFFER_SIZE); + if (c->tid != last_tid) { + clear_best(aux, 0); + if (kh_size(del_set)) { // check + fprintf(stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set)); + clear_del_set(del_set); + } + if ((int)c->tid == -1) { // append unmapped reads + samwrite(out, b); + while (samread(in, b) >= 0) samwrite(out, b); + break; + } + last_tid = c->tid; + fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", in->header->target_name[c->tid]); + } + } + if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) { + samwrite(out, b); + } else if (c->isize > 0) { // paired, head + uint64_t key = (uint64_t)c->pos<<32 | c->isize; + const char *lib; + lib_aux_t *q; + int ret; + lib = bam_get_library(in->header, b); + q = lib? get_aux(aux, lib) : get_aux(aux, "\t"); + ++q->n_checked; + k = kh_put(pos, q->best_hash, key, &ret); + if (ret == 0) { // found in best_hash + bam1_t *p = kh_val(q->best_hash, k); + ++q->n_removed; + if (sum_qual(p) < sum_qual(b)) { // the current alignment is better; this can be accelerated in principle + kh_put(name, del_set, strdup(bam1_qname(p)), &ret); // p will be removed + bam_copy1(p, b); // replaced as b + } else kh_put(name, del_set, strdup(bam1_qname(b)), &ret); // b will be removed + if (ret == 0) + fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. 
Continue anyway.\n", bam1_qname(b)); + } else { // not found in best_hash + kh_val(q->best_hash, k) = bam_dup1(b); + stack_insert(&stack, kh_val(q->best_hash, k)); + } + } else { // paired, tail + k = kh_get(name, del_set, bam1_qname(b)); + if (k != kh_end(del_set)) { + free((char*)kh_key(del_set, k)); + kh_del(name, del_set, k); + } else samwrite(out, b); + } + last_pos = c->pos; + } + + for (k = kh_begin(aux); k != kh_end(aux); ++k) { + if (kh_exist(aux, k)) { + lib_aux_t *q = &kh_val(aux, k); + dump_best(&stack, out); + fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, + (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k)); + kh_destroy(pos, q->best_hash); + free((char*)kh_key(aux, k)); + } + } + kh_destroy(lib, aux); + + clear_del_set(del_set); + kh_destroy(name, del_set); + free(stack.a); + bam_destroy1(b); +} + +void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se); + +int bam_rmdup(int argc, char *argv[]) +{ + int c, is_se = 0, force_se = 0; + samfile_t *in, *out; + while ((c = getopt(argc, argv, "sS")) >= 0) { + switch (c) { + case 's': is_se = 1; break; + case 'S': force_se = is_se = 1; break; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools rmdup [-sS] <input.srt.bam> <output.bam>\n\n"); + fprintf(stderr, "Option: -s rmdup for SE reads\n"); + fprintf(stderr, " -S treat PE reads as SE in rmdup (force -s)\n\n"); + return 1; + } + in = samopen(argv[optind], "rb", 0); + out = samopen(argv[optind+1], "wb", in->header); + if (in == 0 || out == 0) { + fprintf(stderr, "[bam_rmdup] fail to read/write input files\n"); + return 1; + } + if (is_se) bam_rmdupse_core(in, out, force_se); + else bam_rmdup_core(in, out); + samclose(in); samclose(out); + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bam_rmdupse.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,159 @@ +#include <math.h> +#include "sam.h" +#include "khash.h" +#include "klist.h" + +#define QUEUE_CLEAR_SIZE 0x100000 +#define MAX_POS 0x7fffffff + +typedef struct { + int endpos; + uint32_t score:31, discarded:1; + bam1_t *b; +} elem_t, *elem_p; +#define __free_elem(p) bam_destroy1((p)->data.b) +KLIST_INIT(q, elem_t, __free_elem) +typedef klist_t(q) queue_t; + +KHASH_MAP_INIT_INT(best, elem_p) +typedef khash_t(best) besthash_t; + +typedef struct { + uint64_t n_checked, n_removed; + besthash_t *left, *rght; +} lib_aux_t; +KHASH_MAP_INIT_STR(lib, lib_aux_t) + +static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib) +{ + khint_t k = kh_get(lib, aux, lib); + if (k == kh_end(aux)) { + int ret; + char *p = strdup(lib); + lib_aux_t *q; + k = kh_put(lib, aux, p, &ret); + q = &kh_val(aux, k); + q->left = kh_init(best); + q->rght = kh_init(best); + q->n_checked = q->n_removed = 0; + return q; + } else return &kh_val(aux, k); +} + +static inline int sum_qual(const bam1_t *b) +{ + int i, q; + uint8_t *qual = bam1_qual(b); + for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i]; + return q; +} + +static inline elem_t *push_queue(queue_t *queue, const bam1_t *b, int endpos, int score) +{ + elem_t *p = kl_pushp(q, queue); + p->discarded = 0; + p->endpos = endpos; p->score = score; + if (p->b == 0) p->b = bam_init1(); + bam_copy1(p->b, b); + return p; +} + +static void clear_besthash(besthash_t *h, int32_t pos) +{ + khint_t k; + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k) && kh_val(h, k)->endpos <= pos) + kh_del(best, h, k); +} + +static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(lib) *h) +{ + if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) { + khint_t k; + while (1) { + elem_t *q; + if (queue->head == queue->tail) break; + q = &kl_val(queue->head); + if (q->discarded) { + 
q->b->data_len = 0; + kl_shift(q, queue, 0); + continue; + } + if ((q->b->core.flag&BAM_FREVERSE) && q->endpos > pos) break; + samwrite(out, q->b); + q->b->data_len = 0; + kl_shift(q, queue, 0); + } + for (k = kh_begin(h); k != kh_end(h); ++k) { + if (kh_exist(h, k)) { + clear_besthash(kh_val(h, k).left, pos); + clear_besthash(kh_val(h, k).rght, pos); + } + } + } +} + +void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se) +{ + bam1_t *b; + queue_t *queue; + khint_t k; + int last_tid = -2; + khash_t(lib) *aux; + + aux = kh_init(lib); + b = bam_init1(); + queue = kl_init(q); + while (samread(in, b) >= 0) { + bam1_core_t *c = &b->core; + int endpos = bam_calend(c, bam1_cigar(b)); + int score = sum_qual(b); + + if (last_tid != c->tid) { + if (last_tid >= 0) dump_alignment(out, queue, MAX_POS, aux); + last_tid = c->tid; + } else dump_alignment(out, queue, c->pos, aux); + if ((c->flag&BAM_FUNMAP) || ((c->flag&BAM_FPAIRED) && !force_se)) { + push_queue(queue, b, endpos, score); + } else { + const char *lib; + lib_aux_t *q; + besthash_t *h; + uint32_t key; + int ret; + lib = bam_get_library(in->header, b); + q = lib? get_aux(aux, lib) : get_aux(aux, "\t"); + ++q->n_checked; + h = (c->flag&BAM_FREVERSE)? q->rght : q->left; + key = (c->flag&BAM_FREVERSE)? 
endpos : c->pos; + k = kh_put(best, h, key, &ret); + if (ret == 0) { // in the hash table + elem_t *p = kh_val(h, k); + ++q->n_removed; + if (p->score < score) { + if (c->flag&BAM_FREVERSE) { // mark "discarded" and push the queue + p->discarded = 1; + kh_val(h, k) = push_queue(queue, b, endpos, score); + } else { // replace + p->score = score; p->endpos = endpos; + bam_copy1(p->b, b); + } + } // otherwise, discard the alignment + } else kh_val(h, k) = push_queue(queue, b, endpos, score); + } + } + dump_alignment(out, queue, MAX_POS, aux); + + for (k = kh_begin(aux); k != kh_end(aux); ++k) { + if (kh_exist(aux, k)) { + lib_aux_t *q = &kh_val(aux, k); + fprintf(stderr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, + (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k)); + kh_destroy(best, q->left); kh_destroy(best, q->rght); + free((char*)kh_key(aux, k)); + } + } + kh_destroy(lib, aux); + bam_destroy1(b); + kl_destroy(q, queue); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bam_sort.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,571 @@ +#include <stdlib.h> +#include <ctype.h> +#include <assert.h> +#include <errno.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include "bam.h" +#include "ksort.h" + +static int g_is_by_qname = 0; + +static int strnum_cmp(const char *_a, const char *_b) +{ + const unsigned char *a = (const unsigned char*)_a, *b = (const unsigned char*)_b; + const unsigned char *pa = a, *pb = b; + while (*pa && *pb) { + if (isdigit(*pa) && isdigit(*pb)) { + while (*pa == '0') ++pa; + while (*pb == '0') ++pb; + while (isdigit(*pa) && isdigit(*pb) && *pa == *pb) ++pa, ++pb; + if (isdigit(*pa) && isdigit(*pb)) { + int i = 0; + while (isdigit(pa[i]) && isdigit(pb[i])) ++i; + return isdigit(pa[i])? 1 : isdigit(pb[i])? -1 : (int)*pa - (int)*pb; + } else if (isdigit(*pa)) return 1; + else if (isdigit(*pb)) return -1; + else if (pa - a != pb - b) return pa - a < pb - b? 1 : -1; + } else { + if (*pa != *pb) return (int)*pa - (int)*pb; + ++pa; ++pb; + } + } + return *pa? 1 : *pb? -1 : 0; +} + +#define HEAP_EMPTY 0xffffffffffffffffull + +typedef struct { + int i; + uint64_t pos, idx; + bam1_t *b; +} heap1_t; + +#define __pos_cmp(a, b) ((a).pos > (b).pos || ((a).pos == (b).pos && ((a).i > (b).i || ((a).i == (b).i && (a).idx > (b).idx)))) + +static inline int heap_lt(const heap1_t a, const heap1_t b) +{ + if (g_is_by_qname) { + int t; + if (a.b == 0 || b.b == 0) return a.b == 0? 
1 : 0; + t = strnum_cmp(bam1_qname(a.b), bam1_qname(b.b)); + return (t > 0 || (t == 0 && (a.b->core.flag&0xc0) > (b.b->core.flag&0xc0))); + } else return __pos_cmp(a, b); +} + +KSORT_INIT(heap, heap1_t, heap_lt) + +static void swap_header_targets(bam_header_t *h1, bam_header_t *h2) +{ + bam_header_t t; + t.n_targets = h1->n_targets, h1->n_targets = h2->n_targets, h2->n_targets = t.n_targets; + t.target_name = h1->target_name, h1->target_name = h2->target_name, h2->target_name = t.target_name; + t.target_len = h1->target_len, h1->target_len = h2->target_len, h2->target_len = t.target_len; +} + +static void swap_header_text(bam_header_t *h1, bam_header_t *h2) +{ + int tempi; + char *temps; + tempi = h1->l_text, h1->l_text = h2->l_text, h2->l_text = tempi; + temps = h1->text, h1->text = h2->text, h2->text = temps; +} + +#define MERGE_RG 1 +#define MERGE_UNCOMP 2 +#define MERGE_LEVEL1 4 +#define MERGE_FORCE 8 + +/*! + @abstract Merge multiple sorted BAM. + @param is_by_qname whether to sort by query name + @param out output BAM file name + @param headers name of SAM file from which to copy '@' header lines, + or NULL to copy them from the first file to be merged + @param n number of files to be merged + @param fn names of files to be merged + + @discussion Padding information may NOT correctly maintained. This + function is NOT thread safe. 
+ */ +int bam_merge_core2(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg, int n_threads, int level) +{ + bamFile fpout, *fp; + heap1_t *heap; + bam_header_t *hout = 0; + bam_header_t *hheaders = NULL; + int i, j, *RG_len = 0; + uint64_t idx = 0; + char **RG = 0, mode[8]; + bam_iter_t *iter = 0; + + if (headers) { + tamFile fpheaders = sam_open(headers); + if (fpheaders == 0) { + const char *message = strerror(errno); + fprintf(stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message); + return -1; + } + hheaders = sam_header_read(fpheaders); + sam_close(fpheaders); + } + + g_is_by_qname = by_qname; + fp = (bamFile*)calloc(n, sizeof(bamFile)); + heap = (heap1_t*)calloc(n, sizeof(heap1_t)); + iter = (bam_iter_t*)calloc(n, sizeof(bam_iter_t)); + // prepare RG tag + if (flag & MERGE_RG) { + RG = (char**)calloc(n, sizeof(void*)); + RG_len = (int*)calloc(n, sizeof(int)); + for (i = 0; i != n; ++i) { + int l = strlen(fn[i]); + const char *s = fn[i]; + if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4; + for (j = l - 1; j >= 0; --j) if (s[j] == '/') break; + ++j; l -= j; + RG[i] = calloc(l + 1, 1); + RG_len[i] = l; + strncpy(RG[i], s + j, l); + } + } + // read the first + for (i = 0; i != n; ++i) { + bam_header_t *hin; + fp[i] = bam_open(fn[i], "r"); + if (fp[i] == 0) { + int j; + fprintf(stderr, "[bam_merge_core] fail to open file %s\n", fn[i]); + for (j = 0; j < i; ++j) bam_close(fp[j]); + free(fp); free(heap); + // FIXME: possible memory leak + return -1; + } + hin = bam_header_read(fp[i]); + if (i == 0) { // the first BAM + hout = hin; + } else { // validate multiple baf + int min_n_targets = hout->n_targets; + if (hin->n_targets < min_n_targets) min_n_targets = hin->n_targets; + + for (j = 0; j < min_n_targets; ++j) + if (strcmp(hout->target_name[j], hin->target_name[j]) != 0) { + fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'\n", + hout->target_name[j], 
hin->target_name[j], fn[i]); + return -1; + } + + // If this input file has additional target reference sequences, + // add them to the headers to be output + if (hin->n_targets > hout->n_targets) { + swap_header_targets(hout, hin); + // FIXME Possibly we should also create @SQ text headers + // for the newly added reference sequences + } + + bam_header_destroy(hin); + } + } + + if (hheaders) { + // If the text headers to be swapped in include any @SQ headers, + // check that they are consistent with the existing binary list + // of reference information. + if (hheaders->n_targets > 0) { + if (hout->n_targets != hheaders->n_targets) { + fprintf(stderr, "[bam_merge_core] number of @SQ headers in '%s' differs from number of target sequences\n", headers); + if (!reg) return -1; + } + for (j = 0; j < hout->n_targets; ++j) + if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) { + fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence\n", hheaders->target_name[j], headers); + if (!reg) return -1; + } + } + + swap_header_text(hout, hheaders); + bam_header_destroy(hheaders); + } + + if (reg) { + int tid, beg, end; + if (bam_parse_region(hout, reg, &tid, &beg, &end) < 0) { + fprintf(stderr, "[%s] Malformated region string or undefined reference name\n", __func__); + return -1; + } + for (i = 0; i < n; ++i) { + bam_index_t *idx; + idx = bam_index_load(fn[i]); + iter[i] = bam_iter_query(idx, tid, beg, end); + bam_index_destroy(idx); + } + } + + for (i = 0; i < n; ++i) { + heap1_t *h = heap + i; + h->i = i; + h->b = (bam1_t*)calloc(1, sizeof(bam1_t)); + if (bam_iter_read(fp[i], iter[i], h->b) >= 0) { + h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam1_strand(h->b); + h->idx = idx++; + } + else h->pos = HEAP_EMPTY; + } + if (flag & MERGE_UNCOMP) level = 0; + else if (flag & MERGE_LEVEL1) level = 1; + strcpy(mode, "w"); + if (level >= 0) sprintf(mode + 1, "%d", level < 9? 
level : 9); + if ((fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w")) == 0) { + fprintf(stderr, "[%s] fail to create the output file.\n", __func__); + return -1; + } + bam_header_write(fpout, hout); + bam_header_destroy(hout); + if (!(flag & MERGE_UNCOMP)) bgzf_mt(fpout, n_threads, 256); + + ks_heapmake(heap, n, heap); + while (heap->pos != HEAP_EMPTY) { + bam1_t *b = heap->b; + if (flag & MERGE_RG) { + uint8_t *rg = bam_aux_get(b, "RG"); + if (rg) bam_aux_del(b, rg); + bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]); + } + bam_write1_core(fpout, &b->core, b->data_len, b->data); + if ((j = bam_iter_read(fp[heap->i], iter[heap->i], b)) >= 0) { + heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam1_strand(b); + heap->idx = idx++; + } else if (j == -1) { + heap->pos = HEAP_EMPTY; + free(heap->b->data); free(heap->b); + heap->b = 0; + } else fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]); + ks_heapadjust(heap, 0, n, heap); + } + + if (flag & MERGE_RG) { + for (i = 0; i != n; ++i) free(RG[i]); + free(RG); free(RG_len); + } + for (i = 0; i != n; ++i) { + bam_iter_destroy(iter[i]); + bam_close(fp[i]); + } + bam_close(fpout); + free(fp); free(heap); free(iter); + return 0; +} + +int bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg) +{ + return bam_merge_core2(by_qname, out, headers, n, fn, flag, reg, 0, -1); +} + +int bam_merge(int argc, char *argv[]) +{ + int c, is_by_qname = 0, flag = 0, ret = 0, n_threads = 0, level = -1; + char *fn_headers = NULL, *reg = 0; + + while ((c = getopt(argc, argv, "h:nru1R:f@:l:")) >= 0) { + switch (c) { + case 'r': flag |= MERGE_RG; break; + case 'f': flag |= MERGE_FORCE; break; + case 'h': fn_headers = strdup(optarg); break; + case 'n': is_by_qname = 1; break; + case '1': flag |= MERGE_LEVEL1; break; + case 'u': flag |= MERGE_UNCOMP; break; + case 
'R': reg = strdup(optarg); break; + case 'l': level = atoi(optarg); break; + case '@': n_threads = atoi(optarg); break; + } + } + if (optind + 2 >= argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools merge [-nr] [-h inh.sam] <out.bam> <in1.bam> <in2.bam> [...]\n\n"); + fprintf(stderr, "Options: -n sort by read names\n"); + fprintf(stderr, " -r attach RG tag (inferred from file names)\n"); + fprintf(stderr, " -u uncompressed BAM output\n"); + fprintf(stderr, " -f overwrite the output BAM if exist\n"); + fprintf(stderr, " -1 compress level 1\n"); + fprintf(stderr, " -l INT compression level, from 0 to 9 [-1]\n"); + fprintf(stderr, " -@ INT number of BAM compression threads [0]\n"); + fprintf(stderr, " -R STR merge file in the specified region STR [all]\n"); + fprintf(stderr, " -h FILE copy the header in FILE to <out.bam> [in1.bam]\n\n"); + fprintf(stderr, "Note: Samtools' merge does not reconstruct the @RG dictionary in the header. Users\n"); + fprintf(stderr, " must provide the correct header with -h, or uses Picard which properly maintains\n"); + fprintf(stderr, " the header dictionary in merging.\n\n"); + return 1; + } + if (!(flag & MERGE_FORCE) && strcmp(argv[optind], "-")) { + FILE *fp = fopen(argv[optind], "rb"); + if (fp != NULL) { + fclose(fp); + fprintf(stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. 
Abort.\n", __func__, argv[optind]); + return 1; + } + } + if (bam_merge_core2(is_by_qname, argv[optind], fn_headers, argc - optind - 1, argv + optind + 1, flag, reg, n_threads, level) < 0) ret = 1; + free(reg); + free(fn_headers); + return ret; +} + +/*************** + * BAM sorting * + ***************/ + +#include <pthread.h> + +typedef bam1_t *bam1_p; + +static int change_SO(bam_header_t *h, const char *so) +{ + char *p, *q, *beg = 0, *end = 0, *newtext; + if (h->l_text > 3) { + if (strncmp(h->text, "@HD", 3) == 0) { + if ((p = strchr(h->text, '\n')) == 0) return -1; + *p = '\0'; + if ((q = strstr(h->text, "\tSO:")) != 0) { + *p = '\n'; // change back + if (strncmp(q + 4, so, p - q - 4) != 0) { + beg = q; + for (q += 4; *q != '\n' && *q != '\t'; ++q); + end = q; + } else return 0; // no need to change + } else beg = end = p, *p = '\n'; + } + } + if (beg == 0) { // no @HD + h->l_text += strlen(so) + 15; + newtext = malloc(h->l_text + 1); + sprintf(newtext, "@HD\tVN:1.3\tSO:%s\n", so); + strcat(newtext, h->text); + } else { // has @HD but different or no SO + h->l_text = (beg - h->text) + (4 + strlen(so)) + (h->text + h->l_text - end); + newtext = malloc(h->l_text + 1); + strncpy(newtext, h->text, beg - h->text); + sprintf(newtext + (beg - h->text), "\tSO:%s", so); + strcat(newtext, end); + } + free(h->text); + h->text = newtext; + return 0; +} + +static inline int bam1_lt(const bam1_p a, const bam1_p b) +{ + if (g_is_by_qname) { + int t = strnum_cmp(bam1_qname(a), bam1_qname(b)); + return (t < 0 || (t == 0 && (a->core.flag&0xc0) < (b->core.flag&0xc0))); + } else return (((uint64_t)a->core.tid<<32|(a->core.pos+1)<<1|bam1_strand(a)) < ((uint64_t)b->core.tid<<32|(b->core.pos+1)<<1|bam1_strand(b))); +} +KSORT_INIT(sort, bam1_p, bam1_lt) + +typedef struct { + size_t buf_len; + const char *prefix; + bam1_p *buf; + const bam_header_t *h; + int index; +} worker_t; + +static void write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, const bam_header_t *h, 
int n_threads) +{ + size_t i; + bamFile fp; + fp = strcmp(fn, "-")? bam_open(fn, mode) : bam_dopen(fileno(stdout), mode); + if (fp == 0) return; + bam_header_write(fp, h); + if (n_threads > 1) bgzf_mt(fp, n_threads, 256); + for (i = 0; i < l; ++i) + bam_write1_core(fp, &buf[i]->core, buf[i]->data_len, buf[i]->data); + bam_close(fp); +} + +static void *worker(void *data) +{ + worker_t *w = (worker_t*)data; + char *name; + ks_mergesort(sort, w->buf_len, w->buf, 0); + name = (char*)calloc(strlen(w->prefix) + 20, 1); + sprintf(name, "%s.%.4d.bam", w->prefix, w->index); + write_buffer(name, "w1", w->buf_len, w->buf, w->h, 0); + free(name); + return 0; +} + +static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, const bam_header_t *h, int n_threads) +{ + int i; + size_t rest; + bam1_p *b; + pthread_t *tid; + pthread_attr_t attr; + worker_t *w; + + if (n_threads < 1) n_threads = 1; + if (k < n_threads * 64) n_threads = 1; // use a single thread if we only sort a small batch of records + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + w = calloc(n_threads, sizeof(worker_t)); + tid = calloc(n_threads, sizeof(pthread_t)); + b = buf; rest = k; + for (i = 0; i < n_threads; ++i) { + w[i].buf_len = rest / (n_threads - i); + w[i].buf = b; + w[i].prefix = prefix; + w[i].h = h; + w[i].index = n_files + i; + b += w[i].buf_len; rest -= w[i].buf_len; + pthread_create(&tid[i], &attr, worker, &w[i]); + } + for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0); + free(tid); free(w); + return n_files + n_threads; +} + +/*! + @abstract Sort an unsorted BAM file based on the chromosome order + and the leftmost position of an alignment + + @param is_by_qname whether to sort by query name + @param fn name of the file to be sorted + @param prefix prefix of the output and the temporary files; upon + sucessess, prefix.bam will be written. 
+ @param max_mem approxiate maximum memory (very inaccurate) + @param full_path the given output path is the full path and not just the prefix + + @discussion It may create multiple temporary subalignment files + and then merge them by calling bam_merge_core(). This function is + NOT thread safe. + */ +void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size_t _max_mem, int is_stdout, int n_threads, int level, int full_path) +{ + int ret, i, n_files = 0; + size_t mem, max_k, k, max_mem; + bam_header_t *header; + bamFile fp; + bam1_t *b, **buf; + char *fnout = 0; + char const *suffix = ".bam"; + if (full_path) suffix += 4; + + if (n_threads < 2) n_threads = 1; + g_is_by_qname = is_by_qname; + max_k = k = 0; mem = 0; + max_mem = _max_mem * n_threads; + buf = 0; + fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); + if (fp == 0) { + fprintf(stderr, "[bam_sort_core] fail to open file %s\n", fn); + return; + } + header = bam_header_read(fp); + if (is_by_qname) change_SO(header, "queryname"); + else change_SO(header, "coordinate"); + // write sub files + for (;;) { + if (k == max_k) { + size_t old_max = max_k; + max_k = max_k? max_k<<1 : 0x10000; + buf = realloc(buf, max_k * sizeof(void*)); + memset(buf + old_max, 0, sizeof(void*) * (max_k - old_max)); + } + if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t)); + b = buf[k]; + if ((ret = bam_read1(fp, b)) < 0) break; + if (b->data_len < b->m_data>>2) { // shrink + b->m_data = b->data_len; + kroundup32(b->m_data); + b->data = realloc(b->data, b->m_data); + } + mem += sizeof(bam1_t) + b->m_data + sizeof(void*) + sizeof(void*); // two sizeof(void*) for the data allocated to pointer arrays + ++k; + if (mem >= max_mem) { + n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads); + mem = k = 0; + } + } + if (ret != -1) + fprintf(stderr, "[bam_sort_core] truncated file. 
Continue anyway.\n"); + // output file name + fnout = calloc(strlen(prefix) + 20, 1); + if (is_stdout) sprintf(fnout, "-"); + else sprintf(fnout, "%s%s", prefix, suffix); + // write the final output + if (n_files == 0) { // a single block + char mode[8]; + strcpy(mode, "w"); + if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9); + ks_mergesort(sort, k, buf, 0); + write_buffer(fnout, mode, k, buf, header, n_threads); + } else { // then merge + char **fns; + n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads); + fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n_files); + fns = (char**)calloc(n_files, sizeof(char*)); + for (i = 0; i < n_files; ++i) { + fns[i] = (char*)calloc(strlen(prefix) + 20, 1); + sprintf(fns[i], "%s.%.4d%s", prefix, i, suffix); + } + bam_merge_core2(is_by_qname, fnout, 0, n_files, fns, 0, 0, n_threads, level); + for (i = 0; i < n_files; ++i) { + unlink(fns[i]); + free(fns[i]); + } + free(fns); + } + free(fnout); + // free + for (k = 0; k < max_k; ++k) { + if (!buf[k]) continue; + free(buf[k]->data); + free(buf[k]); + } + free(buf); + bam_header_destroy(header); + bam_close(fp); +} + +void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t max_mem) +{ + bam_sort_core_ext(is_by_qname, fn, prefix, max_mem, 0, 0, -1, 0); +} + +int bam_sort(int argc, char *argv[]) +{ + size_t max_mem = 768<<20; // 512MB + int c, is_by_qname = 0, is_stdout = 0, n_threads = 0, level = -1, full_path = 0; + while ((c = getopt(argc, argv, "fnom:@:l:")) >= 0) { + switch (c) { + case 'f': full_path = 1; break; + case 'o': is_stdout = 1; break; + case 'n': is_by_qname = 1; break; + case 'm': { + char *q; + max_mem = strtol(optarg, &q, 0); + if (*q == 'k' || *q == 'K') max_mem <<= 10; + else if (*q == 'm' || *q == 'M') max_mem <<= 20; + else if (*q == 'g' || *q == 'G') max_mem <<= 30; + break; + } + case '@': n_threads = atoi(optarg); break; + case 'l': level = atoi(optarg); break; + } + } + if (optind + 2 > argc) { 
+ fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools sort [options] <in.bam> <out.prefix>\n\n"); + fprintf(stderr, "Options: -n sort by read name\n"); + fprintf(stderr, " -f use <out.prefix> as full file name instead of prefix\n"); + fprintf(stderr, " -o final output to stdout\n"); + fprintf(stderr, " -l INT compression level, from 0 to 9 [-1]\n"); + fprintf(stderr, " -@ INT number of sorting and compression threads [1]\n"); + fprintf(stderr, " -m INT max memory per thread; suffix K/M/G recognized [768M]\n"); + fprintf(stderr, "\n"); + return 1; + } + bam_sort_core_ext(is_by_qname, argv[optind], argv[optind+1], max_mem, is_stdout, n_threads, level, full_path); + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bam_stat.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,77 @@ +#include <unistd.h> +#include <assert.h> +#include "bam.h" + +typedef struct { + long long n_reads[2], n_mapped[2], n_pair_all[2], n_pair_map[2], n_pair_good[2]; + long long n_sgltn[2], n_read1[2], n_read2[2]; + long long n_dup[2]; + long long n_diffchr[2], n_diffhigh[2]; +} bam_flagstat_t; + +#define flagstat_loop(s, c) do { \ + int w = ((c)->flag & BAM_FQCFAIL)? 1 : 0; \ + ++(s)->n_reads[w]; \ + if ((c)->flag & BAM_FPAIRED) { \ + ++(s)->n_pair_all[w]; \ + if ((c)->flag & BAM_FPROPER_PAIR) ++(s)->n_pair_good[w]; \ + if ((c)->flag & BAM_FREAD1) ++(s)->n_read1[w]; \ + if ((c)->flag & BAM_FREAD2) ++(s)->n_read2[w]; \ + if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn[w]; \ + if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) { \ + ++(s)->n_pair_map[w]; \ + if ((c)->mtid != (c)->tid) { \ + ++(s)->n_diffchr[w]; \ + if ((c)->qual >= 5) ++(s)->n_diffhigh[w]; \ + } \ + } \ + } \ + if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped[w]; \ + if ((c)->flag & BAM_FDUP) ++(s)->n_dup[w]; \ + } while (0) + +bam_flagstat_t *bam_flagstat_core(bamFile fp) +{ + bam_flagstat_t *s; + bam1_t *b; + bam1_core_t *c; + int ret; + s = (bam_flagstat_t*)calloc(1, sizeof(bam_flagstat_t)); + b = bam_init1(); + c = &b->core; + while ((ret = bam_read1(fp, b)) >= 0) + flagstat_loop(s, c); + bam_destroy1(b); + if (ret != -1) + fprintf(stderr, "[bam_flagstat_core] Truncated file? Continue anyway.\n"); + return s; +} +int bam_flagstat(int argc, char *argv[]) +{ + bamFile fp; + bam_header_t *header; + bam_flagstat_t *s; + if (argc == optind) { + fprintf(stderr, "Usage: samtools flagstat <in.bam>\n"); + return 1; + } + fp = strcmp(argv[optind], "-")? 
bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r"); + assert(fp); + header = bam_header_read(fp); + s = bam_flagstat_core(fp); + printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); + printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]); + printf("%lld + %lld mapped (%.2f%%:%.2f%%)\n", s->n_mapped[0], s->n_mapped[1], (float)s->n_mapped[0] / s->n_reads[0] * 100.0, (float)s->n_mapped[1] / s->n_reads[1] * 100.0); + printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); + printf("%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]); + printf("%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]); + printf("%lld + %lld properly paired (%.2f%%:%.2f%%)\n", s->n_pair_good[0], s->n_pair_good[1], (float)s->n_pair_good[0] / s->n_pair_all[0] * 100.0, (float)s->n_pair_good[1] / s->n_pair_all[1] * 100.0); + printf("%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]); + printf("%lld + %lld singletons (%.2f%%:%.2f%%)\n", s->n_sgltn[0], s->n_sgltn[1], (float)s->n_sgltn[0] / s->n_pair_all[0] * 100.0, (float)s->n_sgltn[1] / s->n_pair_all[1] * 100.0); + printf("%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]); + printf("%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); + free(s); + bam_header_destroy(header); + bam_close(fp); + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bam_tview.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,368 @@ +#include <assert.h> +#include "bam_tview.h" + +int base_tv_init(tview_t* tv,const char *fn, const char *fn_fa, const char *samples) + { + assert(tv!=NULL); + assert(fn!=NULL); + tv->mrow = 24; tv->mcol = 80; + tv->color_for = TV_COLOR_MAPQ; + tv->is_dot = 1; + + tv->fp = bam_open(fn, "r"); + if(tv->fp==0) + { + fprintf(stderr,"bam_open %s. %s\n", fn,fn_fa); + exit(EXIT_FAILURE); + } + bgzf_set_cache_size(tv->fp, 8 * 1024 *1024); + assert(tv->fp); + + tv->header = bam_header_read(tv->fp); + if(tv->header==0) + { + fprintf(stderr,"Cannot read '%s'.\n", fn); + exit(EXIT_FAILURE); + } + tv->idx = bam_index_load(fn); + if (tv->idx == 0) + { + fprintf(stderr,"Cannot read index for '%s'.\n", fn); + exit(EXIT_FAILURE); + } + tv->lplbuf = bam_lplbuf_init(tv_pl_func, tv); + if (fn_fa) tv->fai = fai_load(fn_fa); + tv->bca = bcf_call_init(0.83, 13); + tv->ins = 1; + + if ( samples ) + { + if ( !tv->header->dict ) tv->header->dict = sam_header_parse2(tv->header->text); + void *iter = tv->header->dict; + const char *key, *val; + int n = 0; + tv->rg_hash = kh_init(kh_rg); + while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) ) + { + if ( !strcmp(samples,key) || (val && !strcmp(samples,val)) ) + { + khiter_t k = kh_get(kh_rg, tv->rg_hash, key); + if ( k != kh_end(tv->rg_hash) ) continue; + int ret; + k = kh_put(kh_rg, tv->rg_hash, key, &ret); + kh_value(tv->rg_hash, k) = val; + n++; + } + } + if ( !n ) + { + fprintf(stderr,"The sample or read group \"%s\" not present.\n", samples); + exit(EXIT_FAILURE); + } + } + + return 0; + } + + +void base_tv_destroy(tview_t* tv) + { + bam_lplbuf_destroy(tv->lplbuf); + bcf_call_destroy(tv->bca); + bam_index_destroy(tv->idx); + if (tv->fai) fai_destroy(tv->fai); + free(tv->ref); + bam_header_destroy(tv->header); + bam_close(tv->fp); + } + + +int tv_pl_func(uint32_t tid, uint32_t pos, 
int n, const bam_pileup1_t *pl, void *data) +{ + extern unsigned char bam_nt16_table[256]; + tview_t *tv = (tview_t*)data; + int i, j, c, rb, attr, max_ins = 0; + uint32_t call = 0; + if (pos < tv->left_pos || tv->ccol > tv->mcol) return 0; // out of screen + // print referece + rb = (tv->ref && pos - tv->left_pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N'; + for (i = tv->last_pos + 1; i < pos; ++i) { + if (i%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", i+1); + c = tv->ref? tv->ref[i - tv->left_pos] : 'N'; + tv->my_mvaddch(tv,1, tv->ccol++, c); + } + if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", pos+1); + { // call consensus + bcf_callret1_t bcr; + int qsum[4], a1, a2, tmp; + double p[3], prior = 30; + bcf_call_glfgen(n, pl, bam_nt16_table[rb], tv->bca, &bcr); + for (i = 0; i < 4; ++i) qsum[i] = bcr.qsum[i]<<2 | i; + for (i = 1; i < 4; ++i) // insertion sort + for (j = i; j > 0 && qsum[j] > qsum[j-1]; --j) + tmp = qsum[j], qsum[j] = qsum[j-1], qsum[j-1] = tmp; + a1 = qsum[0]&3; a2 = qsum[1]&3; + p[0] = bcr.p[a1*5+a1]; p[1] = bcr.p[a1*5+a2] + prior; p[2] = bcr.p[a2*5+a2]; + if ("ACGT"[a1] != toupper(rb)) p[0] += prior + 3; + if ("ACGT"[a2] != toupper(rb)) p[2] += prior + 3; + if (p[0] < p[1] && p[0] < p[2]) call = (1<<a1)<<16 | (int)((p[1]<p[2]?p[1]:p[2]) - p[0] + .499); + else if (p[2] < p[1] && p[2] < p[0]) call = (1<<a2)<<16 | (int)((p[0]<p[1]?p[0]:p[1]) - p[2] + .499); + else call = (1<<a1|1<<a2)<<16 | (int)((p[0]<p[2]?p[0]:p[2]) - p[1] + .499); + } + attr = tv->my_underline(tv); + c = ",ACMGRSVTWYHKDBN"[call>>16&0xf]; + i = (call&0xffff)/10+1; + if (i > 4) i = 4; + attr |= tv->my_colorpair(tv,i); + if (c == toupper(rb)) c = '.'; + tv->my_attron(tv,attr); + tv->my_mvaddch(tv,2, tv->ccol, c); + tv->my_attroff(tv,attr); + if(tv->ins) { + // calculate maximum insert + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + if (p->indel > 0 && max_ins < p->indel) max_ins = p->indel; + 
} + } + // core loop + for (j = 0; j <= max_ins; ++j) { + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = pl + i; + int row = TV_MIN_ALNROW + p->level - tv->row_shift; + if (j == 0) { + if (!p->is_del) { + if (tv->base_for == TV_BASE_COLOR_SPACE && + (c = bam_aux_getCSi(p->b, p->qpos))) { + // assume that if we found one color, we will be able to get the color error + if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos)) c = bam1_strand(p->b)? ',' : '.'; + } else { + if (tv->show_name) { + char *name = bam1_qname(p->b); + c = (p->qpos + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos]; + } else { + c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; + if (tv->is_dot && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.'; + } + } + } else c = p->is_refskip? (bam1_strand(p->b)? '<' : '>') : '*'; + } else { // padding + if (j > p->indel) c = '*'; + else { // insertion + if (tv->base_for == TV_BASE_NUCL) { + if (tv->show_name) { + char *name = bam1_qname(p->b); + c = (p->qpos + j + 1 >= p->b->core.l_qname)? ' ' : name[p->qpos + j]; + } else { + c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)]; + if (j == 0 && tv->is_dot && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.'; + } + } else { + c = bam_aux_getCSi(p->b, p->qpos + j); + if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos + j)) c = bam1_strand(p->b)? 
',' : '.'; + } + } + } + if (row > TV_MIN_ALNROW && row < tv->mrow) { + int x; + attr = 0; + if (((p->b->core.flag&BAM_FPAIRED) && !(p->b->core.flag&BAM_FPROPER_PAIR)) + || (p->b->core.flag & BAM_FSECONDARY)) attr |= tv->my_underline(tv); + if (tv->color_for == TV_COLOR_BASEQ) { + x = bam1_qual(p->b)[p->qpos]/10 + 1; + if (x > 4) x = 4; + attr |= tv->my_colorpair(tv,x); + } else if (tv->color_for == TV_COLOR_MAPQ) { + x = p->b->core.qual/10 + 1; + if (x > 4) x = 4; + attr |= tv->my_colorpair(tv,x); + } else if (tv->color_for == TV_COLOR_NUCL) { + x = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos)] + 5; + attr |= tv->my_colorpair(tv,x); + } else if(tv->color_for == TV_COLOR_COL) { + x = 0; + switch(bam_aux_getCSi(p->b, p->qpos)) { + case '0': x = 0; break; + case '1': x = 1; break; + case '2': x = 2; break; + case '3': x = 3; break; + case '4': x = 4; break; + default: x = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; break; + } + x+=5; + attr |= tv->my_colorpair(tv,x); + } else if(tv->color_for == TV_COLOR_COLQ) { + x = bam_aux_getCQi(p->b, p->qpos); + if(0 == x) x = bam1_qual(p->b)[p->qpos]; + x = x/10 + 1; + if (x > 4) x = 4; + attr |= tv->my_colorpair(tv,x); + } + tv->my_attron(tv,attr); + tv->my_mvaddch(tv,row, tv->ccol, bam1_strand(p->b)? tolower(c) : toupper(c)); + tv->my_attroff(tv,attr); + } + } + c = j? '*' : rb; + if (c == '*') { + attr = tv->my_colorpair(tv,8); + tv->my_attron(tv,attr); + tv->my_mvaddch(tv,1, tv->ccol++, c); + tv->my_attroff(tv,attr); + } else tv->my_mvaddch(tv,1, tv->ccol++, c); + } + tv->last_pos = pos; + return 0; +} + + + + +int tv_fetch_func(const bam1_t *b, void *data) +{ + tview_t *tv = (tview_t*)data; + if ( tv->rg_hash ) + { + const uint8_t *rg = bam_aux_get(b, "RG"); + if ( !rg ) return 0; + khiter_t k = kh_get(kh_rg, tv->rg_hash, (const char*)(rg + 1)); + if ( k == kh_end(tv->rg_hash) ) return 0; + } + if (tv->no_skip) { + uint32_t *cigar = bam1_cigar(b); // this is cheating... 
+ int i; + for (i = 0; i <b->core.n_cigar; ++i) { + if ((cigar[i]&0xf) == BAM_CREF_SKIP) + cigar[i] = cigar[i]>>4<<4 | BAM_CDEL; + } + } + bam_lplbuf_push(b, tv->lplbuf); + return 0; +} + +int base_draw_aln(tview_t *tv, int tid, int pos) + { + assert(tv!=NULL); + // reset + tv->my_clear(tv); + tv->curr_tid = tid; tv->left_pos = pos; + tv->last_pos = tv->left_pos - 1; + tv->ccol = 0; + // print ref and consensus + if (tv->fai) { + char *str; + if (tv->ref) free(tv->ref); + assert(tv->curr_tid>=0); + + str = (char*)calloc(strlen(tv->header->target_name[tv->curr_tid]) + 30, 1); + assert(str!=NULL); + sprintf(str, "%s:%d-%d", tv->header->target_name[tv->curr_tid], tv->left_pos + 1, tv->left_pos + tv->mcol); + tv->ref = fai_fetch(tv->fai, str, &tv->l_ref); + free(str); + } + // draw aln + bam_lplbuf_reset(tv->lplbuf); + bam_fetch(tv->fp, tv->idx, tv->curr_tid, tv->left_pos, tv->left_pos + tv->mcol, tv, tv_fetch_func); + bam_lplbuf_push(0, tv->lplbuf); + + while (tv->ccol < tv->mcol) { + int pos = tv->last_pos + 1; + if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", pos+1); + tv->my_mvaddch(tv,1, tv->ccol++, (tv->ref && pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N'); + ++tv->last_pos; + } + return 0; +} + + + + +static void error(const char *format, ...) 
+{ + if ( !format ) + { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bamtk tview [options] <aln.bam> [ref.fasta]\n"); + fprintf(stderr, "Options:\n"); + fprintf(stderr, " -d display output as (H)tml or (C)urses or (T)ext \n"); + fprintf(stderr, " -p chr:pos go directly to this position\n"); + fprintf(stderr, " -s STR display only reads from this sample or group\n"); + fprintf(stderr, "\n\n"); + } + else + { + va_list ap; + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); + } + exit(-1); +} + +enum dipsay_mode {display_ncurses,display_html,display_text}; +extern tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples); +extern tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *samples); +extern tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples); + +int bam_tview_main(int argc, char *argv[]) + { + int view_mode=display_ncurses; + tview_t* tv=NULL; + char *samples=NULL, *position=NULL; + int c; + while ((c = getopt(argc, argv, "s:p:d:")) >= 0) { + switch (c) { + case 's': samples=optarg; break; + case 'p': position=optarg; break; + case 'd': + { + switch(optarg[0]) + { + case 'H': case 'h': view_mode=display_html;break; + case 'T': case 't': view_mode=display_text;break; + case 'C': case 'c': view_mode=display_ncurses;break; + default: view_mode=display_ncurses;break; + } + break; + } + default: error(NULL); + } + } + if (argc==optind) error(NULL); + + switch(view_mode) + { + case display_ncurses: + { + tv = curses_tv_init(argv[optind], (optind+1>=argc)? 0 : argv[optind+1], samples); + break; + } + case display_text: + { + tv = text_tv_init(argv[optind], (optind+1>=argc)? 0 : argv[optind+1], samples); + break; + } + case display_html: + { + tv = html_tv_init(argv[optind], (optind+1>=argc)? 
0 : argv[optind+1], samples); + break; + } + } + if(tv==NULL) + { + error("cannot create view"); + return EXIT_FAILURE; + } + + if ( position ) + { + int _tid = -1, _beg, _end; + bam_parse_region(tv->header, position, &_tid, &_beg, &_end); + if (_tid >= 0) { tv->curr_tid = _tid; tv->left_pos = _beg; } + } + tv->my_drawaln(tv, tv->curr_tid, tv->left_pos); + tv->my_loop(tv); + tv->my_destroy(tv); + + return EXIT_SUCCESS; + }
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bam_tview.h Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,75 @@ +#ifndef BAM_TVIEW_H +#define BAM_TVIEW_H + +#include <ctype.h> +#include <assert.h> +#include <string.h> +#include <math.h> +#include <unistd.h> +#include <stdarg.h> +#include "bam.h" +#include "faidx.h" +#include "bam2bcf.h" +#include "sam_header.h" +#include "khash.h" + +KHASH_MAP_INIT_STR(kh_rg, const char *) + +typedef struct AbstractTview { + int mrow, mcol; + + bam_index_t *idx; + bam_lplbuf_t *lplbuf; + bam_header_t *header; + bamFile fp; + int curr_tid, left_pos; + faidx_t *fai; + bcf_callaux_t *bca; + + int ccol, last_pos, row_shift, base_for, color_for, is_dot, l_ref, ins, no_skip, show_name; + char *ref; + khash_t(kh_rg) *rg_hash; + /* callbacks */ + void (*my_destroy)(struct AbstractTview* ); + void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...); + void (*my_mvaddch)(struct AbstractTview*,int,int,int); + void (*my_attron)(struct AbstractTview*,int); + void (*my_attroff)(struct AbstractTview*,int); + void (*my_clear)(struct AbstractTview*); + int (*my_colorpair)(struct AbstractTview*,int); + int (*my_drawaln)(struct AbstractTview*,int,int); + int (*my_loop)(struct AbstractTview*); + int (*my_underline)(struct AbstractTview*); +} tview_t; + + +char bam_aux_getCEi(bam1_t *b, int i); +char bam_aux_getCSi(bam1_t *b, int i); +char bam_aux_getCQi(bam1_t *b, int i); + +#define TV_MIN_ALNROW 2 +#define TV_MAX_GOTO 40 +#define TV_LOW_MAPQ 10 + +#define TV_COLOR_MAPQ 0 +#define TV_COLOR_BASEQ 1 +#define TV_COLOR_NUCL 2 +#define TV_COLOR_COL 3 +#define TV_COLOR_COLQ 4 + +#define TV_BASE_NUCL 0 +#define TV_BASE_COLOR_SPACE 1 + +int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data); +int base_tv_init(tview_t*,const char *fn, const char *fn_fa, const char *samples); +void base_tv_destroy(tview_t*); +int base_draw_aln(tview_t *tv, int tid, int pos); + +typedef struct Tixel + { + 
int ch; + int attributes; + }tixel_t; + +#endif +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bam_tview_curses.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,297 @@ +#undef _HAVE_CURSES + +#if _CURSES_LIB == 0 +#elif _CURSES_LIB == 1 +#include <curses.h> +#ifndef NCURSES_VERSION +#warning "_CURSES_LIB=1 but NCURSES_VERSION not defined; tview is NOT compiled" +#else +#define _HAVE_CURSES +#endif +#elif _CURSES_LIB == 2 +#include <xcurses.h> +#define _HAVE_CURSES +#else +#warning "_CURSES_LIB is not 0, 1 or 2; tview is NOT compiled" +#endif + + +#include "bam_tview.h" + +#ifdef _HAVE_CURSES + + + +typedef struct CursesTview { + tview_t view; + WINDOW *wgoto, *whelp; + } curses_tview_t; + + + + +#define FROM_TV(ptr) ((curses_tview_t*)ptr) + +static void curses_destroy(tview_t* base) + { + curses_tview_t* tv=(curses_tview_t*)base; + + + delwin(tv->wgoto); delwin(tv->whelp); + endwin(); + + base_tv_destroy(base); + + free(tv); + } + +/* + void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...); + void (*my_)(struct AbstractTview*,int,int,int); + void (*my_attron)(struct AbstractTview*,int); + void (*my_attroff)(struct AbstractTview*,int); + void (*my_clear)(struct AbstractTview*); + int (*my_colorpair)(struct AbstractTview*,int); +*/ + +static void curses_mvprintw(struct AbstractTview* tv,int y ,int x,const char* fmt,...) 
+ { + unsigned int size=tv->mcol+2; + char* str=malloc(size); + if(str==0) exit(EXIT_FAILURE); + va_list argptr; + va_start(argptr, fmt); + vsnprintf(str,size, fmt, argptr); + va_end(argptr); + mvprintw(y,x,str); + free(str); + } + +static void curses_mvaddch(struct AbstractTview* tv,int y,int x,int ch) + { + mvaddch(y,x,ch); + } + +static void curses_attron(struct AbstractTview* tv,int flag) + { + attron(flag); + } +static void curses_attroff(struct AbstractTview* tv,int flag) + { + attroff(flag); + } +static void curses_clear(struct AbstractTview* tv) + { + clear(); + } + +static int curses_colorpair(struct AbstractTview* tv,int flag) + { + return COLOR_PAIR(flag); + } + +static int curses_drawaln(struct AbstractTview* tv, int tid, int pos) + { + return base_draw_aln(tv, tid, pos); + } + + + +static void tv_win_goto(curses_tview_t *tv, int *tid, int *pos) + { + char str[256], *p; + int i, l = 0; + tview_t *base=(tview_t*)tv; + wborder(tv->wgoto, '|', '|', '-', '-', '+', '+', '+', '+'); + mvwprintw(tv->wgoto, 1, 2, "Goto: "); + for (;;) { + int c = wgetch(tv->wgoto); + wrefresh(tv->wgoto); + if (c == KEY_BACKSPACE || c == '\010' || c == '\177') { + if(l > 0) --l; + } else if (c == KEY_ENTER || c == '\012' || c == '\015') { + int _tid = -1, _beg, _end; + if (str[0] == '=') { + _beg = strtol(str+1, &p, 10) - 1; + if (_beg > 0) { + *pos = _beg; + return; + } + } else { + bam_parse_region(base->header, str, &_tid, &_beg, &_end); + if (_tid >= 0) { + *tid = _tid; *pos = _beg; + return; + } + } + } else if (isgraph(c)) { + if (l < TV_MAX_GOTO) str[l++] = c; + } else if (c == '\027') l = 0; + else if (c == '\033') return; + str[l] = '\0'; + for (i = 0; i < TV_MAX_GOTO; ++i) mvwaddch(tv->wgoto, 1, 8 + i, ' '); + mvwprintw(tv->wgoto, 1, 8, "%s", str); + } +} + + + + +static void tv_win_help(curses_tview_t *tv) { + int r = 1; + tview_t* base=(tview_t*)base; + WINDOW *win = tv->whelp; + wborder(win, '|', '|', '-', '-', '+', '+', '+', '+'); + mvwprintw(win, r++, 2, " -=- Help 
-=- "); + r++; + mvwprintw(win, r++, 2, "? This window"); + mvwprintw(win, r++, 2, "Arrows Small scroll movement"); + mvwprintw(win, r++, 2, "h,j,k,l Small scroll movement"); + mvwprintw(win, r++, 2, "H,J,K,L Large scroll movement"); + mvwprintw(win, r++, 2, "ctrl-H Scroll 1k left"); + mvwprintw(win, r++, 2, "ctrl-L Scroll 1k right"); + mvwprintw(win, r++, 2, "space Scroll one screen"); + mvwprintw(win, r++, 2, "backspace Scroll back one screen"); + mvwprintw(win, r++, 2, "g Go to specific location"); + mvwprintw(win, r++, 2, "m Color for mapping qual"); + mvwprintw(win, r++, 2, "n Color for nucleotide"); + mvwprintw(win, r++, 2, "b Color for base quality"); + mvwprintw(win, r++, 2, "c Color for cs color"); + mvwprintw(win, r++, 2, "z Color for cs qual"); + mvwprintw(win, r++, 2, ". Toggle on/off dot view"); + mvwprintw(win, r++, 2, "s Toggle on/off ref skip"); + mvwprintw(win, r++, 2, "r Toggle on/off rd name"); + mvwprintw(win, r++, 2, "N Turn on nt view"); + mvwprintw(win, r++, 2, "C Turn on cs view"); + mvwprintw(win, r++, 2, "i Toggle on/off ins"); + mvwprintw(win, r++, 2, "q Exit"); + r++; + mvwprintw(win, r++, 2, "Underline: Secondary or orphan"); + mvwprintw(win, r++, 2, "Blue: 0-9 Green: 10-19"); + mvwprintw(win, r++, 2, "Yellow: 20-29 White: >=30"); + wrefresh(win); + wgetch(win); +} + +static int curses_underline(tview_t* tv) + { + return A_UNDERLINE; + } + +static int curses_loop(tview_t* tv) + { + int tid, pos; + curses_tview_t *CTV=(curses_tview_t *)tv; + tid = tv->curr_tid; pos = tv->left_pos; + while (1) { + int c = getch(); + switch (c) { + case '?': tv_win_help(CTV); break; + case '\033': + case 'q': goto end_loop; + case '/': + case 'g': tv_win_goto(CTV, &tid, &pos); break; + case 'm': tv->color_for = TV_COLOR_MAPQ; break; + case 'b': tv->color_for = TV_COLOR_BASEQ; break; + case 'n': tv->color_for = TV_COLOR_NUCL; break; + case 'c': tv->color_for = TV_COLOR_COL; break; + case 'z': tv->color_for = TV_COLOR_COLQ; break; + case 's': tv->no_skip = 
!tv->no_skip; break; + case 'r': tv->show_name = !tv->show_name; break; + case KEY_LEFT: + case 'h': --pos; break; + case KEY_RIGHT: + case 'l': ++pos; break; + case KEY_SLEFT: + case 'H': pos -= 20; break; + case KEY_SRIGHT: + case 'L': pos += 20; break; + case '.': tv->is_dot = !tv->is_dot; break; + case 'N': tv->base_for = TV_BASE_NUCL; break; + case 'C': tv->base_for = TV_BASE_COLOR_SPACE; break; + case 'i': tv->ins = !tv->ins; break; + case '\010': pos -= 1000; break; + case '\014': pos += 1000; break; + case ' ': pos += tv->mcol; break; + case KEY_UP: + case 'j': --tv->row_shift; break; + case KEY_DOWN: + case 'k': ++tv->row_shift; break; + case KEY_BACKSPACE: + case '\177': pos -= tv->mcol; break; + case KEY_RESIZE: getmaxyx(stdscr, tv->mrow, tv->mcol); break; + default: continue; + } + if (pos < 0) pos = 0; + if (tv->row_shift < 0) tv->row_shift = 0; + tv->my_drawaln(tv, tid, pos); + } +end_loop: + return 0; +} + + + + +tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples) + { + curses_tview_t *tv = (curses_tview_t*)calloc(1, sizeof(curses_tview_t)); + tview_t* base=(tview_t*)tv; + if(tv==0) + { + fprintf(stderr,"Calloc failed\n"); + return 0; + } + + base_tv_init(base,fn,fn_fa,samples); + /* initialize callbacks */ +#define SET_CALLBACK(fun) base->my_##fun=curses_##fun; + SET_CALLBACK(destroy); + SET_CALLBACK(mvprintw); + SET_CALLBACK(mvaddch); + SET_CALLBACK(attron); + SET_CALLBACK(attroff); + SET_CALLBACK(clear); + SET_CALLBACK(colorpair); + SET_CALLBACK(drawaln); + SET_CALLBACK(loop); + SET_CALLBACK(underline); +#undef SET_CALLBACK + + initscr(); + keypad(stdscr, TRUE); + clear(); + noecho(); + cbreak(); + + getmaxyx(stdscr, base->mrow, base->mcol); + tv->wgoto = newwin(3, TV_MAX_GOTO + 10, 10, 5); + tv->whelp = newwin(29, 40, 5, 5); + + start_color(); + init_pair(1, COLOR_BLUE, COLOR_BLACK); + init_pair(2, COLOR_GREEN, COLOR_BLACK); + init_pair(3, COLOR_YELLOW, COLOR_BLACK); + init_pair(4, COLOR_WHITE, COLOR_BLACK); + 
init_pair(5, COLOR_GREEN, COLOR_BLACK); + init_pair(6, COLOR_CYAN, COLOR_BLACK); + init_pair(7, COLOR_YELLOW, COLOR_BLACK); + init_pair(8, COLOR_RED, COLOR_BLACK); + init_pair(9, COLOR_BLUE, COLOR_BLACK); + return base; + } + + +#else // #ifdef _HAVE_CURSES +#include <stdio.h> +#warning "No curses library is available; tview with curses is disabled." + +extern tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples); + +tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples) + { + return text_tv_init(fn,fn_fa,samples); + } +#endif // #ifdef _HAVE_CURSES + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bam_tview_html.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,349 @@ +#include <unistd.h> +#include "bam_tview.h" + +#define UNDERLINE_FLAG 10 + +typedef struct HtmlTview { + tview_t view; + int row_count; + tixel_t** screen; + FILE* out; + int attributes;/* color... */ + } html_tview_t; + +#define FROM_TV(ptr) ((html_tview_t*)ptr) + +static void html_destroy(tview_t* base) + { + int i; + html_tview_t* tv=(html_tview_t*)base; + if(tv->screen!=NULL) + { + for(i=0;i< tv->row_count;++i) free(tv->screen[i]); + free(tv->screen); + } + base_tv_destroy(base); + free(tv); + } + +/* + void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...); + void (*my_)(struct AbstractTview*,int,int,int); + void (*my_attron)(struct AbstractTview*,int); + void (*my_attroff)(struct AbstractTview*,int); + void (*my_clear)(struct AbstractTview*); + int (*my_colorpair)(struct AbstractTview*,int); +*/ + +static void html_mvprintw(struct AbstractTview* tv,int y ,int x,const char* fmt,...) 
+ { + int i,nchars=0; + unsigned int size=tv->mcol+2; + char* str=malloc(size); + if(str==0) exit(EXIT_FAILURE); + va_list argptr; + va_start(argptr, fmt); + nchars=vsnprintf(str,size, fmt, argptr); + va_end(argptr); + + for(i=0;i< nchars;++i) + { + tv->my_mvaddch(tv,y,x+i,str[i]); + } + free(str); + } + +static void html_mvaddch(struct AbstractTview* tv,int y,int x,int ch) + { + tixel_t* row=NULL; + html_tview_t* ptr=FROM_TV(tv); + if( x >= tv->mcol ) return; //out of screen + while(ptr->row_count<=y) + { + int x; + row=(tixel_t*)calloc(tv->mcol,sizeof(tixel_t)); + if(row==0) exit(EXIT_FAILURE); + for(x=0;x<tv->mcol;++x) {row[x].ch=' ';row[x].attributes=0;} + ptr->screen=(tixel_t**)realloc(ptr->screen,sizeof(tixel_t*)*(ptr->row_count+1)); + ptr->screen[ptr->row_count++]=row; + } + row=ptr->screen[y]; + row[x].ch=ch; + row[x].attributes=ptr->attributes; + } + +static void html_attron(struct AbstractTview* tv,int flag) + { + html_tview_t* ptr=FROM_TV(tv); + ptr->attributes |= flag; + + + } + +static void html_attroff(struct AbstractTview* tv,int flag) + { + html_tview_t* ptr=FROM_TV(tv); + ptr->attributes &= ~(flag); + } + +static void html_clear(struct AbstractTview* tv) + { + html_tview_t* ptr=FROM_TV(tv); + if(ptr->screen!=NULL) + { + int i; + for(i=0;i< ptr->row_count;++i) free(ptr->screen[i]); + free(ptr->screen); + ptr->screen=NULL; + } + ptr->row_count=0; + ptr->attributes=0; + } + +static int html_colorpair(struct AbstractTview* tv,int flag) + { + return (1 << (flag)); + } + +static int html_drawaln(struct AbstractTview* tv, int tid, int pos) + { + int y,x; + html_tview_t* ptr=FROM_TV(tv); + html_clear(tv); + base_draw_aln(tv, tid, pos); + fputs("<html><head>",ptr->out); + fprintf(ptr->out,"<title>%s:%d</title>", + tv->header->target_name[tid], + pos+1 + ); + //style + + fputs("<style type='text/css'>\n",ptr->out); + fputs(".tviewbody { margin:5px; background-color:white;text-align:center;}\n",ptr->out); + fputs(".tviewtitle 
{text-align:center;}\n",ptr->out); + fputs(".tviewpre { margin:5px; background-color:white;}\n",ptr->out); + #define CSS(id,col) fprintf(ptr->out,".tviewc%d {color:%s;}\n.tviewcu%d {color:%s;text-decoration:underline;}\n",id,col,id,col); + CSS(0, "black"); + CSS(1, "blue"); + CSS(2, "green"); + CSS(3, "yellow"); + CSS(4, "black"); + CSS(5, "green"); + CSS(6, "cyan"); + CSS(7, "yellow"); + CSS(8, "red"); + CSS(9, "blue"); + #undef CSS + fputs("</style>",ptr->out); + + fputs("</head><body>",ptr->out); + + fprintf(ptr->out,"<div class='tviewbody'><div class='tviewtitle'>%s:%d</div>", + tv->header->target_name[tid], + pos+1 + ); + + fputs("<pre class='tviewpre'>",ptr->out); + for(y=0;y< ptr->row_count;++y) + { + + for(x=0;x< tv->mcol;++x) + { + + + if(x== 0 || ptr->screen[y][x].attributes != ptr->screen[y][x-1].attributes) + { + int css=0; + fprintf(ptr->out,"<span"); + while(css<32) + { + //if(y>1) fprintf(stderr,"css=%d pow2=%d vs %d\n",css,(1 << (css)),ptr->screen[y][x].attributes); + if(( (ptr->screen[y][x].attributes) & (1 << (css)))!=0) + { + + fprintf(ptr->out," class='tviewc%s%d'", + (( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)) )!=0?"u":""), + css); + break; + } + ++css; + } + + + fputs(">",ptr->out); + } + + int ch=ptr->screen[y][x].ch; + switch(ch) + { + case '<': fputs("<",ptr->out);break; + case '>': fputs(">",ptr->out);break; + case '&': fputs("&",ptr->out);break; + default: fputc(ch,ptr->out); break; + } + + + if(x+1 == tv->mcol || ptr->screen[y][x].attributes!=ptr->screen[y][x+1].attributes) + { + fputs("</span>",ptr->out); + } + } + if(y+1 < ptr->row_count) fputs("<br/>",ptr->out); + } + fputs("</pre></div></body></html>",ptr->out); + return 0; + } + + +#define ANSI_COLOR_RED "\x1b[31m" +#define ANSI_COLOR_GREEN "\x1b[32m" +#define ANSI_COLOR_YELLOW "\x1b[33m" +#define ANSI_COLOR_BLUE "\x1b[34m" +#define ANSI_COLOR_MAGENTA "\x1b[35m" +#define ANSI_COLOR_CYAN "\x1b[36m" +#define ANSI_COLOR_BLACK "\x1b[0m" +#define ANSI_COLOR_RESET 
ANSI_COLOR_BLACK + +#define ANSI_UNDERLINE_SET "\033[4m" +#define ANSI_UNDERLINE_UNSET "\033[0m" + +static int text_drawaln(struct AbstractTview* tv, int tid, int pos) + { + int y,x; + html_tview_t* ptr=FROM_TV(tv); + html_clear(tv); + base_draw_aln(tv, tid, pos); + int is_term= isatty(fileno(ptr->out)); + + for(y=0;y< ptr->row_count;++y) + { + for(x=0;x< tv->mcol;++x) + { + if(is_term) + { + int css=0; + while(css<32) + { + if(( (ptr->screen[y][x].attributes) & (1 << (css)))!=0) + { + break; + } + ++css; + } + switch(css) + { + //CSS(0, "black"); + case 1: fputs(ANSI_COLOR_BLUE,ptr->out); break; + case 2: fputs(ANSI_COLOR_GREEN,ptr->out); break; + case 3: fputs(ANSI_COLOR_YELLOW,ptr->out); break; + //CSS(4, "black"); + case 5: fputs(ANSI_COLOR_GREEN,ptr->out); break; + case 6: fputs(ANSI_COLOR_CYAN,ptr->out); break; + case 7: fputs(ANSI_COLOR_YELLOW,ptr->out); break; + case 8: fputs(ANSI_COLOR_RED,ptr->out); break; + case 9: fputs(ANSI_COLOR_BLUE,ptr->out); break; + default:break; + } + if(( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)))!=0) + { + fputs(ANSI_UNDERLINE_SET,ptr->out); + } + + } + + + int ch=ptr->screen[y][x].ch; + + fputc(ch,ptr->out); + if(is_term) + { + fputs(ANSI_COLOR_RESET,ptr->out); + if(( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)))!=0) + { + fputs(ANSI_UNDERLINE_UNSET,ptr->out); + } + } + } + fputc('\n',ptr->out); + } + return 0; + } + + +static int html_loop(tview_t* tv) + { + //tv->my_drawaln(tv, tv->curr_tid, tv->left_pos); + return 0; + } + +static int html_underline(tview_t* tv) + { + return (1 << UNDERLINE_FLAG); + } + +/* +static void init_pair(html_tview_t *tv,int id_ge_1, const char* pen, const char* paper) + { + + } +*/ + +tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *samples) + { + char* colstr=getenv("COLUMNS"); + html_tview_t *tv = (html_tview_t*)calloc(1, sizeof(html_tview_t)); + tview_t* base=(tview_t*)tv; + if(tv==0) + { + fprintf(stderr,"Calloc failed\n"); + return 0; + } + 
tv->row_count=0; + tv->screen=NULL; + tv->out=stdout; + tv->attributes=0; + base_tv_init(base,fn,fn_fa,samples); + /* initialize callbacks */ +#define SET_CALLBACK(fun) base->my_##fun=html_##fun; + SET_CALLBACK(destroy); + SET_CALLBACK(mvprintw); + SET_CALLBACK(mvaddch); + SET_CALLBACK(attron); + SET_CALLBACK(attroff); + SET_CALLBACK(clear); + SET_CALLBACK(colorpair); + SET_CALLBACK(drawaln); + SET_CALLBACK(loop); + SET_CALLBACK(underline); +#undef SET_CALLBACK + + + if(colstr!=0) + { + base->mcol=atoi(colstr); + if(base->mcol<10) base->mcol=80; + } + base->mrow=99999; + +/* + init_pair(tv,1, "blue", "white"); + init_pair(tv,2, "green", "white"); + init_pair(tv,3, "yellow", "white"); + init_pair(tv,4, "white", "white"); + init_pair(tv,5, "green", "white"); + init_pair(tv,6, "cyan", "white"); + init_pair(tv,7, "yellow", "white"); + init_pair(tv,8, "red", "white"); + init_pair(tv,9, "blue", "white"); + */ + return base; + } + + +tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples) + { + tview_t* tv=html_tv_init(fn,fn_fa,samples); + tv->my_drawaln=text_drawaln; + return tv; + } +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bamshuf.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,141 @@ +#include <unistd.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> +#include "sam.h" +#include "ksort.h" + +#define DEF_CLEVEL 1 + +static inline unsigned hash_Wang(unsigned key) +{ + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} + +static inline unsigned hash_X31_Wang(const char *s) +{ + unsigned h = *s; + if (h) { + for (++s ; *s; ++s) h = (h << 5) - h + *s; + return hash_Wang(h); + } else return 0; +} + +typedef struct { + unsigned key; + bam1_t *b; +} elem_t; + +static inline int elem_lt(elem_t x, elem_t y) +{ + if (x.key < y.key) return 1; + if (x.key == y.key) { + int t; + t = strcmp(bam_get_qname(x.b), bam_get_qname(y.b)); + if (t < 0) return 1; + return (t == 0 && ((x.b->core.flag>>6&3) < (y.b->core.flag>>6&3))); + } else return 0; +} + +KSORT_INIT(bamshuf, elem_t, elem_lt) + +static void bamshuf(const char *fn, int n_files, const char *pre, int clevel, int is_stdout) +{ + BGZF *fp, *fpw, **fpt; + char **fnt, modew[8]; + bam1_t *b; + int i, l; + bam_hdr_t *h; + int64_t *cnt; + + // split + fp = strcmp(fn, "-")? 
bgzf_open(fn, "r") : bgzf_dopen(fileno(stdin), "r"); + assert(fp); + h = bam_hdr_read(fp); + fnt = (char**)calloc(n_files, sizeof(void*)); + fpt = (BGZF**)calloc(n_files, sizeof(void*)); + cnt = (int64_t*)calloc(n_files, 8); + l = strlen(pre); + for (i = 0; i < n_files; ++i) { + fnt[i] = (char*)calloc(l + 10, 1); + sprintf(fnt[i], "%s.%.4d.bam", pre, i); + fpt[i] = bgzf_open(fnt[i], "w1"); + bam_hdr_write(fpt[i], h); + } + b = bam_init1(); + while (bam_read1(fp, b) >= 0) { + uint32_t x; + x = hash_X31_Wang(bam_get_qname(b)) % n_files; + bam_write1(fpt[x], b); + ++cnt[x]; + } + bam_destroy1(b); + for (i = 0; i < n_files; ++i) bgzf_close(fpt[i]); + free(fpt); + bgzf_close(fp); + // merge + sprintf(modew, "w%d", (clevel >= 0 && clevel <= 9)? clevel : DEF_CLEVEL); + if (!is_stdout) { // output to a file + char *fnw = (char*)calloc(l + 5, 1); + sprintf(fnw, "%s.bam", pre); + fpw = bgzf_open(fnw, modew); + free(fnw); + } else fpw = bgzf_dopen(fileno(stdout), modew); // output to stdout + bam_hdr_write(fpw, h); + bam_hdr_destroy(h); + for (i = 0; i < n_files; ++i) { + int64_t j, c = cnt[i]; + elem_t *a; + fp = bgzf_open(fnt[i], "r"); + bam_hdr_destroy(bam_hdr_read(fp)); + a = (elem_t*)calloc(c, sizeof(elem_t)); + for (j = 0; j < c; ++j) { + a[j].b = bam_init1(); + assert(bam_read1(fp, a[j].b) >= 0); + a[j].key = hash_X31_Wang(bam_get_qname(a[j].b)); + } + bgzf_close(fp); + unlink(fnt[i]); + free(fnt[i]); + ks_introsort(bamshuf, c, a); + for (j = 0; j < c; ++j) { + bam_write1(fpw, a[j].b); + bam_destroy1(a[j].b); + } + free(a); + } + bgzf_close(fpw); + free(fnt); free(cnt); +} + +int main_bamshuf(int argc, char *argv[]) +{ + int c, n_files = 64, clevel = DEF_CLEVEL, is_stdout = 0, is_un = 0; + while ((c = getopt(argc, argv, "n:l:uO")) >= 0) { + switch (c) { + case 'n': n_files = atoi(optarg); break; + case 'l': clevel = atoi(optarg); break; + case 'u': is_un = 1; break; + case 'O': is_stdout = 1; break; + } + } + if (is_un) clevel = 0; + if (optind + 2 > argc) { + 
fprintf(stderr, "\nUsage: bamshuf [-Ou] [-n nFiles] [-c cLevel] <in.bam> <out.prefix>\n\n"); + fprintf(stderr, "Options: -O output to stdout\n"); + fprintf(stderr, " -u uncompressed BAM output\n"); + fprintf(stderr, " -l INT compression level [%d]\n", DEF_CLEVEL); + fprintf(stderr, " -n INT number of temporary files [%d]\n", n_files); + fprintf(stderr, "\n"); + return 1; + } + bamshuf(argv[optind], n_files, argv[optind+1], clevel, is_stdout); + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bamtk.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,119 @@ +#include <stdio.h> +#include <unistd.h> +#include <assert.h> +#include <fcntl.h> +#include "bam.h" + +#ifdef _USE_KNETFILE +#include "knetfile.h" +#endif + +int bam_taf2baf(int argc, char *argv[]); +int bam_mpileup(int argc, char *argv[]); +int bam_merge(int argc, char *argv[]); +int bam_index(int argc, char *argv[]); +int bam_sort(int argc, char *argv[]); +int bam_tview_main(int argc, char *argv[]); +int bam_mating(int argc, char *argv[]); +int bam_rmdup(int argc, char *argv[]); +int bam_flagstat(int argc, char *argv[]); +int bam_fillmd(int argc, char *argv[]); +int bam_idxstats(int argc, char *argv[]); +int main_samview(int argc, char *argv[]); +int main_import(int argc, char *argv[]); +int main_reheader(int argc, char *argv[]); +int main_cut_target(int argc, char *argv[]); +int main_phase(int argc, char *argv[]); +int main_cat(int argc, char *argv[]); +int main_depth(int argc, char *argv[]); +int main_bam2fq(int argc, char *argv[]); +int main_pad2unpad(int argc, char *argv[]); +int main_bedcov(int argc, char *argv[]); +int main_bamshuf(int argc, char *argv[]); + +int faidx_main(int argc, char *argv[]); + +static int usage() +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Program: samtools (Tools for alignments in the SAM format)\n"); + fprintf(stderr, "Version: %s\n\n", BAM_VERSION); + fprintf(stderr, "Usage: samtools <command> [options]\n\n"); + fprintf(stderr, "Command: view SAM<->BAM conversion\n"); + fprintf(stderr, " sort sort alignment file\n"); + fprintf(stderr, " mpileup multi-way pileup\n"); + fprintf(stderr, " depth compute the depth\n"); + fprintf(stderr, " faidx index/extract FASTA\n"); +#if _CURSES_LIB != 0 + fprintf(stderr, " tview text alignment viewer\n"); +#endif + fprintf(stderr, " index index alignment\n"); + fprintf(stderr, " idxstats BAM index stats (r595 or later)\n"); + fprintf(stderr, " fixmate fix 
mate information\n"); + fprintf(stderr, " flagstat simple stats\n"); + fprintf(stderr, " calmd recalculate MD/NM tags and '=' bases\n"); + fprintf(stderr, " merge merge sorted alignments\n"); + fprintf(stderr, " rmdup remove PCR duplicates\n"); + fprintf(stderr, " reheader replace BAM header\n"); + fprintf(stderr, " cat concatenate BAMs\n"); + fprintf(stderr, " bedcov read depth per BED region\n"); + fprintf(stderr, " targetcut cut fosmid regions (for fosmid pool only)\n"); + fprintf(stderr, " phase phase heterozygotes\n"); + fprintf(stderr, " bamshuf shuffle and group alignments by name\n"); +// fprintf(stderr, " depad convert padded BAM to unpadded BAM\n"); // not stable + fprintf(stderr, "\n"); +#ifdef _WIN32 + fprintf(stderr, "\ +Note: The Windows version of SAMtools is mainly designed for read-only\n\ + operations, such as viewing the alignments and generating the pileup.\n\ + Binary files generated by the Windows version may be buggy.\n\n"); +#endif + return 1; +} + +int main(int argc, char *argv[]) +{ +#ifdef _WIN32 + setmode(fileno(stdout), O_BINARY); + setmode(fileno(stdin), O_BINARY); +#ifdef _USE_KNETFILE + knet_win32_init(); +#endif +#endif + if (argc < 2) return usage(); + if (strcmp(argv[1], "view") == 0) return main_samview(argc-1, argv+1); + else if (strcmp(argv[1], "import") == 0) return main_import(argc-1, argv+1); + else if (strcmp(argv[1], "mpileup") == 0) return bam_mpileup(argc-1, argv+1); + else if (strcmp(argv[1], "merge") == 0) return bam_merge(argc-1, argv+1); + else if (strcmp(argv[1], "sort") == 0) return bam_sort(argc-1, argv+1); + else if (strcmp(argv[1], "index") == 0) return bam_index(argc-1, argv+1); + else if (strcmp(argv[1], "idxstats") == 0) return bam_idxstats(argc-1, argv+1); + else if (strcmp(argv[1], "faidx") == 0) return faidx_main(argc-1, argv+1); + else if (strcmp(argv[1], "fixmate") == 0) return bam_mating(argc-1, argv+1); + else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1); + else if 
(strcmp(argv[1], "flagstat") == 0) return bam_flagstat(argc-1, argv+1); + else if (strcmp(argv[1], "calmd") == 0) return bam_fillmd(argc-1, argv+1); + else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1); + else if (strcmp(argv[1], "reheader") == 0) return main_reheader(argc-1, argv+1); + else if (strcmp(argv[1], "cat") == 0) return main_cat(argc-1, argv+1); + else if (strcmp(argv[1], "targetcut") == 0) return main_cut_target(argc-1, argv+1); + else if (strcmp(argv[1], "phase") == 0) return main_phase(argc-1, argv+1); + else if (strcmp(argv[1], "depth") == 0) return main_depth(argc-1, argv+1); + else if (strcmp(argv[1], "bam2fq") == 0) return main_bam2fq(argc-1, argv+1); + else if (strcmp(argv[1], "pad2unpad") == 0) return main_pad2unpad(argc-1, argv+1); + else if (strcmp(argv[1], "depad") == 0) return main_pad2unpad(argc-1, argv+1); + else if (strcmp(argv[1], "bedcov") == 0) return main_bedcov(argc-1, argv+1); + else if (strcmp(argv[1], "bamshuf") == 0) return main_bamshuf(argc-1, argv+1); + else if (strcmp(argv[1], "pileup") == 0) { + fprintf(stderr, "[main] The `pileup' command has been removed. Please use `mpileup' instead.\n"); + return 1; + } +#if _CURSES_LIB != 0 + else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1); +#endif + else { + fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); + return 1; + } + return 0; +}
# File: PsiCLASS-1.0.2/samtools-0.1.19/bcftools/Makefile
#
# Builds the static library libbcf.a and the bcftools executable.
# The first regular target is the *-recur rule, so a plain "make" runs
# "all-recur", which in turn runs "make all" in every directory of SUBDIRS.

CC=			gcc
CFLAGS=		-g -Wall -O2 #-m64 #-arch ppc
DFLAGS=		-D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE
LOBJS=		bcf.o vcf.o bcfutils.o prob1.o em.o kfunc.o kmin.o index.o fet.o mut.o bcf2qcall.o
OMISC=		..
AOBJS=		call1.o main.o $(OMISC)/kstring.o $(OMISC)/bgzf.o $(OMISC)/knetfile.o $(OMISC)/bedidx.o
PROG=		bcftools
INCLUDES=
SUBDIRS=	.

# None of these names is a file produced by its rule.
.PHONY: all lib clean cleanlocal \
	all-recur lib-recur clean-recur cleanlocal-recur install-recur

.SUFFIXES:.c .o

.c.o:
	$(CC) -c $(CFLAGS) $(DFLAGS) -I.. $(INCLUDES) $< -o $@

# "<target>-recur" runs "<target>" in every directory listed in SUBDIRS and
# stops at the first directory that cannot be entered or fails to build.
all-recur lib-recur clean-recur cleanlocal-recur install-recur:
	@target=`echo $@ | sed s/-recur//`; \
	wdir=`pwd`; \
	list='$(SUBDIRS)'; for subdir in $$list; do \
		cd $$subdir || exit 1; \
		$(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \
			INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \
		cd $$wdir; \
	done;

all:$(PROG)

lib:libbcf.a

libbcf.a:$(LOBJS)
	$(AR) -csru $@ $(LOBJS)

# Depend on the archive itself rather than on the phony "lib" target; a
# prerequisite that never exists as a file would force a relink on every run.
bcftools:libbcf.a $(AOBJS)
	$(CC) $(CFLAGS) -o $@ $(AOBJS) -L. $(LIBPATH) -lbcf -lm -lz -lpthread

bcf.o:bcf.h
vcf.o:bcf.h
index.o:bcf.h
bcfutils.o:bcf.h
prob1.o:prob1.h bcf.h
call1.o:prob1.h bcf.h
bcf2qcall.o:bcf.h
main.o:bcf.h

bcf.pdf:bcf.tex
	pdflatex bcf

cleanlocal:
	rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a bcf.aux bcf.log bcf.pdf *.class libbcf.*.dylib libbcf.so*

clean:cleanlocal-recur
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bcftools/README Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,36 @@ +The view command of bcftools calls variants, tests Hardy-Weinberg +equilibrium (HWE), tests allele balances and estimates allele frequency. + +This command calls a site as a potential variant if P(ref|D,F) is below +0.9 (controlled by the -p option), where D is data and F is the prior +allele frequency spectrum (AFS). + +The view command performs two types of allele balance tests, both based +on Fisher's exact test for 2x2 contingency tables with the row variable +being reference allele or not. In the first table, the column variable +is strand. Two-tail P-value is taken. We test if variant bases tend to +come from one strand. In the second table, the column variable is +whether a base appears in the first or the last 11bp of the read. +One-tail P-value is taken. We test if variant bases tend to occur +towards the end of reads, which is usually an indication of +misalignment. + +Site allele frequency is estimated in two ways. In the first way, the +frequency is estimated as \argmax_f P(D|f) under the assumption of +HWE. Prior AFS is not used. In the second way, the frequency is +estimated as the posterior expectation of allele counts \sum_k +kP(k|D,F), divided by the total number of haplotypes. HWE is not +assumed, but the estimate depends on the prior AFS. The two estimates +largely agree when the signal is strong, but may differ greatly on weak +sites as in this case, the prior plays an important role. + +To test HWE, we calculate the posterior distribution of genotypes +(ref-hom, het and alt-hom). Chi-square test is performed. It is worth +noting that the model used here is prior dependent and assumes HWE, +which is different from both models for allele frequency estimate. The +new model actually yields a third estimate of site allele frequency. 
+ +The estimated allele frequency spectrum is printed to stderr per 64k +sites. The estimate is in fact only the first round of an EM +procedure. The second model (not the model for HWE testing) is used to +estimate the AFS. \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bcftools/bcf.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,396 @@ +#include <string.h> +#include <ctype.h> +#include <stdio.h> +#include "kstring.h" +#include "bcf.h" + +bcf_t *bcf_open(const char *fn, const char *mode) +{ + bcf_t *b; + b = calloc(1, sizeof(bcf_t)); + if (strchr(mode, 'w')) { + b->fp = strcmp(fn, "-")? bgzf_open(fn, mode) : bgzf_fdopen(fileno(stdout), mode); + } else { + b->fp = strcmp(fn, "-")? bgzf_open(fn, mode) : bgzf_fdopen(fileno(stdin), mode); + } + return b; +} + +int bcf_close(bcf_t *b) +{ + int ret; + if (b == 0) return 0; + ret = bgzf_close(b->fp); + free(b); + return ret; +} + +int bcf_hdr_write(bcf_t *b, const bcf_hdr_t *h) +{ + if (b == 0 || h == 0) return -1; + bgzf_write(b->fp, "BCF\4", 4); + bgzf_write(b->fp, &h->l_nm, 4); + bgzf_write(b->fp, h->name, h->l_nm); + bgzf_write(b->fp, &h->l_smpl, 4); + bgzf_write(b->fp, h->sname, h->l_smpl); + bgzf_write(b->fp, &h->l_txt, 4); + bgzf_write(b->fp, h->txt, h->l_txt); + bgzf_flush(b->fp); + return 16 + h->l_nm + h->l_smpl + h->l_txt; +} + +bcf_hdr_t *bcf_hdr_read(bcf_t *b) +{ + uint8_t magic[4]; + bcf_hdr_t *h; + if (b == 0) return 0; + h = calloc(1, sizeof(bcf_hdr_t)); + bgzf_read(b->fp, magic, 4); + bgzf_read(b->fp, &h->l_nm, 4); + h->name = malloc(h->l_nm); + bgzf_read(b->fp, h->name, h->l_nm); + bgzf_read(b->fp, &h->l_smpl, 4); + h->sname = malloc(h->l_smpl); + bgzf_read(b->fp, h->sname, h->l_smpl); + bgzf_read(b->fp, &h->l_txt, 4); + h->txt = malloc(h->l_txt); + bgzf_read(b->fp, h->txt, h->l_txt); + bcf_hdr_sync(h); + return h; +} + +void bcf_hdr_destroy(bcf_hdr_t *h) +{ + if (h == 0) return; + free(h->name); free(h->sname); free(h->txt); free(h->ns); free(h->sns); + free(h); +} + +static inline char **cnt_null(int l, char *str, int *_n) +{ + int n = 0; + char *p, **list; + *_n = 0; + if (l == 0 || str == 0) return 0; + for (p = str; p != str + l; ++p) + if (*p == 0) ++n; + *_n = n; + list = calloc(n, 
sizeof(void*)); + list[0] = str; + for (p = str, n = 1; p < str + l - 1; ++p) + if (*p == 0) list[n++] = p + 1; + return list; +} + +int bcf_hdr_sync(bcf_hdr_t *b) +{ + if (b == 0) return -1; + if (b->ns) free(b->ns); + if (b->sns) free(b->sns); + if (b->l_nm) b->ns = cnt_null(b->l_nm, b->name, &b->n_ref); + else b->ns = 0, b->n_ref = 0; + b->sns = cnt_null(b->l_smpl, b->sname, &b->n_smpl); + return 0; +} + +int bcf_sync(bcf1_t *b) +{ + char *p, *tmp[5]; + int i, n, n_smpl = b->n_smpl; + ks_tokaux_t aux; + // set ref, alt, flt, info, fmt + b->ref = b->alt = b->flt = b->info = b->fmt = 0; + for (p = b->str, n = 0; p < b->str + b->l_str; ++p) { + if (*p == 0 && p+1 != b->str + b->l_str) { + if (n == 5) { + ++n; + break; + } else tmp[n++] = p + 1; + } + } + if (n != 5) { + fprintf(stderr, "[%s] incorrect number of fields (%d != 5) at %d:%d\n", __func__, n, b->tid, b->pos); + return -1; + } + b->ref = tmp[0]; b->alt = tmp[1]; b->flt = tmp[2]; b->info = tmp[3]; b->fmt = tmp[4]; + // set n_alleles + if (*b->alt == 0) b->n_alleles = 1; + else { + for (p = b->alt, n = 1; *p; ++p) + if (*p == ',') ++n; + b->n_alleles = n + 1; + } + // set n_gi and gi[i].fmt + for (p = b->fmt, n = 1; *p; ++p) + if (*p == ':') ++n; + if (n > b->m_gi) { + int old_m = b->m_gi; + b->m_gi = n; + kroundup32(b->m_gi); + b->gi = realloc(b->gi, b->m_gi * sizeof(bcf_ginfo_t)); + memset(b->gi + old_m, 0, (b->m_gi - old_m) * sizeof(bcf_ginfo_t)); + } + b->n_gi = n; + for (p = kstrtok(b->fmt, ":", &aux), n = 0; p; p = kstrtok(0, 0, &aux)) + b->gi[n++].fmt = bcf_str2int(p, aux.p - p); + // set gi[i].len + for (i = 0; i < b->n_gi; ++i) { + if (b->gi[i].fmt == bcf_str2int("PL", 2)) { + b->gi[i].len = b->n_alleles * (b->n_alleles + 1) / 2; + } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("HQ", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) { + b->gi[i].len = 2; + } else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("GT", 2)) { + b->gi[i].len = 1; + } 
else if (b->gi[i].fmt == bcf_str2int("SP", 2)) { + b->gi[i].len = 4; + } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) { + b->gi[i].len = b->n_alleles * (b->n_alleles + 1) / 2 * 4; + } + b->gi[i].data = realloc(b->gi[i].data, n_smpl * b->gi[i].len); + } + return 0; +} + +int bcf_write(bcf_t *bp, const bcf_hdr_t *h, const bcf1_t *b) +{ + int i, l = 0; + if (b == 0) return -1; + bgzf_write(bp->fp, &b->tid, 4); + bgzf_write(bp->fp, &b->pos, 4); + bgzf_write(bp->fp, &b->qual, 4); + bgzf_write(bp->fp, &b->l_str, 4); + bgzf_write(bp->fp, b->str, b->l_str); + l = 12 + b->l_str; + for (i = 0; i < b->n_gi; ++i) { + bgzf_write(bp->fp, b->gi[i].data, b->gi[i].len * h->n_smpl); + l += b->gi[i].len * h->n_smpl; + } + return l; +} + +int bcf_read(bcf_t *bp, const bcf_hdr_t *h, bcf1_t *b) +{ + int i, l = 0; + if (b == 0) return -1; + if (bgzf_read(bp->fp, &b->tid, 4) == 0) return -1; + b->n_smpl = h->n_smpl; + bgzf_read(bp->fp, &b->pos, 4); + bgzf_read(bp->fp, &b->qual, 4); + bgzf_read(bp->fp, &b->l_str, 4); + if (b->l_str > b->m_str) { + b->m_str = b->l_str; + kroundup32(b->m_str); + b->str = realloc(b->str, b->m_str); + } + bgzf_read(bp->fp, b->str, b->l_str); + l = 12 + b->l_str; + if (bcf_sync(b) < 0) return -2; + for (i = 0; i < b->n_gi; ++i) { + bgzf_read(bp->fp, b->gi[i].data, b->gi[i].len * h->n_smpl); + l += b->gi[i].len * h->n_smpl; + } + return l; +} + +int bcf_destroy(bcf1_t *b) +{ + int i; + if (b == 0) return -1; + free(b->str); + for (i = 0; i < b->m_gi; ++i) + free(b->gi[i].data); + free(b->gi); + free(b); + return 0; +} + +static inline void fmt_str(const char *p, kstring_t *s) +{ + if (*p == 0) kputc('.', s); + else kputs(p, s); +} + +void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s) +{ + int i, j, x; + s->l = 0; + if (h->n_ref) kputs(h->ns[b->tid], s); + else kputw(b->tid, s); + kputc('\t', s); + kputw(b->pos + 1, s); kputc('\t', s); + fmt_str(b->str, s); kputc('\t', s); + fmt_str(b->ref, s); kputc('\t', s); + fmt_str(b->alt, s); kputc('\t', s); + 
ksprintf(s, "%.3g", b->qual); kputc('\t', s); + fmt_str(b->flt, s); kputc('\t', s); + fmt_str(b->info, s); + if (b->fmt[0]) { + kputc('\t', s); + fmt_str(b->fmt, s); + } + x = b->n_alleles * (b->n_alleles + 1) / 2; + if (b->n_gi == 0) return; + int iPL = -1; + if ( b->n_alleles > 2 ) { + for (i=0; i<b->n_gi; i++) { + if ( b->gi[i].fmt == bcf_str2int("PL", 2) ) { + iPL = i; + break; + } + } + } + for (j = 0; j < h->n_smpl; ++j) { + int ploidy = b->ploidy ? b->ploidy[j] : 2; + kputc('\t', s); + for (i = 0; i < b->n_gi; ++i) { + if (i) kputc(':', s); + if (b->gi[i].fmt == bcf_str2int("PL", 2)) { + uint8_t *d = (uint8_t*)b->gi[i].data + j * x; + int k; + if ( ploidy==1 ) + for (k=0; k<b->n_alleles; k++) + { + if (k>0) kputc(',', s); + kputw(d[(k+1)*(k+2)/2-1], s); + } + else + for (k = 0; k < x; ++k) { + if (k > 0) kputc(',', s); + kputw(d[k], s); + } + } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) { + kputw(((uint16_t*)b->gi[i].data)[j], s); + } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) { + kputw(((uint8_t*)b->gi[i].data)[j], s); + } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) { + kputw(((int32_t*)b->gi[i].data)[j], s); + } else if (b->gi[i].fmt == bcf_str2int("GT", 2)) { + int y = ((uint8_t*)b->gi[i].data)[j]; + if ( ploidy==1 ) + { + if ( y>>7&1 ) + kputc('.', s); + else + kputc('0' + (y>>3&7), s); + } + else + { + if ( y>>7&1 ) + kputsn("./.", 3, s); + else { + kputc('0' + (y>>3&7), s); + kputc("/|"[y>>6&1], s); + kputc('0' + (y&7), s); + } + } + } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) { + float *d = (float*)b->gi[i].data + j * x; + int k; + //printf("- %lx\n", d); + for (k = 0; k < x; ++k) { + if (k > 0) kputc(',', s); + ksprintf(s, "%.2f", d[k]); + } + } else kputc('.', s); // custom fields + } + } +} + +char *bcf_fmt(const bcf_hdr_t *h, bcf1_t *b) +{ + kstring_t s; + s.l = s.m = 0; s.s = 0; + bcf_fmt_core(h, b, &s); + return s.s; +} + +int bcf_append_info(bcf1_t *b, const char *info, int l) +{ + int 
shift = b->fmt - b->str; + int l_fmt = b->l_str - shift; + char *ori = b->str; + if (b->l_str + l > b->m_str) { // enlarge if necessary + b->m_str = b->l_str + l; + kroundup32(b->m_str); + b->str = realloc(b->str, b->m_str); + } + memmove(b->str + shift + l, b->str + shift, l_fmt); // move the FORMAT field + memcpy(b->str + shift - 1, info, l); // append to the INFO field + b->str[shift + l - 1] = '\0'; + b->fmt = b->str + shift + l; + b->l_str += l; + if (ori != b->str) bcf_sync(b); // synchronize when realloc changes the pointer + return 0; +} + +int remove_tag(char *str, const char *tag, char delim) +{ + char *tmp = str, *p; + int len_diff = 0, ori_len = strlen(str); + while ( *tmp && (p = strstr(tmp,tag)) ) + { + if ( p>str ) + { + if ( *(p-1)!=delim ) { tmp=p+1; continue; } // shared substring + p--; + } + char *q=p+1; + while ( *q && *q!=delim ) q++; + if ( p==str && *q ) q++; // the tag is first, don't move the delim char + len_diff += q-p; + if ( ! *q ) { *p = 0; break; } // the tag was last, no delim follows + else + memmove(p,q,ori_len-(int)(p-str)-(int)(q-p)); // *q==delim + } + if ( len_diff==ori_len ) + str[0]='.', str[1]=0, len_diff--; + + return len_diff; +} + + +void rm_info(kstring_t *s, const char *key) +{ + char *p = s->s; + int n = 0; + while ( n<4 ) + { + if ( !*p ) n++; + p++; + } + char *q = p+1; + while ( *q && q-s->s<s->l ) q++; + + int nrm = remove_tag(p, key, ';'); + if ( nrm ) + memmove(q-nrm, q, s->s+s->l-q+1); + s->l -= nrm; +} + +int bcf_cpy(bcf1_t *r, const bcf1_t *b) +{ + char *t1 = r->str; + bcf_ginfo_t *t2 = r->gi; + int i, t3 = r->m_str, t4 = r->m_gi; + *r = *b; + r->str = t1; r->gi = t2; r->m_str = t3; r->m_gi = t4; + if (r->m_str < b->m_str) { + r->m_str = b->m_str; + r->str = realloc(r->str, r->m_str); + } + memcpy(r->str, b->str, r->m_str); + bcf_sync(r); // calling bcf_sync() is simple but inefficient + for (i = 0; i < r->n_gi; ++i) + memcpy(r->gi[i].data, b->gi[i].data, r->n_smpl * r->gi[i].len); + return 0; +} + +int 
bcf_is_indel(const bcf1_t *b) +{ + char *p; + if (strlen(b->ref) > 1) return 1; + for (p = b->alt; *p; ++p) + if (*p != ',' && p[1] != ',' && p[1] != '\0') + return 1; + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bcftools/bcf.h Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,197 @@ +/* The MIT License + + Copyright (c) 2010 Broad Institute + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li <lh3@live.co.uk> */ + +#ifndef BCF_H +#define BCF_H + +#define BCF_VERSION "0.1.19-44428cd" + +#include <stdint.h> +#include <zlib.h> + +#ifndef BCF_LITE +#include "bgzf.h" +typedef BGZF *bcfFile; +#else +typedef gzFile bcfFile; +#define bgzf_open(fn, mode) gzopen(fn, mode) +#define bgzf_fdopen(fd, mode) gzdopen(fd, mode) +#define bgzf_close(fp) gzclose(fp) +#define bgzf_read(fp, buf, len) gzread(fp, buf, len) +#define bgzf_write(fp, buf, len) +#define bgzf_flush(fp) +#endif + +/* + A member in the structs below is said to "primary" if its content + cannot be inferred from other members in any of structs below; a + member is said to be "derived" if its content can be derived from + other members. 
For example, bcf1_t::str is primary as this comes from + the input data, while bcf1_t::info is derived as it can always be + correctly set if we know bcf1_t::str. Derived members are for quick + access to the content and must be synchronized with the primary data. + */ + +typedef struct { + uint32_t fmt; // format of the block, set by bcf_str2int(). + int len; // length of data for each individual + void *data; // concatenated data + // derived info: fmt, len (<-bcf1_t::fmt) +} bcf_ginfo_t; + +typedef struct { + int32_t tid, pos; // refID and 0-based position + int32_t l_str, m_str; // length and the allocated size of ->str + float qual; // SNP quality + char *str; // concatenated string of variable length strings in VCF (from col.2 to col.7) + char *ref, *alt, *flt, *info, *fmt; // they all point to ->str; no memory allocation + int n_gi, m_gi; // number and the allocated size of geno fields + bcf_ginfo_t *gi; // array of geno fields + int n_alleles, n_smpl; // number of alleles and samples + // derived info: ref, alt, flt, info, fmt (<-str), n_gi (<-fmt), n_alleles (<-alt), n_smpl (<-bcf_hdr_t::n_smpl) + uint8_t *ploidy; // ploidy of all samples; if NULL, ploidy of 2 is assumed. 
+} bcf1_t; + +typedef struct { + int32_t n_ref, n_smpl; // number of reference sequences and samples + int32_t l_nm; // length of concatenated sequence names; 0 padded + int32_t l_smpl; // length of concatenated sample names; 0 padded + int32_t l_txt; // length of header text (lines started with ##) + char *name, *sname, *txt; // concatenated sequence names, sample names and header text + char **ns, **sns; // array of sequence and sample names; point to name and sname, respectively + // derived info: n_ref (<-name), n_smpl (<-sname), ns (<-name), sns (<-sname) +} bcf_hdr_t; + +typedef struct { + int is_vcf; // if the file in operation is a VCF + void *v; // auxillary data structure for VCF + bcfFile fp; // file handler for BCF +} bcf_t; + +struct __bcf_idx_t; +typedef struct __bcf_idx_t bcf_idx_t; + +#ifdef __cplusplus +extern "C" { +#endif + + // open a BCF file; for BCF file only + bcf_t *bcf_open(const char *fn, const char *mode); + // close file + int bcf_close(bcf_t *b); + // read one record from BCF; return -1 on end-of-file, and <-1 for errors + int bcf_read(bcf_t *bp, const bcf_hdr_t *h, bcf1_t *b); + // call this function if b->str is changed + int bcf_sync(bcf1_t *b); + // write a BCF record + int bcf_write(bcf_t *bp, const bcf_hdr_t *h, const bcf1_t *b); + // read the BCF header; BCF only + bcf_hdr_t *bcf_hdr_read(bcf_t *b); + // write the BCF header + int bcf_hdr_write(bcf_t *b, const bcf_hdr_t *h); + // set bcf_hdr_t::ns and bcf_hdr_t::sns + int bcf_hdr_sync(bcf_hdr_t *b); + // destroy the header + void bcf_hdr_destroy(bcf_hdr_t *h); + // destroy a record + int bcf_destroy(bcf1_t *b); + // BCF->VCF conversion + char *bcf_fmt(const bcf_hdr_t *h, bcf1_t *b); + // append more info + int bcf_append_info(bcf1_t *b, const char *info, int l); + // remove tag + int remove_tag(char *string, const char *tag, char delim); + // remove info tag, string is the kstring holder of bcf1_t.str + void rm_info(kstring_t *string, const char *key); + // copy + int 
bcf_cpy(bcf1_t *r, const bcf1_t *b); + + // open a VCF or BCF file if "b" is set in "mode" + bcf_t *vcf_open(const char *fn, const char *mode); + // close a VCF/BCF file + int vcf_close(bcf_t *bp); + // read the VCF/BCF header + bcf_hdr_t *vcf_hdr_read(bcf_t *bp); + // read the sequence dictionary from a separate file; required for VCF->BCF conversion + int vcf_dictread(bcf_t *bp, bcf_hdr_t *h, const char *fn); + // read a VCF/BCF record; return -1 on end-of-file and <-1 for errors + int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b); + // write the VCF header + int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h); + // write a VCF record + int vcf_write(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b); + + // keep the first n alleles and discard the rest + int bcf_shrink_alt(bcf1_t *b, int n); + // keep the masked alleles and discard the rest + void bcf_fit_alt(bcf1_t *b, int mask); + // convert GL to PL + int bcf_gl2pl(bcf1_t *b); + // if the site is an indel + int bcf_is_indel(const bcf1_t *b); + bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list); + int bcf_subsam(int n_smpl, int *list, bcf1_t *b); + // move GT to the first FORMAT field + int bcf_fix_gt(bcf1_t *b); + // update PL generated by old samtools + int bcf_fix_pl(bcf1_t *b); + // convert PL to GLF-like 10-likelihood GL + int bcf_gl10(const bcf1_t *b, uint8_t *gl); + // convert up to 4 INDEL alleles to GLF-like 10-likelihood GL + int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl); + + // string hash table + void *bcf_build_refhash(bcf_hdr_t *h); + void bcf_str2id_destroy(void *_hash); + void bcf_str2id_thorough_destroy(void *_hash); + int bcf_str2id_add(void *_hash, const char *str); + int bcf_str2id(void *_hash, const char *str); + void *bcf_str2id_init(); + + // indexing related functions + int bcf_idx_build(const char *fn); + uint64_t bcf_idx_query(const bcf_idx_t *idx, int tid, int beg); + int bcf_parse_region(void *str2id, const char *str, int *tid, int *begin, int *end); + bcf_idx_t 
*bcf_idx_load(const char *fn); + void bcf_idx_destroy(bcf_idx_t *idx); + +#ifdef __cplusplus +} +#endif + +static inline uint32_t bcf_str2int(const char *str, int l) +{ + int i; + uint32_t x = 0; + for (i = 0; i < l && i < 4; ++i) { + if (str[i] == 0) return x; + x = x<<8 | str[i]; + } + return x; +} + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bcftools/bcf.tex Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,77 @@ +\documentclass[10pt,pdftex]{article} +\usepackage{color} +\definecolor{gray}{rgb}{0.7,0.7,0.7} + +\setlength{\topmargin}{0.0cm} +\setlength{\textheight}{21.5cm} +\setlength{\oddsidemargin}{0cm} +\setlength{\textwidth}{16.5cm} +\setlength{\columnsep}{0.6cm} + +\begin{document} + +\begin{center} +\begin{tabular}{|l|l|l|l|l|} +\hline +\multicolumn{2}{|c|}{\bf Field} & \multicolumn{1}{c|}{\bf Description} & \multicolumn{1}{c|}{\bf Type} & \multicolumn{1}{c|}{\bf Value} \\\hline\hline +\multicolumn{2}{|l|}{\sf magic} & Magic string & {\tt char[4]} & {\tt BCF\char92 4} \\\hline +\multicolumn{2}{|l|}{\sf l\_seqnm} & Length of concatenated sequence names & {\tt int32\_t} & \\\hline +\multicolumn{2}{|l|}{\sf seqnm} & Concatenated names, {\tt NULL} padded & {\tt char[{\sf l\_seqnm}]} & \\\hline +\multicolumn{2}{|l|}{\sf l\_smpl} & Length of concatenated sample names & {\tt int32\_t} & \\\hline +\multicolumn{2}{|l|}{\sf smpl} & Concatenated sample names & {\tt char[{\sf l\_smpl}]} & \\\hline +\multicolumn{2}{|l|}{\sf l\_meta} & Length of the meta text (double-hash lines)& {\tt int32\_t} & \\\hline +\multicolumn{2}{|l|}{\sf meta} & Meta text, {\tt NULL} terminated & {\tt char[{\sf l\_meta}]} & \\\hline +\multicolumn{5}{|c|}{\it \color{gray}{List of records until the end of the file}}\\\cline{2-5} +& {\sf seq\_id} & Reference sequence ID & {\tt int32\_t} & \\\cline{2-5} +& {\sf pos} & Position & {\tt int32\_t} & \\\cline{2-5} +& {\sf qual} & Variant quality & {\tt float} & \\\cline{2-5} +& {\sf l\_str} & Length of {\sf str} & {\tt int32\_t} & \\\cline{2-5} +& {\sf str} & {\tt ID+REF+ALT+FILTER+INFO+FORMAT}, {\tt NULL} padded & {\tt char[{\sf l\_str}]} &\\\cline{2-5} +& \multicolumn{4}{c|}{Blocks of data; \#blocks and formats defined by {\tt FORMAT} (table below)}\\ +\hline +\end{tabular} +\end{center} + +\begin{center}
+\begin{tabular}{clp{9cm}} +\hline +\multicolumn{1}{l}{\bf Field} & \multicolumn{1}{l}{\bf Type} & \multicolumn{1}{l}{\bf Description} \\\hline +{\tt DP} & {\tt uint16\_t[n]} & Read depth \\ +{\tt GL} & {\tt float[n*G]} & Log10 likelihood of data; $G=\frac{A(A+1)}{2}$, $A=\#\{alleles\}$\\ +{\tt GT} & {\tt uint8\_t[n]} & {\tt missing\char60\char60 7 | phased\char60\char60 6 | allele1\char60\char60 3 | allele2} \\ +{\tt \_GT} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic GT; the first int equals the max ploidy $P$. If the highest bit is set, + the allele is not present (e.g. due to different ploidy between samples).} \\ +{\tt GQ} & {\tt uint8\_t[n]} & {Genotype quality}\\ +{\tt HQ} & {\tt uint8\_t[n*2]} & {Haplotype quality}\\ +{\tt \_HQ} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic HQ}\\ +{\tt IBD} & {\tt uint32\_t[n*2]} & {IBD}\\ +{\tt \_IBD} & {\tt uint8\_t+uint32\_t[n*P]} & {Generic IBD}\\ +{\tt PL} & {\tt uint8\_t[n*G]} & {Phred-scaled likelihood of data}\\ +{\tt PS} & {\tt uint32\_t[n]} & {Phase set}\\ +%{\tt SP} & {\tt uint8\_t[n]} & {Strand bias P-value (bcftools only)}\\ +\emph{Integer} & {\tt int32\_t[n*X]} & {Fix-sized custom Integer; $X$ defined in the header}\\ +\emph{Numeric} & {\tt double[n*X]} & {Fix-sized custom Numeric}\\ +\emph{String} & {\tt uint32\_t+char*} & {\tt NULL} padded concat. strings (int equals to the length) \\ +\hline +\end{tabular} +\end{center} + +\begin{itemize} +\item A BCF file is in the {\tt BGZF} format. +\item All multi-byte numbers are little-endian. +\item In a string, a missing value `.' is an empty C string ``{\tt + \char92 0}'' (not ``{\tt .\char92 0}'') +\item For {\tt GL} and {\tt PL}, likelihoods of genotypes appear in the + order of alleles in {\tt REF} and then {\tt ALT}. For example, if {\tt + REF=C}, {\tt ALT=T,A}, likelihoods appear in the order of {\tt + CC,CT,TT,CA,TA,AA} (NB: the ordering is different from the one in the original + BCF proposal). 
+\item Predefined {\tt FORMAT} fields can be missing from VCF headers, but custom {\tt FORMAT} fields + are required to be explicitly defined in the headers. +\item A {\tt FORMAT} field with its name starting with `{\tt \_}' is specific to BCF only. + It gives an alternative binary representation of the corresponding VCF field, in case + the default representation is unable to keep the genotype information, + for example, when the ploidy is not 2 or there are more than 8 alleles. +\end{itemize} + +\end{document}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bcftools/bcf2qcall.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,91 @@ +#include <errno.h> +#include <math.h> +#include <string.h> +#include <stdlib.h> +#include "bcf.h" + +static int8_t nt4_table[256] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +}; + +static int read_I16(bcf1_t *b, int anno[16]) +{ + char *p; + int i; + if ((p = strstr(b->info, "I16=")) == 0) return -1; + p += 4; + for (i = 0; i < 16; ++i) { + anno[i] = strtol(p, &p, 10); + if (anno[i] == 0 && (errno == EINVAL || errno == ERANGE)) return -2; + ++p; + } + return 0; +} + +int bcf_2qcall(bcf_hdr_t *h, bcf1_t *b) +{ + int a[4], k, g[10], l, map[4], k1, j, i, i0, anno[16], dp, mq, d_rest; + char *s; + if (b->ref[1] != 0 || b->n_alleles > 4) return -1; // ref is not a single base + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; + if (i == b->n_gi) return -1; // no PL + if (read_I16(b, anno) != 0) return -1; // no I16; FIXME: can be improved + d_rest = dp = anno[0] + anno[1] + anno[2] + anno[3]; + if (dp == 0) return -1; // depth is zero + mq = (int)(sqrt((double)(anno[9] + anno[11]) / dp) + .499); + i0 = i; + a[0] = 
nt4_table[(int)b->ref[0]]; + if (a[0] > 3) return -1; // ref is not A/C/G/T + a[1] = a[2] = a[3] = -2; // -1 has a special meaning + if (b->alt[0] == 0) return -1; // no alternate allele + map[0] = map[1] = map[2] = map[3] = -2; + map[a[0]] = 0; + for (k = 0, s = b->alt, k1 = -1; k < 3 && *s; ++k, s += 2) { + if (s[1] != ',' && s[1] != 0) return -1; // ALT is not single base + a[k+1] = nt4_table[(int)*s]; + if (a[k+1] >= 0) map[a[k+1]] = k+1; + else k1 = k+1; + if (s[1] == 0) break; + } + for (k = 0; k < 4; ++k) + if (map[k] < 0) map[k] = k1; + for (i = 0; i < h->n_smpl; ++i) { + int d; + uint8_t *p = b->gi[i0].data + i * b->gi[i0].len; + for (j = 0; j < b->gi[i0].len; ++j) + if (p[j]) break; + d = (int)((double)d_rest / (h->n_smpl - i) + .499); + if (d == 0) d = 1; + if (j == b->gi[i0].len) d = 0; + d_rest -= d; + for (k = j = 0; k < 4; ++k) { + for (l = k; l < 4; ++l) { + int t, x = map[k], y = map[l]; + if (x > y) t = x, x = y, y = t; // swap + g[j++] = p[y * (y+1) / 2 + x]; + } + } + printf("%s\t%d\t%c", h->ns[b->tid], b->pos+1, *b->ref); + printf("\t%d\t%d\t0", d, mq); + for (j = 0; j < 10; ++j) + printf("\t%d", g[j]); + printf("\t%s\n", h->sns[i]); + } + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bcftools/bcfutils.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,504 @@ +#include <string.h> +#include <math.h> +#include <assert.h> +#include "bcf.h" +#include "kstring.h" +#include "khash.h" +KHASH_MAP_INIT_STR(str2id, int) + +#ifdef _WIN32 +#define srand48(x) srand(x) +#define drand48() ((double)rand() / RAND_MAX) +#endif + +// FIXME: valgrind report a memory leak in this function. Probably it does not get deallocated... +void *bcf_build_refhash(bcf_hdr_t *h) +{ + khash_t(str2id) *hash; + int i, ret; + hash = kh_init(str2id); + for (i = 0; i < h->n_ref; ++i) { + khint_t k; + k = kh_put(str2id, hash, h->ns[i], &ret); // FIXME: check ret + kh_val(hash, k) = i; + } + return hash; +} + +void *bcf_str2id_init() +{ + return kh_init(str2id); +} + +void bcf_str2id_destroy(void *_hash) +{ + khash_t(str2id) *hash = (khash_t(str2id)*)_hash; + if (hash) kh_destroy(str2id, hash); // Note that strings are not freed. +} + +void bcf_str2id_thorough_destroy(void *_hash) +{ + khash_t(str2id) *hash = (khash_t(str2id)*)_hash; + khint_t k; + if (hash == 0) return; + for (k = 0; k < kh_end(hash); ++k) + if (kh_exist(hash, k)) free((char*)kh_key(hash, k)); + kh_destroy(str2id, hash); +} + +int bcf_str2id(void *_hash, const char *str) +{ + khash_t(str2id) *hash = (khash_t(str2id)*)_hash; + khint_t k; + if (!hash) return -1; + k = kh_get(str2id, hash, str); + return k == kh_end(hash)? 
-1 : kh_val(hash, k); +} + +int bcf_str2id_add(void *_hash, const char *str) +{ + khint_t k; + int ret; + khash_t(str2id) *hash = (khash_t(str2id)*)_hash; + if (!hash) return -1; + k = kh_put(str2id, hash, str, &ret); + if (ret == 0) return kh_val(hash, k); + kh_val(hash, k) = kh_size(hash) - 1; + return kh_val(hash, k); +} + +void bcf_fit_alt(bcf1_t *b, int mask) +{ + mask |= 1; // REF must be always present + + int i,j,nals=0; + for (i=0; i<sizeof(int); i++) + if ( mask&1<<i) nals++; + if ( b->n_alleles <= nals ) return; + + // update ALT, in principle any of the alleles can be removed + char *p; + if ( nals>1 ) + { + char *dst, *src; + int n=0, nalts=nals-1; + for (src=dst=p=b->alt, i=1; *p; p++) + { + if ( *p!=',' ) continue; + + if ( mask&1<<i ) + { + n++; + if ( src!=dst ) + { + memmove(dst,src,p-src); + dst += p-src; + } + else dst = p; + if ( n<nalts ) { *dst=','; dst++; } + } + i++; + + if ( n>=nalts ) { *dst=0; break; } + src = p+1; + } + if ( n<nalts ) + { + memmove(dst,src,p-src); + dst += p-src; + *dst = 0; + } + p = dst; + } + else p = b->alt, *p = '\0'; + p++; + memmove(p, b->flt, b->str + b->l_str - b->flt); + b->l_str -= b->flt - p; + + // update PL and GT + int ipl=-1, igt=-1; + for (i = 0; i < b->n_gi; ++i) + { + bcf_ginfo_t *g = b->gi + i; + if (g->fmt == bcf_str2int("PL", 2)) ipl = i; + if (g->fmt == bcf_str2int("GT", 2)) igt = i; + } + + // .. create mapping between old and new indexes + int npl = nals * (nals+1) / 2; + int *map = malloc(sizeof(int)*(npl>b->n_alleles ? npl : b->n_alleles)); + int kori=0,knew=0; + for (i=0; i<b->n_alleles; i++) + { + for (j=0; j<=i; j++) + { + int skip=0; + if ( i && !(mask&1<<i) ) skip=1; + if ( j && !(mask&1<<j) ) skip=1; + if ( !skip ) { map[knew++] = kori; } + kori++; + } + } + // .. 
apply to all samples + int n_smpl = b->n_smpl; + for (i = 0; i < b->n_gi; ++i) + { + bcf_ginfo_t *g = b->gi + i; + if (g->fmt == bcf_str2int("PL", 2)) + { + g->len = npl; + uint8_t *d = (uint8_t*)g->data; + int ismpl, npl_ori = b->n_alleles * (b->n_alleles + 1) / 2; + for (knew=ismpl=0; ismpl<n_smpl; ismpl++) + { + uint8_t *dl = d + ismpl * npl_ori; + for (j=0; j<npl; j++) d[knew++] = dl[map[j]]; + } + } // FIXME: to add GL + } + // update GTs + map[0] = 0; + for (i=1, knew=0; i<b->n_alleles; i++) + map[i] = mask&1<<i ? ++knew : -1; + for (i=0; i<n_smpl; i++) + { + uint8_t gt = ((uint8_t*)b->gi[igt].data)[i]; + int a1 = (gt>>3)&7; + int a2 = gt&7; + assert( map[a1]>=0 && map[a2]>=0 ); + ((uint8_t*)b->gi[igt].data)[i] = ((1<<7|1<<6)>) | map[a1]<<3 | map[a2]; + } + free(map); + b->n_alleles = nals; + bcf_sync(b); +} + +int bcf_shrink_alt(bcf1_t *b, int n) +{ + char *p; + int i, j, k, n_smpl = b->n_smpl; + if (b->n_alleles <= n) return -1; + // update ALT + if (n > 1) { + for (p = b->alt, k = 1; *p; ++p) + if (*p == ',' && ++k == n) break; + *p = '\0'; + } else p = b->alt, *p = '\0'; + ++p; + memmove(p, b->flt, b->str + b->l_str - b->flt); + b->l_str -= b->flt - p; + // update PL + for (i = 0; i < b->n_gi; ++i) { + bcf_ginfo_t *g = b->gi + i; + if (g->fmt == bcf_str2int("PL", 2)) { + int l, x = b->n_alleles * (b->n_alleles + 1) / 2; + uint8_t *d = (uint8_t*)g->data; + g->len = n * (n + 1) / 2; + for (l = k = 0; l < n_smpl; ++l) { + uint8_t *dl = d + l * x; + for (j = 0; j < g->len; ++j) d[k++] = dl[j]; + } + } // FIXME: to add GL + } + b->n_alleles = n; + bcf_sync(b); + return 0; +} + +int bcf_gl2pl(bcf1_t *b) +{ + char *p; + int i, n_smpl = b->n_smpl; + bcf_ginfo_t *g; + float *d0; + uint8_t *d1; + if (strstr(b->fmt, "PL")) return -1; + if ((p = strstr(b->fmt, "GL")) == 0) return -1; + *p = 'P'; + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == bcf_str2int("GL", 2)) + break; + g = b->gi + i; + g->fmt = bcf_str2int("PL", 2); + g->len /= 4; // 4 == sizeof(float) + 
d0 = (float*)g->data; d1 = (uint8_t*)g->data; + for (i = 0; i < n_smpl * g->len; ++i) { + int x = (int)(-10. * d0[i] + .499); + if (x > 255) x = 255; + if (x < 0) x = 0; + d1[i] = x; + } + return 0; +} +/* FIXME: this function will fail given AB:GTX:GT. BCFtools never + * produces such FMT, but others may do. */ +int bcf_fix_gt(bcf1_t *b) +{ + char *s; + int i; + uint32_t tmp; + bcf_ginfo_t gt; + // check the presence of the GT FMT + if ((s = strstr(b->fmt, ":GT")) == 0) return 0; // no GT or GT is already the first + assert(s[3] == '\0' || s[3] == ':'); // :GTX in fact + tmp = bcf_str2int("GT", 2); + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == tmp) break; + if (i == b->n_gi) return 0; // no GT in b->gi; probably a bug... + gt = b->gi[i]; + // move GT to the first + for (; i > 0; --i) b->gi[i] = b->gi[i-1]; + b->gi[0] = gt; + if ( s[3]==0 ) + memmove(b->fmt + 3, b->fmt, s - b->fmt); // :GT + else + memmove(b->fmt + 3, b->fmt, s - b->fmt + 1); // :GT: + b->fmt[0] = 'G'; b->fmt[1] = 'T'; b->fmt[2] = ':'; + return 0; +} + +int bcf_fix_pl(bcf1_t *b) +{ + int i; + uint32_t tmp; + uint8_t *PL, *swap; + bcf_ginfo_t *gi; + // pinpoint PL + tmp = bcf_str2int("PL", 2); + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == tmp) break; + if (i == b->n_gi) return 0; + // prepare + gi = b->gi + i; + PL = (uint8_t*)gi->data; + swap = alloca(gi->len); + // loop through individuals + for (i = 0; i < b->n_smpl; ++i) { + int k, l, x; + uint8_t *PLi = PL + i * gi->len; + memcpy(swap, PLi, gi->len); + for (k = x = 0; k < b->n_alleles; ++k) + for (l = k; l < b->n_alleles; ++l) + PLi[l*(l+1)/2 + k] = swap[x++]; + } + return 0; +} + +int bcf_smpl_covered(const bcf1_t *b) +{ + int i, j, n = 0; + uint32_t tmp; + bcf_ginfo_t *gi; + // pinpoint PL + tmp = bcf_str2int("PL", 2); + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == tmp) break; + if (i == b->n_gi) return 0; + // count how many samples having PL!=[0..0] + gi = b->gi + i; + for (i = 0; i < b->n_smpl; ++i) { + uint8_t *PLi 
= ((uint8_t*)gi->data) + i * gi->len; + for (j = 0; j < gi->len; ++j) + if (PLi[j]) break; + if (j < gi->len) ++n; + } + return n; +} + +static void *locate_field(const bcf1_t *b, const char *fmt, int l) +{ + int i; + uint32_t tmp; + tmp = bcf_str2int(fmt, l); + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == tmp) break; + return i == b->n_gi? 0 : b->gi[i].data; +} + +int bcf_anno_max(bcf1_t *b) +{ + int k, max_gq, max_sp, n_het; + kstring_t str; + uint8_t *gt, *gq; + int32_t *sp; + max_gq = max_sp = n_het = 0; + gt = locate_field(b, "GT", 2); + if (gt == 0) return -1; + gq = locate_field(b, "GQ", 2); + sp = locate_field(b, "SP", 2); + if (sp) + for (k = 0; k < b->n_smpl; ++k) + if (gt[k]&0x3f) + max_sp = max_sp > (int)sp[k]? max_sp : sp[k]; + if (gq) + for (k = 0; k < b->n_smpl; ++k) + if (gt[k]&0x3f) + max_gq = max_gq > (int)gq[k]? max_gq : gq[k]; + for (k = 0; k < b->n_smpl; ++k) { + int a1, a2; + a1 = gt[k]&7; a2 = gt[k]>>3&7; + if ((!a1 && a2) || (!a2 && a1)) { // a het + if (gq == 0) ++n_het; + else if (gq[k] >= 20) ++n_het; + } + } + if (n_het) max_sp -= (int)(4.343 * log(n_het) + .499); + if (max_sp < 0) max_sp = 0; + memset(&str, 0, sizeof(kstring_t)); + if (*b->info) kputc(';', &str); + ksprintf(&str, "MXSP=%d;MXGQ=%d", max_sp, max_gq); + bcf_append_info(b, str.s, str.l); + free(str.s); + return 0; +} + +// FIXME: only data are shuffled; the header is NOT +int bcf_shuffle(bcf1_t *b, int seed) +{ + int i, j, *a; + if (seed > 0) srand48(seed); + a = malloc(b->n_smpl * sizeof(int)); + for (i = 0; i < b->n_smpl; ++i) a[i] = i; + for (i = b->n_smpl; i > 1; --i) { + int tmp; + j = (int)(drand48() * i); + tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; + } + for (j = 0; j < b->n_gi; ++j) { + bcf_ginfo_t *gi = b->gi + j; + uint8_t *swap, *data = (uint8_t*)gi->data; + swap = malloc(gi->len * b->n_smpl); + for (i = 0; i < b->n_smpl; ++i) + memcpy(swap + gi->len * a[i], data + gi->len * i, gi->len); + free(gi->data); + gi->data = swap; + } + free(a); + return 0; +} + 
+bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list) +{ + int i, ret, j; + khint_t k; + bcf_hdr_t *h; + khash_t(str2id) *hash; + kstring_t s; + s.l = s.m = 0; s.s = 0; + hash = kh_init(str2id); + for (i = 0; i < h0->n_smpl; ++i) { + k = kh_put(str2id, hash, h0->sns[i], &ret); + kh_val(hash, k) = i; + } + for (i = j = 0; i < n; ++i) { + k = kh_get(str2id, hash, samples[i]); + if (k != kh_end(hash)) { + list[j++] = kh_val(hash, k); + kputs(samples[i], &s); kputc('\0', &s); + } + } + if (j < n) + { + fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j); + exit(1); + } + kh_destroy(str2id, hash); + h = calloc(1, sizeof(bcf_hdr_t)); + *h = *h0; + h->ns = 0; h->sns = 0; + h->name = malloc(h->l_nm); memcpy(h->name, h0->name, h->l_nm); + h->txt = calloc(1, h->l_txt + 1); memcpy(h->txt, h0->txt, h->l_txt); + h->l_smpl = s.l; h->sname = s.s; + bcf_hdr_sync(h); + return h; +} + +int bcf_subsam(int n_smpl, int *list, bcf1_t *b) +{ + int i, j; + for (j = 0; j < b->n_gi; ++j) { + bcf_ginfo_t *gi = b->gi + j; + uint8_t *swap; + swap = malloc(gi->len * b->n_smpl); + for (i = 0; i < n_smpl; ++i) + memcpy(swap + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len); + free(gi->data); + gi->data = swap; + } + b->n_smpl = n_smpl; + return 0; +} + +static int8_t nt4_table[128] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4 +}; + +int bcf_gl10(const bcf1_t *b, uint8_t *gl) +{ + int a[4], k, l, map[4], k1, j, i; + const bcf_ginfo_t *PL; + char *s; + if (b->ref[1] != 0 || b->n_alleles > 4) return -1; // ref is not a single base or >4 alleles + for (i = 0; i < 
b->n_gi; ++i) + if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; + if (i == b->n_gi) return -1; // no PL + PL = b->gi + i; + a[0] = nt4_table[(int)b->ref[0]]; + if (a[0] > 3 || a[0] < 0) return -1; // ref is not A/C/G/T + a[1] = a[2] = a[3] = -2; // -1 has a special meaning + if (b->alt[0] == 0) return -1; // no alternate allele + map[0] = map[1] = map[2] = map[3] = -2; + map[a[0]] = 0; + for (k = 0, s = b->alt, k1 = -1; k < 3 && *s; ++k, s += 2) { + if (s[1] != ',' && s[1] != 0) return -1; // ALT is not single base + a[k+1] = nt4_table[(int)*s]; + if (a[k+1] >= 0) map[a[k+1]] = k+1; + else k1 = k + 1; + if (s[1] == 0) break; // the end of the ALT string + } + for (k = 0; k < 4; ++k) + if (map[k] < 0) map[k] = k1; + for (i = 0; i < b->n_smpl; ++i) { + const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual + uint8_t *g = gl + 10 * i; + for (k = j = 0; k < 4; ++k) { + for (l = k; l < 4; ++l) { + int t, x = map[k], y = map[l]; + if (x > y) t = x, x = y, y = t; // make sure x is the smaller + g[j++] = p[y * (y+1) / 2 + x]; + } + } + } + return 0; +} + +int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl) +{ + int k, l, j, i; + const bcf_ginfo_t *PL; + if (b->alt[0] == 0) return -1; // no alternate allele + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; + if (i == b->n_gi) return -1; // no PL + PL = b->gi + i; + for (i = 0; i < b->n_smpl; ++i) { + const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual + uint8_t *g = gl + 10 * i; + for (k = j = 0; k < 4; ++k) { + for (l = k; l < 4; ++l) { + int t, x = k, y = l; + if (x > y) t = x, x = y, y = t; // make sure x is the smaller + x = y * (y+1) / 2 + x; + g[j++] = x < PL->len? p[x] : 255; + } + } + } + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bcftools/call1.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,633 @@ +#include <unistd.h> +#include <stdlib.h> +#include <math.h> +#include <zlib.h> +#include <errno.h> +#include "bcf.h" +#include "prob1.h" +#include "kstring.h" +#include "time.h" + +#ifdef _WIN32 +#define srand48(x) srand(x) +#define lrand48() rand() +#endif + +#include "kseq.h" +KSTREAM_INIT(gzFile, gzread, 16384) + +#define VC_NO_GENO 2 +#define VC_BCFOUT 4 +#define VC_CALL 8 +#define VC_VARONLY 16 +#define VC_VCFIN 32 +#define VC_UNCOMP 64 +#define VC_KEEPALT 256 +#define VC_ACGT_ONLY 512 +#define VC_QCALL 1024 +#define VC_CALL_GT 2048 +#define VC_ADJLD 4096 +#define VC_NO_INDEL 8192 +#define VC_ANNO_MAX 16384 +#define VC_FIX_PL 32768 +#define VC_EM 0x10000 +#define VC_PAIRCALL 0x20000 +#define VC_QCNT 0x40000 +#define VC_INDEL_ONLY 0x80000 + +typedef struct { + int flag, prior_type, n1, n_sub, *sublist, n_perm; + uint32_t *trio_aux; + char *prior_file, **subsam, *fn_dict; + uint8_t *ploidy; + double theta, pref, indel_frac, min_perm_p, min_smpl_frac, min_lrt, min_ma_lrt; + void *bed; +} viewconf_t; + +void *bed_read(const char *fn); +void bed_destroy(void *_h); +int bed_overlap(const void *_h, const char *chr, int beg, int end); + +static double ttest(int n1, int n2, int a[4]) +{ + extern double kf_betai(double a, double b, double x); + double t, v, u1, u2; + if (n1 == 0 || n2 == 0 || n1 + n2 < 3) return 1.0; + u1 = (double)a[0] / n1; u2 = (double)a[2] / n2; + if (u1 <= u2) return 1.; + t = (u1 - u2) / sqrt(((a[1] - n1 * u1 * u1) + (a[3] - n2 * u2 * u2)) / (n1 + n2 - 2) * (1./n1 + 1./n2)); + v = n1 + n2 - 2; +// printf("%d,%d,%d,%d,%lf,%lf,%lf\n", a[0], a[1], a[2], a[3], t, u1, u2); + return t < 0.? 1. 
: .5 * kf_betai(.5*v, .5, v/(v+t*t)); +} + +static int test16_core(int anno[16], anno16_t *a) +{ + extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two); + double left, right; + int i; + a->p[0] = a->p[1] = a->p[2] = a->p[3] = 1.; + memcpy(a->d, anno, 4 * sizeof(int)); + a->depth = anno[0] + anno[1] + anno[2] + anno[3]; + a->is_tested = (anno[0] + anno[1] > 0 && anno[2] + anno[3] > 0); + if (a->depth == 0) return -1; + a->mq = (int)(sqrt((anno[9] + anno[11]) / a->depth) + .499); + kt_fisher_exact(anno[0], anno[1], anno[2], anno[3], &left, &right, &a->p[0]); + for (i = 1; i < 4; ++i) + a->p[i] = ttest(anno[0] + anno[1], anno[2] + anno[3], anno+4*i); + return 0; +} + +int test16(bcf1_t *b, anno16_t *a) +{ + char *p; + int i, anno[16]; + a->p[0] = a->p[1] = a->p[2] = a->p[3] = 1.; + a->d[0] = a->d[1] = a->d[2] = a->d[3] = 0.; + a->mq = a->depth = a->is_tested = 0; + if ((p = strstr(b->info, "I16=")) == 0) return -1; + p += 4; + for (i = 0; i < 16; ++i) { + errno = 0; anno[i] = strtol(p, &p, 10); + if (anno[i] == 0 && (errno == EINVAL || errno == ERANGE)) return -2; + ++p; + } + return test16_core(anno, a); +} + +static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag, double em[10], int cons_llr, int64_t cons_gt) +{ + kstring_t s; + int has_I16, is_var; + double fq, r; + anno16_t a; + + has_I16 = test16(b, &a) >= 0? 1 : 0; + //rm_info(b, "I16="); // FIXME: probably this function has a bug. If I move it below, I16 will not be removed! 
+ + memset(&s, 0, sizeof(kstring_t)); + kputc('\0', &s); kputs(b->ref, &s); kputc('\0', &s); + kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s); + kputs(b->info, &s); + if (b->info[0]) kputc(';', &s); + { // print EM + if (em[0] >= 0) ksprintf(&s, "AF1=%.4g", 1 - em[0]); + if (em[4] >= 0 && em[4] <= 0.05) ksprintf(&s, ";G3=%.4g,%.4g,%.4g;HWE=%.3g", em[3], em[2], em[1], em[4]); + if (em[5] >= 0 && em[6] >= 0) ksprintf(&s, ";AF2=%.4g,%.4g", 1 - em[5], 1 - em[6]); + if (em[7] >= 0) ksprintf(&s, ";LRT=%.3g", em[7]); + if (em[8] >= 0) ksprintf(&s, ";LRT2=%.3g", em[8]); + } + if (cons_llr > 0) { + ksprintf(&s, ";CLR=%d", cons_llr); + if (cons_gt > 0) + ksprintf(&s, ";UGT=%c%c%c;CGT=%c%c%c", cons_gt&0xff, cons_gt>>8&0xff, cons_gt>>16&0xff, + cons_gt>>32&0xff, cons_gt>>40&0xff, cons_gt>>48&0xff); + } + if (pr == 0) { // if pr is unset, return + kputc('\0', &s); kputs(b->fmt, &s); kputc('\0', &s); + free(b->str); + b->m_str = s.m; b->l_str = s.l; b->str = s.s; + bcf_sync(b); + return 1; + } + + is_var = (pr->p_ref < pref); + r = is_var? pr->p_ref : pr->p_var; + +// ksprintf(&s, ";CI95=%.4g,%.4g", pr->cil, pr->cih); // FIXME: when EM is not used, ";" should be omitted! + ksprintf(&s, ";AC1=%d", pr->ac); + if (has_I16) ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); + fq = pr->p_ref_folded < 0.5? -4.343 * log(pr->p_ref_folded) : 4.343 * log(pr->p_var_folded); + if (fq < -999) fq = -999; + if (fq > 999) fq = 999; + ksprintf(&s, ";FQ=%.3g", fq); + if (pr->cmp[0] >= 0.) { // two sample groups + int i, q[3]; + for (i = 1; i < 3; ++i) { + double x = pr->cmp[i] + pr->cmp[0]/2.; + q[i] = x == 0? 
255 : (int)(-4.343 * log(x) + .499); + if (q[i] > 255) q[i] = 255; + } + if (pr->perm_rank >= 0) ksprintf(&s, ";PR=%d", pr->perm_rank); + // ksprintf(&s, ";LRT3=%.3g", pr->lrt); + ksprintf(&s, ";PCHI2=%.3g;PC2=%d,%d", q[1], q[2], pr->p_chi2); + } + if (has_I16 && a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); + kputc('\0', &s); + rm_info(&s, "QS="); + rm_info(&s, "I16="); + kputs(b->fmt, &s); kputc('\0', &s); + free(b->str); + b->m_str = s.m; b->l_str = s.l; b->str = s.s; + b->qual = r < 1e-100? 999 : -4.343 * log(r); + if (b->qual > 999) b->qual = 999; + bcf_sync(b); + if (!is_var) bcf_shrink_alt(b, 1); + else if (!(flag&VC_KEEPALT)) + bcf_shrink_alt(b, pr->rank0 < 2? 2 : pr->rank0+1); + if (is_var && (flag&VC_CALL_GT)) { // call individual genotype + int i, x, old_n_gi = b->n_gi; + s.m = b->m_str; s.l = b->l_str - 1; s.s = b->str; + kputs(":GT:GQ", &s); kputc('\0', &s); + b->m_str = s.m; b->l_str = s.l; b->str = s.s; + bcf_sync(b); + for (i = 0; i < b->n_smpl; ++i) { + x = bcf_p1_call_gt(pa, pr->f_exp, i); + ((uint8_t*)b->gi[old_n_gi].data)[i] = (x&3) == 0? 1<<3|1 : (x&3) == 1? 1 : 0; + ((uint8_t*)b->gi[old_n_gi+1].data)[i] = x>>2; + } + } + return is_var; +} + +static char **read_samples(const char *fn, int *_n) +{ + gzFile fp; + kstream_t *ks; + kstring_t s; + int dret, n = 0, max = 0; + char **sam = 0; + *_n = 0; + s.l = s.m = 0; s.s = 0; + fp = gzopen(fn, "r"); + if (fp == 0) + { + // interpret as sample names, not as a file name + const char *t = fn, *p = t; + while (*t) + { + t++; + if ( *t==',' || !*t ) + { + sam = realloc(sam, sizeof(void*)*(n+1)); + sam[n] = (char*) malloc(sizeof(char)*(t-p+2)); + memcpy(sam[n], p, t-p); + sam[n][t-p] = 0; + sam[n][t-p+1] = 2; // assume diploid + p = t+1; + n++; + } + } + *_n = n; + return sam; // fail to open file + } + ks = ks_init(fp); + while (ks_getuntil(ks, 0, &s, &dret) >= 0) { + int l; + if (max == n) { + max = max? 
max<<1 : 4; + sam = realloc(sam, sizeof(void*)*max); + } + l = s.l; + sam[n] = malloc(s.l + 2); + strcpy(sam[n], s.s); + sam[n][l+1] = 2; // by default, diploid + if (dret != '\n') { + if (ks_getuntil(ks, 0, &s, &dret) >= 0) { // read ploidy, 1 or 2 + int x = (int)s.s[0] - '0'; + if (x == 1 || x == 2) sam[n][l+1] = x; + else fprintf(stderr, "(%s) ploidy can only be 1 or 2; assume diploid\n", __func__); + } + if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); + } + ++n; + } + ks_destroy(ks); + gzclose(fp); + free(s.s); + *_n = n; + return sam; +} + +static void write_header(bcf_hdr_t *h) +{ + kstring_t str; + str.l = h->l_txt? h->l_txt - 1 : 0; + str.m = str.l + 1; str.s = h->txt; + if (!strstr(str.s, "##INFO=<ID=DP,")) + kputs("##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">\n", &str); + if (!strstr(str.s, "##INFO=<ID=DP4,")) + kputs("##INFO=<ID=DP4,Number=4,Type=Integer,Description=\"# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases\">\n", &str); + if (!strstr(str.s, "##INFO=<ID=MQ,")) + kputs("##INFO=<ID=MQ,Number=1,Type=Integer,Description=\"Root-mean-square mapping quality of covering reads\">\n", &str); + if (!strstr(str.s, "##INFO=<ID=FQ,")) + kputs("##INFO=<ID=FQ,Number=1,Type=Float,Description=\"Phred probability of all samples being the same\">\n", &str); + if (!strstr(str.s, "##INFO=<ID=AF1,")) + kputs("##INFO=<ID=AF1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first ALT allele frequency (assuming HWE)\">\n", &str); + if (!strstr(str.s, "##INFO=<ID=AC1,")) + kputs("##INFO=<ID=AC1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first ALT allele count (no HWE assumption)\">\n", &str); + if (!strstr(str.s, "##INFO=<ID=AN,")) + kputs("##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">\n", &str); + if (!strstr(str.s, "##INFO=<ID=IS,")) + kputs("##INFO=<ID=IS,Number=2,Type=Float,Description=\"Maximum number of reads supporting 
an indel and fraction of indel reads\">\n", &str); + if (!strstr(str.s, "##INFO=<ID=AC,")) + kputs("##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes for each ALT allele, in the same order as listed\">\n", &str); + if (!strstr(str.s, "##INFO=<ID=G3,")) + kputs("##INFO=<ID=G3,Number=3,Type=Float,Description=\"ML estimate of genotype frequencies\">\n", &str); + if (!strstr(str.s, "##INFO=<ID=HWE,")) + kputs("##INFO=<ID=HWE,Number=1,Type=Float,Description=\"Chi^2 based HWE test P-value based on G3\">\n", &str); + if (!strstr(str.s, "##INFO=<ID=CLR,")) + kputs("##INFO=<ID=CLR,Number=1,Type=Integer,Description=\"Log ratio of genotype likelihoods with and without the constraint\">\n", &str); + if (!strstr(str.s, "##INFO=<ID=UGT,")) + kputs("##INFO=<ID=UGT,Number=1,Type=String,Description=\"The most probable unconstrained genotype configuration in the trio\">\n", &str); + if (!strstr(str.s, "##INFO=<ID=CGT,")) + kputs("##INFO=<ID=CGT,Number=1,Type=String,Description=\"The most probable constrained genotype configuration in the trio\">\n", &str); +// if (!strstr(str.s, "##INFO=<ID=CI95,")) +// kputs("##INFO=<ID=CI95,Number=2,Type=Float,Description=\"Equal-tail Bayesian credible interval of the site allele frequency at the 95% level\">\n", &str); + if (!strstr(str.s, "##INFO=<ID=PV4,")) + kputs("##INFO=<ID=PV4,Number=4,Type=Float,Description=\"P-values for strand bias, baseQ bias, mapQ bias and tail distance bias\">\n", &str); + if (!strstr(str.s, "##INFO=<ID=INDEL,")) + kputs("##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">\n", &str); + if (!strstr(str.s, "##INFO=<ID=PC2,")) + kputs("##INFO=<ID=PC2,Number=2,Type=Integer,Description=\"Phred probability of the nonRef allele frequency in group1 samples being larger (,smaller) than in group2.\">\n", &str); + if (!strstr(str.s, "##INFO=<ID=PCHI2,")) + kputs("##INFO=<ID=PCHI2,Number=1,Type=Float,Description=\"Posterior weighted chi^2 P-value for testing the 
association between group1 and group2 samples.\">\n", &str); + if (!strstr(str.s, "##INFO=<ID=QCHI2,")) + kputs("##INFO=<ID=QCHI2,Number=1,Type=Integer,Description=\"Phred scaled PCHI2.\">\n", &str); + if (!strstr(str.s, "##INFO=<ID=RP,")) + kputs("##INFO=<ID=PR,Number=1,Type=Integer,Description=\"# permutations yielding a smaller PCHI2.\">\n", &str); + if (!strstr(str.s, "##INFO=<ID=QBD,")) + kputs("##INFO=<ID=QBD,Number=1,Type=Float,Description=\"Quality by Depth: QUAL/#reads\">\n", &str); + //if (!strstr(str.s, "##INFO=<ID=RPS,")) + // kputs("##INFO=<ID=RPS,Number=3,Type=Float,Description=\"Read Position Stats: depth, average, stddev\">\n", &str); + if (!strstr(str.s, "##INFO=<ID=RPB,")) + kputs("##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Read Position Bias\">\n", &str); + if (!strstr(str.s, "##INFO=<ID=MDV,")) + kputs("##INFO=<ID=MDV,Number=1,Type=Integer,Description=\"Maximum number of high-quality nonRef reads in samples\">\n", &str); + if (!strstr(str.s, "##INFO=<ID=VDB,")) + kputs("##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias (v2) for filtering splice-site artefacts in RNA-seq data. 
Note: this version may be broken.\">\n", &str); + if (!strstr(str.s, "##FORMAT=<ID=GT,")) + kputs("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n", &str); + if (!strstr(str.s, "##FORMAT=<ID=GQ,")) + kputs("##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">\n", &str); + if (!strstr(str.s, "##FORMAT=<ID=GL,")) + kputs("##FORMAT=<ID=GL,Number=3,Type=Float,Description=\"Likelihoods for RR,RA,AA genotypes (R=ref,A=alt)\">\n", &str); + if (!strstr(str.s, "##FORMAT=<ID=DP,")) + kputs("##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"# high-quality bases\">\n", &str); + if (!strstr(str.s, "##FORMAT=<ID=DV,")) + kputs("##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"# high-quality non-reference bases\">\n", &str); + if (!strstr(str.s, "##FORMAT=<ID=SP,")) + kputs("##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">\n", &str); + if (!strstr(str.s, "##FORMAT=<ID=PL,")) + kputs("##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">\n", &str); + h->l_txt = str.l + 1; h->txt = str.s; +} + +double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]); + +int bcfview(int argc, char *argv[]) +{ + extern int bcf_2qcall(bcf_hdr_t *h, bcf1_t *b); + extern void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x); + extern int bcf_fix_gt(bcf1_t *b); + extern int bcf_anno_max(bcf1_t *b); + extern int bcf_shuffle(bcf1_t *b, int seed); + extern uint32_t *bcf_trio_prep(int is_x, int is_son); + extern int bcf_trio_call(uint32_t *prep, const bcf1_t *b, int *llr, int64_t *gt); + extern int bcf_pair_call(const bcf1_t *b); + extern int bcf_min_diff(const bcf1_t *b); + extern int bcf_p1_get_M(bcf_p1aux_t *b); + + extern gzFile bcf_p1_fp_lk; + + bcf_t *bp, *bout = 0; + bcf1_t *b, *blast; + int c, *seeds = 0; + uint64_t n_processed = 0, qcnt[256]; + viewconf_t vc; + bcf_p1aux_t *p1 = 0; + bcf_hdr_t *hin, *hout; + int tid, begin, end; + char moder[4], modew[4]; + + tid = 
begin = end = -1; + memset(&vc, 0, sizeof(viewconf_t)); + vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; vc.n_perm = 0; vc.min_perm_p = 0.01; vc.min_smpl_frac = 0; vc.min_lrt = 1; vc.min_ma_lrt = -1; + memset(qcnt, 0, 8 * 256); + while ((c = getopt(argc, argv, "FN1:l:cC:eHAGvbSuP:t:p:QgLi:IMs:D:U:X:d:T:Ywm:K:")) >= 0) { + switch (c) { + case '1': vc.n1 = atoi(optarg); break; + case 'l': vc.bed = bed_read(optarg); if (!vc.bed) { fprintf(stderr,"Could not read \"%s\"\n", optarg); return 1; } break; + case 'D': vc.fn_dict = strdup(optarg); break; + case 'F': vc.flag |= VC_FIX_PL; break; + case 'N': vc.flag |= VC_ACGT_ONLY; break; + case 'G': vc.flag |= VC_NO_GENO; break; + case 'A': vc.flag |= VC_KEEPALT; break; + case 'b': vc.flag |= VC_BCFOUT; break; + case 'S': vc.flag |= VC_VCFIN; break; + case 'c': vc.flag |= VC_CALL; break; + case 'e': vc.flag |= VC_EM; break; + case 'v': vc.flag |= VC_VARONLY | VC_CALL; break; + case 'u': vc.flag |= VC_UNCOMP | VC_BCFOUT; break; + case 'g': vc.flag |= VC_CALL_GT | VC_CALL; break; + case 'I': vc.flag |= VC_NO_INDEL; break; + case 'w': vc.flag |= VC_INDEL_ONLY; break; + case 'M': vc.flag |= VC_ANNO_MAX; break; + case 'Y': vc.flag |= VC_QCNT; break; + case 'm': vc.min_ma_lrt = atof(optarg); break; + case 't': vc.theta = atof(optarg); break; + case 'p': vc.pref = atof(optarg); break; + case 'i': vc.indel_frac = atof(optarg); break; + case 'Q': vc.flag |= VC_QCALL; break; + case 'L': vc.flag |= VC_ADJLD; break; + case 'U': vc.n_perm = atoi(optarg); break; + case 'C': vc.min_lrt = atof(optarg); break; + case 'X': vc.min_perm_p = atof(optarg); break; + case 'd': vc.min_smpl_frac = atof(optarg); break; + case 'K': bcf_p1_fp_lk = gzopen(optarg, "w"); break; + case 's': vc.subsam = read_samples(optarg, &vc.n_sub); + vc.ploidy = calloc(vc.n_sub + 1, 1); + for (tid = 0; tid < vc.n_sub; ++tid) vc.ploidy[tid] = vc.subsam[tid][strlen(vc.subsam[tid]) + 1]; + tid = -1; + break; + case 'T': + if (strcmp(optarg, 
"trioauto") == 0) vc.trio_aux = bcf_trio_prep(0, 0); + else if (strcmp(optarg, "trioxd") == 0) vc.trio_aux = bcf_trio_prep(1, 0); + else if (strcmp(optarg, "trioxs") == 0) vc.trio_aux = bcf_trio_prep(1, 1); + else if (strcmp(optarg, "pair") == 0) vc.flag |= VC_PAIRCALL; + else { + fprintf(stderr, "[%s] Option '-T' can only take value trioauto, trioxd or trioxs.\n", __func__); + return 1; + } + break; + case 'P': + if (strcmp(optarg, "full") == 0) vc.prior_type = MC_PTYPE_FULL; + else if (strcmp(optarg, "cond2") == 0) vc.prior_type = MC_PTYPE_COND2; + else if (strcmp(optarg, "flat") == 0) vc.prior_type = MC_PTYPE_FLAT; + else vc.prior_file = strdup(optarg); + break; + } + } + if (argc == optind) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bcftools view [options] <in.bcf> [reg]\n\n"); + fprintf(stderr, "Input/output options:\n\n"); + fprintf(stderr, " -A keep all possible alternate alleles at variant sites\n"); + fprintf(stderr, " -b output BCF instead of VCF\n"); + fprintf(stderr, " -D FILE sequence dictionary for VCF->BCF conversion [null]\n"); + fprintf(stderr, " -F PL generated by r921 or before (which generate old ordering)\n"); + fprintf(stderr, " -G suppress all individual genotype information\n"); + fprintf(stderr, " -l FILE list of sites (chr pos) or regions (BED) to output [all sites]\n"); + fprintf(stderr, " -L calculate LD for adjacent sites\n"); + fprintf(stderr, " -N skip sites where REF is not A/C/G/T\n"); + fprintf(stderr, " -Q output the QCALL likelihood format\n"); + fprintf(stderr, " -s FILE list of samples to use [all samples]\n"); + fprintf(stderr, " -S input is VCF\n"); + fprintf(stderr, " -u uncompressed BCF output (force -b)\n"); + fprintf(stderr, "\nConsensus/variant calling options:\n\n"); + fprintf(stderr, " -c SNP calling (force -e)\n"); + fprintf(stderr, " -d FLOAT skip loci where less than FLOAT fraction of samples covered [0]\n"); + fprintf(stderr, " -e likelihood based analyses\n"); + fprintf(stderr, " -g call genotypes at 
variant sites (force -c)\n"); + fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4g]\n", vc.indel_frac); + fprintf(stderr, " -I skip indels\n"); + fprintf(stderr, " -m FLOAT alternative model for multiallelic and rare-variant calling, include if P(chi^2)>=FLOAT\n"); + fprintf(stderr, " -p FLOAT variant if P(ref|D)<FLOAT [%.3g]\n", vc.pref); + fprintf(stderr, " -P STR type of prior: full, cond2, flat [full]\n"); + fprintf(stderr, " -t FLOAT scaled substitution mutation rate [%.4g]\n", vc.theta); + fprintf(stderr, " -T STR constrained calling; STR can be: pair, trioauto, trioxd and trioxs (see manual) [null]\n"); + fprintf(stderr, " -v output potential variant sites only (force -c)\n"); + fprintf(stderr, "\nContrast calling and association test options:\n\n"); + fprintf(stderr, " -1 INT number of group-1 samples [0]\n"); + fprintf(stderr, " -C FLOAT posterior constrast for LRT<FLOAT and P(ref|D)<0.5 [%g]\n", vc.min_lrt); + fprintf(stderr, " -U INT number of permutations for association testing (effective with -1) [0]\n"); + fprintf(stderr, " -X FLOAT only perform permutations for P(chi^2)<FLOAT [%g]\n", vc.min_perm_p); + fprintf(stderr, "\n"); + return 1; + } + + if (vc.flag & VC_CALL) vc.flag |= VC_EM; + if ((vc.flag & VC_VCFIN) && (vc.flag & VC_BCFOUT) && vc.fn_dict == 0) { + fprintf(stderr, "[%s] For VCF->BCF conversion please specify the sequence dictionary with -D\n", __func__); + return 1; + } + if (vc.n1 <= 0) vc.n_perm = 0; // TODO: give a warning here! 
+ if (vc.n_perm > 0) { + seeds = malloc(vc.n_perm * sizeof(int)); + srand48(time(0)); + for (c = 0; c < vc.n_perm; ++c) seeds[c] = lrand48(); + } + b = calloc(1, sizeof(bcf1_t)); + blast = calloc(1, sizeof(bcf1_t)); + strcpy(moder, "r"); + if (!(vc.flag & VC_VCFIN)) strcat(moder, "b"); + strcpy(modew, "w"); + if (vc.flag & VC_BCFOUT) strcat(modew, "b"); + if (vc.flag & VC_UNCOMP) strcat(modew, "u"); + bp = vcf_open(argv[optind], moder); + hin = hout = vcf_hdr_read(bp); + if (vc.fn_dict && (vc.flag & VC_VCFIN)) + vcf_dictread(bp, hin, vc.fn_dict); + bout = vcf_open("-", modew); + if (!(vc.flag & VC_QCALL)) { + if (vc.n_sub) { + vc.sublist = calloc(vc.n_sub, sizeof(int)); + hout = bcf_hdr_subsam(hin, vc.n_sub, vc.subsam, vc.sublist); + } + write_header(hout); // always print the header + vcf_hdr_write(bout, hout); + } + if (vc.flag & VC_CALL) { + p1 = bcf_p1_init(hout->n_smpl, vc.ploidy); + if (vc.prior_file) { + if (bcf_p1_read_prior(p1, vc.prior_file) < 0) { + fprintf(stderr, "[%s] fail to read the prior AFS.\n", __func__); + return 1; + } + } else bcf_p1_init_prior(p1, vc.prior_type, vc.theta); + if (vc.n1 > 0 && vc.min_lrt > 0.) { // set n1 + bcf_p1_set_n1(p1, vc.n1); + bcf_p1_init_subprior(p1, vc.prior_type, vc.theta); + } + if (vc.indel_frac > 0.) bcf_p1_indel_prior(p1, vc.indel_frac); // otherwise use the default indel_frac + } + if (optind + 1 < argc && !(vc.flag&VC_VCFIN)) { + void *str2id = bcf_build_refhash(hout); + if (bcf_parse_region(str2id, argv[optind+1], &tid, &begin, &end) >= 0) { + bcf_idx_t *idx; + idx = bcf_idx_load(argv[optind]); + if (idx) { + uint64_t off; + off = bcf_idx_query(idx, tid, begin); + if (off == 0) { + fprintf(stderr, "[%s] no records in the query region.\n", __func__); + return 1; // FIXME: a lot of memory leaks... 
+ } + bgzf_seek(bp->fp, off, SEEK_SET); + bcf_idx_destroy(idx); + } + } + } + if (bcf_p1_fp_lk && p1) { + int32_t M = bcf_p1_get_M(p1); + gzwrite(bcf_p1_fp_lk, &M, 4); + } + while (vcf_read(bp, hin, b) > 0) { + int is_indel, cons_llr = -1; + int64_t cons_gt = -1; + double em[10]; + if ((vc.flag & VC_VARONLY) && strcmp(b->alt, "X") == 0) continue; + if ((vc.flag & VC_VARONLY) && vc.min_smpl_frac > 0.) { + extern int bcf_smpl_covered(const bcf1_t *b); + int n = bcf_smpl_covered(b); + if ((double)n / b->n_smpl < vc.min_smpl_frac) continue; + } + if (vc.n_sub) bcf_subsam(vc.n_sub, vc.sublist, b); + if (vc.flag & VC_FIX_PL) bcf_fix_pl(b); + is_indel = bcf_is_indel(b); + if ((vc.flag & VC_NO_INDEL) && is_indel) continue; + if ((vc.flag & VC_INDEL_ONLY) && !is_indel) continue; + if ((vc.flag & VC_ACGT_ONLY) && !is_indel) { + int x; + if (b->ref[0] == 0 || b->ref[1] != 0) continue; + x = toupper(b->ref[0]); + if (x != 'A' && x != 'C' && x != 'G' && x != 'T') continue; + } + if (vc.bed && !bed_overlap(vc.bed, hin->ns[b->tid], b->pos, b->pos + strlen(b->ref))) continue; + if (tid >= 0) { + int l = strlen(b->ref); + l = b->pos + (l > 0? 
l : 1); + if (b->tid != tid || b->pos >= end) break; + if (!(l > begin && end > b->pos)) continue; + } + ++n_processed; + if ((vc.flag & VC_QCNT) && !is_indel) { // summarize the difference + int x = bcf_min_diff(b); + if (x > 255) x = 255; + if (x >= 0) ++qcnt[x]; + } + if (vc.flag & VC_QCALL) { // output QCALL format; STOP here + bcf_2qcall(hout, b); + continue; + } + if (vc.trio_aux) // do trio calling + bcf_trio_call(vc.trio_aux, b, &cons_llr, &cons_gt); + else if (vc.flag & VC_PAIRCALL) + cons_llr = bcf_pair_call(b); + if (vc.flag & (VC_CALL|VC_ADJLD|VC_EM)) bcf_gl2pl(b); + if (vc.flag & VC_EM) bcf_em1(b, vc.n1, 0x1ff, em); + else { + int i; + for (i = 0; i < 9; ++i) em[i] = -1.; + } + if ( !(vc.flag&VC_KEEPALT) && (vc.flag&VC_CALL) && vc.min_ma_lrt>=0 ) + { + bcf_p1_set_ploidy(b, p1); // could be improved: do this per site to allow pseudo-autosomal regions + int gts = call_multiallelic_gt(b, p1, vc.min_ma_lrt, vc.flag&VC_VARONLY); + if ( gts<=1 && vc.flag & VC_VARONLY ) continue; + } + else if (vc.flag & VC_CALL) { // call variants + bcf_p1rst_t pr; + int calret; + gzwrite(bcf_p1_fp_lk, &b->tid, 4); + gzwrite(bcf_p1_fp_lk, &b->pos, 4); + gzwrite(bcf_p1_fp_lk, &em[0], sizeof(double)); + calret = bcf_p1_cal(b, (em[7] >= 0 && em[7] < vc.min_lrt), p1, &pr); + if (n_processed % 100000 == 0) { + fprintf(stderr, "[%s] %ld sites processed.\n", __func__, (long)n_processed); + bcf_p1_dump_afs(p1); + } + if (pr.p_ref >= vc.pref && (vc.flag & VC_VARONLY)) continue; + if (vc.n_perm && vc.n1 > 0 && pr.p_chi2 < vc.min_perm_p) { // permutation test + bcf_p1rst_t r; + int i, n = 0; + for (i = 0; i < vc.n_perm; ++i) { +#ifdef BCF_PERM_LRT // LRT based permutation is much faster but less robust to artifacts + double x[10]; + bcf_shuffle(b, seeds[i]); + bcf_em1(b, vc.n1, 1<<7, x); + if (x[7] < em[7]) ++n; +#else + bcf_shuffle(b, seeds[i]); + bcf_p1_cal(b, 1, p1, &r); + if (pr.p_chi2 >= r.p_chi2) ++n; +#endif + } + pr.perm_rank = n; + } + if (calret >= 0) update_bcf1(b, p1, &pr, 
vc.pref, vc.flag, em, cons_llr, cons_gt); + } else if (vc.flag & VC_EM) update_bcf1(b, 0, 0, 0, vc.flag, em, cons_llr, cons_gt); + if (vc.flag & VC_ADJLD) { // compute LD + double f[4], r2; + if ((r2 = bcf_pair_freq(blast, b, f)) >= 0) { + kstring_t s; + s.m = s.l = 0; s.s = 0; + if (*b->info) kputc(';', &s); + ksprintf(&s, "NEIR=%.3f;NEIF4=%.3f,%.3f,%.3f,%.3f", r2, f[0], f[1], f[2], f[3]); + bcf_append_info(b, s.s, s.l); + free(s.s); + } + bcf_cpy(blast, b); + } + if (vc.flag & VC_ANNO_MAX) bcf_anno_max(b); + if (vc.flag & VC_NO_GENO) { // do not output GENO fields + b->n_gi = 0; + b->fmt[0] = '\0'; + b->l_str = b->fmt - b->str + 1; + } else bcf_fix_gt(b); + vcf_write(bout, hout, b); + } + + if (bcf_p1_fp_lk) gzclose(bcf_p1_fp_lk); + if (vc.prior_file) free(vc.prior_file); + if (vc.flag & VC_CALL) bcf_p1_dump_afs(p1); + if (hin != hout) bcf_hdr_destroy(hout); + bcf_hdr_destroy(hin); + bcf_destroy(b); bcf_destroy(blast); + vcf_close(bp); vcf_close(bout); + if (vc.fn_dict) free(vc.fn_dict); + if (vc.ploidy) free(vc.ploidy); + if (vc.trio_aux) free(vc.trio_aux); + if (vc.n_sub) { + int i; + for (i = 0; i < vc.n_sub; ++i) free(vc.subsam[i]); + free(vc.subsam); free(vc.sublist); + } + if (vc.bed) bed_destroy(vc.bed); + if (vc.flag & VC_QCNT) + for (c = 0; c < 256; ++c) + fprintf(stderr, "QT\t%d\t%lld\n", c, (long long)qcnt[c]); + if (seeds) free(seeds); + if (p1) bcf_p1_destroy(p1); + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bcftools/em.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,310 @@ +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include "bcf.h" +#include "kmin.h" + +static double g_q2p[256]; + +#define ITER_MAX 50 +#define ITER_TRY 10 +#define EPS 1e-5 + +extern double kf_gammaq(double, double); + +/* + Generic routines + */ +// get the 3 genotype likelihoods +static double *get_pdg3(const bcf1_t *b) +{ + double *pdg; + const uint8_t *PL = 0; + int i, PL_len = 0; + // initialize g_q2p if necessary + if (g_q2p[0] == 0.) + for (i = 0; i < 256; ++i) + g_q2p[i] = pow(10., -i / 10.); + // set PL and PL_len + for (i = 0; i < b->n_gi; ++i) { + if (b->gi[i].fmt == bcf_str2int("PL", 2)) { + PL = (const uint8_t*)b->gi[i].data; + PL_len = b->gi[i].len; + break; + } + } + if (i == b->n_gi) return 0; // no PL + // fill pdg + pdg = malloc(3 * b->n_smpl * sizeof(double)); + for (i = 0; i < b->n_smpl; ++i) { + const uint8_t *pi = PL + i * PL_len; + double *p = pdg + i * 3; + p[0] = g_q2p[pi[2]]; p[1] = g_q2p[pi[1]]; p[2] = g_q2p[pi[0]]; + } + return pdg; +} + +// estimate site allele frequency in a very naive and inaccurate way +static double est_freq(int n, const double *pdg) +{ + int i, gcnt[3], tmp1; + // get a rough estimate of the genotype frequency + gcnt[0] = gcnt[1] = gcnt[2] = 0; + for (i = 0; i < n; ++i) { + const double *p = pdg + i * 3; + if (p[0] != 1. || p[1] != 1. || p[2] != 1.) { + int which = p[0] > p[1]? 0 : 1; + which = p[which] > p[2]? which : 2; + ++gcnt[which]; + } + } + tmp1 = gcnt[0] + gcnt[1] + gcnt[2]; + return (tmp1 == 0)? 
-1.0 : (.5 * gcnt[1] + gcnt[2]) / tmp1; +} + +/* + Single-locus EM + */ + +typedef struct { + int beg, end; + const double *pdg; +} minaux1_t; + +static double prob1(double f, void *data) +{ + minaux1_t *a = (minaux1_t*)data; + double p = 1., l = 0., f3[3]; + int i; +// printf("brent %lg\n", f); + if (f < 0 || f > 1) return 1e300; + f3[0] = (1.-f)*(1.-f); f3[1] = 2.*f*(1.-f); f3[2] = f*f; + for (i = a->beg; i < a->end; ++i) { + const double *pdg = a->pdg + i * 3; + p *= pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]; + if (p < 1e-200) l -= log(p), p = 1.; + } + return l - log(p); +} + +// one EM iteration for allele frequency estimate +static double freq_iter(double *f, const double *_pdg, int beg, int end) +{ + double f0 = *f, f3[3], err; + int i; +// printf("em %lg\n", *f); + f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; + for (i = beg, f0 = 0.; i < end; ++i) { + const double *pdg = _pdg + i * 3; + f0 += (pdg[1] * f3[1] + 2. * pdg[2] * f3[2]) + / (pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]); + } + f0 /= (end - beg) * 2; + err = fabs(f0 - *f); + *f = f0; + return err; +} + +/* The following function combines EM and Brent's method. When the signal from + * the data is strong, EM is faster but sometimes, EM may converge very slowly. + * When this happens, we switch to Brent's method. The idea is learned from + * Rasmus Nielsen. + */ +static double freqml(double f0, int beg, int end, const double *pdg) +{ + int i; + double f; + for (i = 0, f = f0; i < ITER_TRY; ++i) + if (freq_iter(&f, pdg, beg, end) < EPS) break; + if (i == ITER_TRY) { // haven't converged yet; try Brent's method + minaux1_t a; + a.beg = beg; a.end = end; a.pdg = pdg; + kmin_brent(prob1, f0 == f? 
.5*f0 : f0, f, (void*)&a, EPS, &f); + } + return f; +} + +// one EM iteration for genotype frequency estimate +static double g3_iter(double g[3], const double *_pdg, int beg, int end) +{ + double err, gg[3]; + int i; + gg[0] = gg[1] = gg[2] = 0.; +// printf("%lg,%lg,%lg\n", g[0], g[1], g[2]); + for (i = beg; i < end; ++i) { + double sum, tmp[3]; + const double *pdg = _pdg + i * 3; + tmp[0] = pdg[0] * g[0]; tmp[1] = pdg[1] * g[1]; tmp[2] = pdg[2] * g[2]; + sum = (tmp[0] + tmp[1] + tmp[2]) * (end - beg); + gg[0] += tmp[0] / sum; gg[1] += tmp[1] / sum; gg[2] += tmp[2] / sum; + } + err = fabs(gg[0] - g[0]) > fabs(gg[1] - g[1])? fabs(gg[0] - g[0]) : fabs(gg[1] - g[1]); + err = err > fabs(gg[2] - g[2])? err : fabs(gg[2] - g[2]); + g[0] = gg[0]; g[1] = gg[1]; g[2] = gg[2]; + return err; +} + +// perform likelihood ratio test +static double lk_ratio_test(int n, int n1, const double *pdg, double f3[3][3]) +{ + double r; + int i; + for (i = 0, r = 1.; i < n1; ++i) { + const double *p = pdg + i * 3; + r *= (p[0] * f3[1][0] + p[1] * f3[1][1] + p[2] * f3[1][2]) + / (p[0] * f3[0][0] + p[1] * f3[0][1] + p[2] * f3[0][2]); + } + for (; i < n; ++i) { + const double *p = pdg + i * 3; + r *= (p[0] * f3[2][0] + p[1] * f3[2][1] + p[2] * f3[2][2]) + / (p[0] * f3[0][0] + p[1] * f3[0][1] + p[2] * f3[0][2]); + } + return r; +} + +// x[0]: ref frequency +// x[1..3]: alt-alt, alt-ref, ref-ref frequenc +// x[4]: HWE P-value +// x[5..6]: group1 freq, group2 freq +// x[7]: 1-degree P-value +// x[8]: 2-degree P-value +int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10]) +{ + double *pdg; + int i, n, n2; + if (b->n_alleles < 2) return -1; // one allele only + // initialization + if (n1 < 0 || n1 > b->n_smpl) n1 = 0; + if (flag & 1<<7) flag |= 7<<5; // compute group freq if LRT is required + if (flag & 0xf<<1) flag |= 0xf<<1; + n = b->n_smpl; n2 = n - n1; + pdg = get_pdg3(b); + if (pdg == 0) return -1; + for (i = 0; i < 10; ++i) x[i] = -1.; // set to negative + { + if ((x[0] = est_freq(n, 
pdg)) < 0.) { + free(pdg); + return -1; // no data + } + x[0] = freqml(x[0], 0, n, pdg); + } + if (flag & (0xf<<1|3<<8)) { // estimate the genotype frequency and test HWE + double *g = x + 1, f3[3], r; + f3[0] = g[0] = (1 - x[0]) * (1 - x[0]); + f3[1] = g[1] = 2 * x[0] * (1 - x[0]); + f3[2] = g[2] = x[0] * x[0]; + for (i = 0; i < ITER_MAX; ++i) + if (g3_iter(g, pdg, 0, n) < EPS) break; + // Hardy-Weinberg equilibrium (HWE) + for (i = 0, r = 1.; i < n; ++i) { + double *p = pdg + i * 3; + r *= (p[0] * g[0] + p[1] * g[1] + p[2] * g[2]) / (p[0] * f3[0] + p[1] * f3[1] + p[2] * f3[2]); + } + x[4] = kf_gammaq(.5, log(r)); + } + if ((flag & 7<<5) && n1 > 0 && n1 < n) { // group frequency + x[5] = freqml(x[0], 0, n1, pdg); + x[6] = freqml(x[0], n1, n, pdg); + } + if ((flag & 1<<7) && n1 > 0 && n1 < n) { // 1-degree P-value + double f[3], f3[3][3], tmp; + f[0] = x[0]; f[1] = x[5]; f[2] = x[6]; + for (i = 0; i < 3; ++i) + f3[i][0] = (1-f[i])*(1-f[i]), f3[i][1] = 2*f[i]*(1-f[i]), f3[i][2] = f[i]*f[i]; + tmp = log(lk_ratio_test(n, n1, pdg, f3)); + if (tmp < 0) tmp = 0; + x[7] = kf_gammaq(.5, tmp); + } + if ((flag & 3<<8) && n1 > 0 && n1 < n) { // 2-degree P-value + double g[3][3], tmp; + for (i = 0; i < 3; ++i) memcpy(g[i], x + 1, 3 * sizeof(double)); + for (i = 0; i < ITER_MAX; ++i) + if (g3_iter(g[1], pdg, 0, n1) < EPS) break; + for (i = 0; i < ITER_MAX; ++i) + if (g3_iter(g[2], pdg, n1, n) < EPS) break; + tmp = log(lk_ratio_test(n, n1, pdg, g)); + if (tmp < 0) tmp = 0; + x[8] = kf_gammaq(1., tmp); + } + // free + free(pdg); + return 0; +} + +/* + Two-locus EM (LD) + */ + +#define _G1(h, k) ((h>>1&1) + (k>>1&1)) +#define _G2(h, k) ((h&1) + (k&1)) + +// 0: the previous site; 1: the current site +static int pair_freq_iter(int n, double *pdg[2], double f[4]) +{ + double ff[4]; + int i, k, h; +// printf("%lf,%lf,%lf,%lf\n", f[0], f[1], f[2], f[3]); + memset(ff, 0, 4 * sizeof(double)); + for (i = 0; i < n; ++i) { + double *p[2], sum, tmp; + p[0] = pdg[0] + i * 3; p[1] = pdg[1] + i 
* 3; + for (k = 0, sum = 0.; k < 4; ++k) + for (h = 0; h < 4; ++h) + sum += f[k] * f[h] * p[0][_G1(k,h)] * p[1][_G2(k,h)]; + for (k = 0; k < 4; ++k) { + tmp = f[0] * (p[0][_G1(0,k)] * p[1][_G2(0,k)] + p[0][_G1(k,0)] * p[1][_G2(k,0)]) + + f[1] * (p[0][_G1(1,k)] * p[1][_G2(1,k)] + p[0][_G1(k,1)] * p[1][_G2(k,1)]) + + f[2] * (p[0][_G1(2,k)] * p[1][_G2(2,k)] + p[0][_G1(k,2)] * p[1][_G2(k,2)]) + + f[3] * (p[0][_G1(3,k)] * p[1][_G2(3,k)] + p[0][_G1(k,3)] * p[1][_G2(k,3)]); + ff[k] += f[k] * tmp / sum; + } + } + for (k = 0; k < 4; ++k) f[k] = ff[k] / (2 * n); + return 0; +} + +double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]) +{ + const bcf1_t *b[2]; + int i, j, n_smpl; + double *pdg[2], flast[4], r, f0[2]; + // initialize others + if (b0->n_smpl != b1->n_smpl) return -1; // different number of samples + n_smpl = b0->n_smpl; + b[0] = b0; b[1] = b1; + f[0] = f[1] = f[2] = f[3] = -1.; + if (b[0]->n_alleles < 2 || b[1]->n_alleles < 2) return -1; // one allele only + pdg[0] = get_pdg3(b0); pdg[1] = get_pdg3(b1); + if (pdg[0] == 0 || pdg[1] == 0) { + free(pdg[0]); free(pdg[1]); + return -1; + } + // set the initial value + f0[0] = est_freq(n_smpl, pdg[0]); + f0[1] = est_freq(n_smpl, pdg[1]); + f[0] = (1 - f0[0]) * (1 - f0[1]); f[3] = f0[0] * f0[1]; + f[1] = (1 - f0[0]) * f0[1]; f[2] = f0[0] * (1 - f0[1]); + // iteration + for (j = 0; j < ITER_MAX; ++j) { + double eps = 0; + memcpy(flast, f, 4 * sizeof(double)); + pair_freq_iter(n_smpl, pdg, f); + for (i = 0; i < 4; ++i) { + double x = fabs(f[i] - flast[i]); + if (x > eps) eps = x; + } + if (eps < EPS) break; + } + // free + free(pdg[0]); free(pdg[1]); + { // calculate r^2 + double p[2], q[2], D; + p[0] = f[0] + f[1]; q[0] = 1 - p[0]; + p[1] = f[0] + f[2]; q[1] = 1 - p[1]; + D = f[0] * f[3] - f[1] * f[2]; + r = sqrt(D * D / (p[0] * p[1] * q[0] * q[1])); +// printf("R(%lf,%lf,%lf,%lf)=%lf\n", f[0], f[1], f[2], f[3], r); + if (isnan(r)) r = -1.; + } + return r; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bcftools/fet.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,112 @@ +#include <math.h> +#include <stdlib.h> + +/* This program is implemented with ideas from this web page: + * + * http://www.langsrud.com/fisher.htm + */ + +// log\binom{n}{k} +static double lbinom(int n, int k) +{ + if (k == 0 || n == k) return 0; + return lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1); +} + +// n11 n12 | n1_ +// n21 n22 | n2_ +//-----------+---- +// n_1 n_2 | n + +// hypergeometric distribution +static double hypergeo(int n11, int n1_, int n_1, int n) +{ + return exp(lbinom(n1_, n11) + lbinom(n-n1_, n_1-n11) - lbinom(n, n_1)); +} + +typedef struct { + int n11, n1_, n_1, n; + double p; +} hgacc_t; + +// incremental version of hypergenometric distribution +static double hypergeo_acc(int n11, int n1_, int n_1, int n, hgacc_t *aux) +{ + if (n1_ || n_1 || n) { + aux->n11 = n11; aux->n1_ = n1_; aux->n_1 = n_1; aux->n = n; + } else { // then only n11 changed; the rest fixed + if (n11%11 && n11 + aux->n - aux->n1_ - aux->n_1) { + if (n11 == aux->n11 + 1) { // incremental + aux->p *= (double)(aux->n1_ - aux->n11) / n11 + * (aux->n_1 - aux->n11) / (n11 + aux->n - aux->n1_ - aux->n_1); + aux->n11 = n11; + return aux->p; + } + if (n11 == aux->n11 - 1) { // incremental + aux->p *= (double)aux->n11 / (aux->n1_ - n11) + * (aux->n11 + aux->n - aux->n1_ - aux->n_1) / (aux->n_1 - n11); + aux->n11 = n11; + return aux->p; + } + } + aux->n11 = n11; + } + aux->p = hypergeo(aux->n11, aux->n1_, aux->n_1, aux->n); + return aux->p; +} + +double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two) +{ + int i, j, max, min; + double p, q, left, right; + hgacc_t aux; + int n1_, n_1, n; + + n1_ = n11 + n12; n_1 = n11 + n21; n = n11 + n12 + n21 + n22; // calculate n1_, n_1 and n + max = (n_1 < n1_) ? 
n_1 : n1_; // max n11, for right tail + min = n1_ + n_1 - n; + if (min < 0) min = 0; // min n11, for left tail + *two = *_left = *_right = 1.; + if (min == max) return 1.; // no need to do test + q = hypergeo_acc(n11, n1_, n_1, n, &aux); // the probability of the current table + // left tail + p = hypergeo_acc(min, 0, 0, 0, &aux); + for (left = 0., i = min + 1; p < 0.99999999 * q; ++i) // loop until underflow + left += p, p = hypergeo_acc(i, 0, 0, 0, &aux); + --i; + if (p < 1.00000001 * q) left += p; + else --i; + // right tail + p = hypergeo_acc(max, 0, 0, 0, &aux); + for (right = 0., j = max - 1; p < 0.99999999 * q; --j) // loop until underflow + right += p, p = hypergeo_acc(j, 0, 0, 0, &aux); + ++j; + if (p < 1.00000001 * q) right += p; + else ++j; + // two-tail + *two = left + right; + if (*two > 1.) *two = 1.; + // adjust left and right + if (abs(i - n11) < abs(j - n11)) right = 1. - left + q; + else left = 1.0 - right + q; + *_left = left; *_right = right; + return q; +} + +#ifdef FET_MAIN +#include <stdio.h> + +int main(int argc, char *argv[]) +{ + char id[1024]; + int n11, n12, n21, n22; + double left, right, twotail, prob; + + while (scanf("%s%d%d%d%d", id, &n11, &n12, &n21, &n22) == 5) { + prob = kt_fisher_exact(n11, n12, n21, n22, &left, &right, &twotail); + printf("%s\t%d\t%d\t%d\t%d\t%.6g\t%.6g\t%.6g\t%.6g\n", id, n11, n12, n21, n22, + prob, left, right, twotail); + } + return 0; +} +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bcftools/index.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,336 @@ +#include <assert.h> +#include <ctype.h> +#include <sys/stat.h> +#include "bam_endian.h" +#include "kstring.h" +#include "bcf.h" +#ifdef _USE_KNETFILE +#include "knetfile.h" +#endif + +#define TAD_LIDX_SHIFT 13 + +typedef struct { + int32_t n, m; + uint64_t *offset; +} bcf_lidx_t; + +struct __bcf_idx_t { + int32_t n; + bcf_lidx_t *index2; +}; + +/************ + * indexing * + ************/ + +static inline void insert_offset2(bcf_lidx_t *index2, int _beg, int _end, uint64_t offset) +{ + int i, beg, end; + beg = _beg >> TAD_LIDX_SHIFT; + end = (_end - 1) >> TAD_LIDX_SHIFT; + if (index2->m < end + 1) { + int old_m = index2->m; + index2->m = end + 1; + kroundup32(index2->m); + index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8); + memset(index2->offset + old_m, 0, 8 * (index2->m - old_m)); + } + if (beg == end) { + if (index2->offset[beg] == 0) index2->offset[beg] = offset; + } else { + for (i = beg; i <= end; ++i) + if (index2->offset[i] == 0) index2->offset[i] = offset; + } + if (index2->n < end + 1) index2->n = end + 1; +} + +bcf_idx_t *bcf_idx_core(bcf_t *bp, bcf_hdr_t *h) +{ + bcf_idx_t *idx; + int32_t last_coor, last_tid; + uint64_t last_off; + kstring_t *str; + BGZF *fp = bp->fp; + bcf1_t *b; + int ret; + + b = calloc(1, sizeof(bcf1_t)); + str = calloc(1, sizeof(kstring_t)); + idx = (bcf_idx_t*)calloc(1, sizeof(bcf_idx_t)); + idx->n = h->n_ref; + idx->index2 = calloc(h->n_ref, sizeof(bcf_lidx_t)); + + last_tid = 0xffffffffu; + last_off = bgzf_tell(fp); last_coor = 0xffffffffu; + while ((ret = bcf_read(bp, h, b)) > 0) { + int end, tmp; + if (last_tid != b->tid) { // change of chromosomes + last_tid = b->tid; + } else if (last_coor > b->pos) { + fprintf(stderr, "[bcf_idx_core] the input is out of order\n"); + free(str->s); free(str); free(idx); bcf_destroy(b); + return 0; + } + tmp = strlen(b->ref); + 
end = b->pos + (tmp > 0? tmp : 1); + insert_offset2(&idx->index2[b->tid], b->pos, end, last_off); + last_off = bgzf_tell(fp); + last_coor = b->pos; + } + free(str->s); free(str); bcf_destroy(b); + return idx; +} + +void bcf_idx_destroy(bcf_idx_t *idx) +{ + int i; + if (idx == 0) return; + for (i = 0; i < idx->n; ++i) free(idx->index2[i].offset); + free(idx->index2); + free(idx); +} + +/****************** + * index file I/O * + ******************/ + +void bcf_idx_save(const bcf_idx_t *idx, BGZF *fp) +{ + int32_t i, ti_is_be; + ti_is_be = bam_is_big_endian(); + bgzf_write(fp, "BCI\4", 4); + if (ti_is_be) { + uint32_t x = idx->n; + bgzf_write(fp, bam_swap_endian_4p(&x), 4); + } else bgzf_write(fp, &idx->n, 4); + for (i = 0; i < idx->n; ++i) { + bcf_lidx_t *index2 = idx->index2 + i; + // write linear index (index2) + if (ti_is_be) { + int x = index2->n; + bgzf_write(fp, bam_swap_endian_4p(&x), 4); + } else bgzf_write(fp, &index2->n, 4); + if (ti_is_be) { // big endian + int x; + for (x = 0; (int)x < index2->n; ++x) + bam_swap_endian_8p(&index2->offset[x]); + bgzf_write(fp, index2->offset, 8 * index2->n); + for (x = 0; (int)x < index2->n; ++x) + bam_swap_endian_8p(&index2->offset[x]); + } else bgzf_write(fp, index2->offset, 8 * index2->n); + } +} + +static bcf_idx_t *bcf_idx_load_core(BGZF *fp) +{ + int i, ti_is_be; + char magic[4]; + bcf_idx_t *idx; + ti_is_be = bam_is_big_endian(); + if (fp == 0) { + fprintf(stderr, "[%s] fail to load index.\n", __func__); + return 0; + } + bgzf_read(fp, magic, 4); + if (strncmp(magic, "BCI\4", 4)) { + fprintf(stderr, "[%s] wrong magic number.\n", __func__); + return 0; + } + idx = (bcf_idx_t*)calloc(1, sizeof(bcf_idx_t)); + bgzf_read(fp, &idx->n, 4); + if (ti_is_be) bam_swap_endian_4p(&idx->n); + idx->index2 = (bcf_lidx_t*)calloc(idx->n, sizeof(bcf_lidx_t)); + for (i = 0; i < idx->n; ++i) { + bcf_lidx_t *index2 = idx->index2 + i; + int j; + bgzf_read(fp, &index2->n, 4); + if (ti_is_be) bam_swap_endian_4p(&index2->n); + index2->m = 
index2->n; + index2->offset = (uint64_t*)calloc(index2->m, 8); + bgzf_read(fp, index2->offset, index2->n * 8); + if (ti_is_be) + for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]); + } + return idx; +} + +bcf_idx_t *bcf_idx_load_local(const char *fnidx) +{ + BGZF *fp; + fp = bgzf_open(fnidx, "r"); + if (fp) { + bcf_idx_t *idx = bcf_idx_load_core(fp); + bgzf_close(fp); + return idx; + } else return 0; +} + +#ifdef _USE_KNETFILE +static void download_from_remote(const char *url) +{ + const int buf_size = 1 * 1024 * 1024; + char *fn; + FILE *fp; + uint8_t *buf; + knetFile *fp_remote; + int l; + if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return; + l = strlen(url); + for (fn = (char*)url + l - 1; fn >= url; --fn) + if (*fn == '/') break; + ++fn; // fn now points to the file name + fp_remote = knet_open(url, "r"); + if (fp_remote == 0) { + fprintf(stderr, "[download_from_remote] fail to open remote file.\n"); + return; + } + if ((fp = fopen(fn, "w")) == 0) { + fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n"); + knet_close(fp_remote); + return; + } + buf = (uint8_t*)calloc(buf_size, 1); + while ((l = knet_read(fp_remote, buf, buf_size)) != 0) + fwrite(buf, 1, l, fp); + free(buf); + fclose(fp); + knet_close(fp_remote); +} +#else +static void download_from_remote(const char *url) +{ + return; +} +#endif + +static char *get_local_version(const char *fn) +{ + struct stat sbuf; + char *fnidx = (char*)calloc(strlen(fn) + 5, 1); + strcat(strcpy(fnidx, fn), ".bci"); + if ((strstr(fnidx, "ftp://") == fnidx || strstr(fnidx, "http://") == fnidx)) { + char *p, *url; + int l = strlen(fnidx); + for (p = fnidx + l - 1; p >= fnidx; --p) + if (*p == '/') break; + url = fnidx; fnidx = strdup(p + 1); + if (stat(fnidx, &sbuf) == 0) { + free(url); + return fnidx; + } + fprintf(stderr, "[%s] downloading the index file...\n", __func__); + download_from_remote(url); + free(url); + } + if (stat(fnidx, &sbuf) == 
0) return fnidx; + free(fnidx); return 0; +} + +bcf_idx_t *bcf_idx_load(const char *fn) +{ + bcf_idx_t *idx; + char *fname = get_local_version(fn); + if (fname == 0) return 0; + idx = bcf_idx_load_local(fname); + free(fname); + return idx; +} + +int bcf_idx_build2(const char *fn, const char *_fnidx) +{ + char *fnidx; + BGZF *fpidx; + bcf_t *bp; + bcf_idx_t *idx; + bcf_hdr_t *h; + if ((bp = bcf_open(fn, "r")) == 0) { + fprintf(stderr, "[bcf_idx_build2] fail to open the BAM file.\n"); + return -1; + } + h = bcf_hdr_read(bp); + idx = bcf_idx_core(bp, h); + bcf_close(bp); + if (_fnidx == 0) { + fnidx = (char*)calloc(strlen(fn) + 5, 1); + strcpy(fnidx, fn); strcat(fnidx, ".bci"); + } else fnidx = strdup(_fnidx); + fpidx = bgzf_open(fnidx, "w"); + if (fpidx == 0) { + fprintf(stderr, "[bcf_idx_build2] fail to create the index file.\n"); + free(fnidx); + bcf_idx_destroy(idx); + return -1; + } + bcf_idx_save(idx, fpidx); + bcf_idx_destroy(idx); + bgzf_close(fpidx); + free(fnidx); + return 0; +} + +int bcf_idx_build(const char *fn) +{ + return bcf_idx_build2(fn, 0); +} + +/******************************************** + * parse a region in the format chr:beg-end * + ********************************************/ + +int bcf_parse_region(void *str2id, const char *str, int *tid, int *begin, int *end) +{ + char *s, *p; + int i, l, k; + l = strlen(str); + p = s = (char*)malloc(l+1); + /* squeeze out "," */ + for (i = k = 0; i != l; ++i) + if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i]; + s[k] = 0; + for (i = 0; i != k; ++i) if (s[i] == ':') break; + s[i] = 0; + if ((*tid = bcf_str2id(str2id, s)) < 0) { + free(s); + return -1; + } + if (i == k) { /* dump the whole sequence */ + *begin = 0; *end = 1<<29; free(s); + return 0; + } + for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; + *begin = atoi(p); + if (i < k) { + p = s + i + 1; + *end = atoi(p); + } else *end = 1<<29; + if (*begin > 0) --*begin; + free(s); + if (*begin > *end) return -1; + return 0; +} + 
+/******************************* + * retrieve a specified region * + *******************************/ + +uint64_t bcf_idx_query(const bcf_idx_t *idx, int tid, int beg) +{ + uint64_t min_off, *offset; + int i; + if (beg < 0) beg = 0; + offset = idx->index2[tid].offset; + for (i = beg>>TAD_LIDX_SHIFT; i < idx->index2[tid].n && offset[i] == 0; ++i); + min_off = (i == idx->index2[tid].n)? offset[idx->index2[tid].n-1] : offset[i]; + return min_off; +} + +int bcf_main_index(int argc, char *argv[]) +{ + if (argc == 1) { + fprintf(stderr, "Usage: bcftools index <in.bcf>\n"); + return 1; + } + bcf_idx_build(argv[1]); + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bcftools/kfunc.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,162 @@ +#include <math.h> + + +/* Log gamma function + * \log{\Gamma(z)} + * AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245 + */ +double kf_lgamma(double z) +{ + double x = 0; + x += 0.1659470187408462e-06 / (z+7); + x += 0.9934937113930748e-05 / (z+6); + x -= 0.1385710331296526 / (z+5); + x += 12.50734324009056 / (z+4); + x -= 176.6150291498386 / (z+3); + x += 771.3234287757674 / (z+2); + x -= 1259.139216722289 / (z+1); + x += 676.5203681218835 / z; + x += 0.9999999999995183; + return log(x) - 5.58106146679532777 - z + (z-0.5) * log(z+6.5); +} + +/* complementary error function + * \frac{2}{\sqrt{\pi}} \int_x^{\infty} e^{-t^2} dt + * AS66, 2nd algorithm, http://lib.stat.cmu.edu/apstat/66 + */ +double kf_erfc(double x) +{ + const double p0 = 220.2068679123761; + const double p1 = 221.2135961699311; + const double p2 = 112.0792914978709; + const double p3 = 33.912866078383; + const double p4 = 6.37396220353165; + const double p5 = .7003830644436881; + const double p6 = .03526249659989109; + const double q0 = 440.4137358247522; + const double q1 = 793.8265125199484; + const double q2 = 637.3336333788311; + const double q3 = 296.5642487796737; + const double q4 = 86.78073220294608; + const double q5 = 16.06417757920695; + const double q6 = 1.755667163182642; + const double q7 = .08838834764831844; + double expntl, z, p; + z = fabs(x) * M_SQRT2; + if (z > 37.) return x > 0.? 0. : 2.; + expntl = exp(z * z * - .5); + if (z < 10. / M_SQRT2) // for small z + p = expntl * ((((((p6 * z + p5) * z + p4) * z + p3) * z + p2) * z + p1) * z + p0) + / (((((((q7 * z + q6) * z + q5) * z + q4) * z + q3) * z + q2) * z + q1) * z + q0); + else p = expntl / 2.506628274631001 / (z + 1. / (z + 2. / (z + 3. / (z + 4. / (z + .65))))); + return x > 0.? 2. * p : 2. * (1. 
- p); +} + +/* The following computes regularized incomplete gamma functions. + * Formulas are taken from Wiki, with additional input from Numerical + * Recipes in C (for modified Lentz's algorithm) and AS245 + * (http://lib.stat.cmu.edu/apstat/245). + * + * A good online calculator is available at: + * + * http://www.danielsoper.com/statcalc/calc23.aspx + * + * It calculates upper incomplete gamma function, which equals + * kf_gammaq(s,z)*tgamma(s). + */ + +#define KF_GAMMA_EPS 1e-14 +#define KF_TINY 1e-290 + +// regularized lower incomplete gamma function, by series expansion +static double _kf_gammap(double s, double z) +{ + double sum, x; + int k; + for (k = 1, sum = x = 1.; k < 100; ++k) { + sum += (x *= z / (s + k)); + if (x / sum < KF_GAMMA_EPS) break; + } + return exp(s * log(z) - z - kf_lgamma(s + 1.) + log(sum)); +} +// regularized upper incomplete gamma function, by continued fraction +static double _kf_gammaq(double s, double z) +{ + int j; + double C, D, f; + f = 1. + z - s; C = f; D = 0.; + // Modified Lentz's algorithm for computing continued fraction + // See Numerical Recipes in C, 2nd edition, section 5.2 + for (j = 1; j < 100; ++j) { + double a = j * (s - j), b = (j<<1) + 1 + z - s, d; + D = b + a * D; + if (D < KF_TINY) D = KF_TINY; + C = b + a / C; + if (C < KF_TINY) C = KF_TINY; + D = 1. / D; + d = C * D; + f *= d; + if (fabs(d - 1.) < KF_GAMMA_EPS) break; + } + return exp(s * log(z) - z - kf_lgamma(s) - log(f)); +} + +double kf_gammap(double s, double z) +{ + return z <= 1. || z < s? _kf_gammap(s, z) : 1. - _kf_gammaq(s, z); +} + +double kf_gammaq(double s, double z) +{ + return z <= 1. || z < s? 1. - _kf_gammap(s, z) : _kf_gammaq(s, z); +} + +/* Regularized incomplete beta function. The method is taken from + * Numerical Recipe in C, 2nd edition, section 6.4. 
The following web + * page calculates the incomplete beta function, which equals + * kf_betai(a,b,x) * gamma(a) * gamma(b) / gamma(a+b): + * + * http://www.danielsoper.com/statcalc/calc36.aspx + */ +static double kf_betai_aux(double a, double b, double x) +{ + double C, D, f; + int j; + if (x == 0.) return 0.; + if (x == 1.) return 1.; + f = 1.; C = f; D = 0.; + // Modified Lentz's algorithm for computing continued fraction + for (j = 1; j < 200; ++j) { + double aa, d; + int m = j>>1; + aa = (j&1)? -(a + m) * (a + b + m) * x / ((a + 2*m) * (a + 2*m + 1)) + : m * (b - m) * x / ((a + 2*m - 1) * (a + 2*m)); + D = 1. + aa * D; + if (D < KF_TINY) D = KF_TINY; + C = 1. + aa / C; + if (C < KF_TINY) C = KF_TINY; + D = 1. / D; + d = C * D; + f *= d; + if (fabs(d - 1.) < KF_GAMMA_EPS) break; + } + return exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b) + a * log(x) + b * log(1.-x)) / a / f; +} +double kf_betai(double a, double b, double x) +{ + return x < (a + 1.) / (a + b + 2.)? kf_betai_aux(a, b, x) : 1. - kf_betai_aux(b, a, 1. - x); +} + +#ifdef KF_MAIN +#include <stdio.h> +int main(int argc, char *argv[]) +{ + double x = 5.5, y = 3; + double a, b; + printf("erfc(%lg): %lg, %lg\n", x, erfc(x), kf_erfc(x)); + printf("upper-gamma(%lg,%lg): %lg\n", x, y, kf_gammaq(y, x)*tgamma(y)); + a = 2; b = 2; x = 0.5; + printf("incomplete-beta(%lg,%lg,%lg): %lg\n", a, b, x, kf_betai(a, b, x) / exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b))); + return 0; +} +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bcftools/kmin.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,209 @@ +/* The MIT License + + Copyright (c) 2008, 2010 by Attractive Chaos <attractor@live.co.uk> + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Hooke-Jeeves algorithm for nonlinear minimization + + Based on the pseudocodes by Bell and Pike (CACM 9(9):684-685), and + the revision by Tomlin and Smith (CACM 12(11):637-638). Both of the + papers are comments on Kaupe's Algorithm 178 "Direct Search" (ACM + 6(6):313-314). The original algorithm was designed by Hooke and + Jeeves (ACM 8:212-229). This program is further revised according to + Johnson's implementation at Netlib (opt/hooke.c). + + Hooke-Jeeves algorithm is very simple and it works quite well on a + few examples. However, it might fail to converge due to its heuristic + nature. 
A possible improvement, as is suggested by Johnson, may be to + choose a small r at the beginning to quickly approach to the minimum + and a large r at later step to hit the minimum. + */ + +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include "kmin.h" + +static double __kmin_hj_aux(kmin_f func, int n, double *x1, void *data, double fx1, double *dx, int *n_calls) +{ + int k, j = *n_calls; + double ftmp; + for (k = 0; k != n; ++k) { + x1[k] += dx[k]; + ftmp = func(n, x1, data); ++j; + if (ftmp < fx1) fx1 = ftmp; + else { /* search the opposite direction */ + dx[k] = 0.0 - dx[k]; + x1[k] += dx[k] + dx[k]; + ftmp = func(n, x1, data); ++j; + if (ftmp < fx1) fx1 = ftmp; + else x1[k] -= dx[k]; /* back to the original x[k] */ + } + } + *n_calls = j; + return fx1; /* here: fx1=f(n,x1) */ +} + +double kmin_hj(kmin_f func, int n, double *x, void *data, double r, double eps, int max_calls) +{ + double fx, fx1, *x1, *dx, radius; + int k, n_calls = 0; + x1 = (double*)calloc(n, sizeof(double)); + dx = (double*)calloc(n, sizeof(double)); + for (k = 0; k != n; ++k) { /* initial directions, based on MGJ */ + dx[k] = fabs(x[k]) * r; + if (dx[k] == 0) dx[k] = r; + } + radius = r; + fx1 = fx = func(n, x, data); ++n_calls; + for (;;) { + memcpy(x1, x, n * sizeof(double)); /* x1 = x */ + fx1 = __kmin_hj_aux(func, n, x1, data, fx, dx, &n_calls); + while (fx1 < fx) { + for (k = 0; k != n; ++k) { + double t = x[k]; + dx[k] = x1[k] > x[k]? 
fabs(dx[k]) : 0.0 - fabs(dx[k]); + x[k] = x1[k]; + x1[k] = x1[k] + x1[k] - t; + } + fx = fx1; + if (n_calls >= max_calls) break; + fx1 = func(n, x1, data); ++n_calls; + fx1 = __kmin_hj_aux(func, n, x1, data, fx1, dx, &n_calls); + if (fx1 >= fx) break; + for (k = 0; k != n; ++k) + if (fabs(x1[k] - x[k]) > .5 * fabs(dx[k])) break; + if (k == n) break; + } + if (radius >= eps) { + if (n_calls >= max_calls) break; + radius *= r; + for (k = 0; k != n; ++k) dx[k] *= r; + } else break; /* converge */ + } + free(x1); free(dx); + return fx1; +} + +// I copied this function somewhere several years ago with some of my modifications, but I forgot the source. +double kmin_brent(kmin1_f func, double a, double b, void *data, double tol, double *xmin) +{ + double bound, u, r, q, fu, tmp, fa, fb, fc, c; + const double gold1 = 1.6180339887; + const double gold2 = 0.3819660113; + const double tiny = 1e-20; + const int max_iter = 100; + + double e, d, w, v, mid, tol1, tol2, p, eold, fv, fw; + int iter; + + fa = func(a, data); fb = func(b, data); + if (fb > fa) { // swap, such that f(a) > f(b) + tmp = a; a = b; b = tmp; + tmp = fa; fa = fb; fb = tmp; + } + c = b + gold1 * (b - a), fc = func(c, data); // golden section extrapolation + while (fb > fc) { + bound = b + 100.0 * (c - b); // the farthest point where we want to go + r = (b - a) * (fb - fc); + q = (b - c) * (fb - fa); + if (fabs(q - r) < tiny) { // avoid 0 denominator + tmp = q > r? 
tiny : 0.0 - tiny; + } else tmp = q - r; + u = b - ((b - c) * q - (b - a) * r) / (2.0 * tmp); // u is the parabolic extrapolation point + if ((b > u && u > c) || (b < u && u < c)) { // u lies between b and c + fu = func(u, data); + if (fu < fc) { // (b,u,c) bracket the minimum + a = b; b = u; fa = fb; fb = fu; + break; + } else if (fu > fb) { // (a,b,u) bracket the minimum + c = u; fc = fu; + break; + } + u = c + gold1 * (c - b); fu = func(u, data); // golden section extrapolation + } else if ((c > u && u > bound) || (c < u && u < bound)) { // u lies between c and bound + fu = func(u, data); + if (fu < fc) { // fb > fc > fu + b = c; c = u; u = c + gold1 * (c - b); + fb = fc; fc = fu; fu = func(u, data); + } else { // (b,c,u) bracket the minimum + a = b; b = c; c = u; + fa = fb; fb = fc; fc = fu; + break; + } + } else if ((u > bound && bound > c) || (u < bound && bound < c)) { // u goes beyond the bound + u = bound; fu = func(u, data); + } else { // u goes the other way around, use golden section extrapolation + u = c + gold1 * (c - b); fu = func(u, data); + } + a = b; b = c; c = u; + fa = fb; fb = fc; fc = fu; + } + if (a > c) u = a, a = c, c = u; // swap + + // now, a<b<c, fa>fb and fb<fc, move on to Brent's algorithm + e = d = 0.0; + w = v = b; fv = fw = fb; + for (iter = 0; iter != max_iter; ++iter) { + mid = 0.5 * (a + c); + tol2 = 2.0 * (tol1 = tol * fabs(b) + tiny); + if (fabs(b - mid) <= (tol2 - 0.5 * (c - a))) { + *xmin = b; return fb; // found + } + if (fabs(e) > tol1) { + // related to parabolic interpolation + r = (b - w) * (fb - fv); + q = (b - v) * (fb - fw); + p = (b - v) * q - (b - w) * r; + q = 2.0 * (q - r); + if (q > 0.0) p = 0.0 - p; + else q = 0.0 - q; + eold = e; e = d; + if (fabs(p) >= fabs(0.5 * q * eold) || p <= q * (a - b) || p >= q * (c - b)) { + d = gold2 * (e = (b >= mid ? a - b : c - b)); + } else { + d = p / q; u = b + d; // actual parabolic interpolation happens here + if (u - a < tol2 || c - u < tol2) + d = (mid > b)? 
tol1 : 0.0 - tol1; + } + } else d = gold2 * (e = (b >= mid ? a - b : c - b)); // golden section interpolation + u = fabs(d) >= tol1 ? b + d : b + (d > 0.0? tol1 : -tol1); + fu = func(u, data); + if (fu <= fb) { // u is the minimum point so far + if (u >= b) a = b; + else c = b; + v = w; w = b; b = u; fv = fw; fw = fb; fb = fu; + } else { // adjust (a,c) and (u,v,w) + if (u < b) a = u; + else c = u; + if (fu <= fw || w == b) { + v = w; w = u; + fv = fw; fw = fu; + } else if (fu <= fv || v == b || v == w) { + v = u; fv = fu; + } + } + } + *xmin = b; + return fb; +}
/* --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bcftools/kmin.h Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,46 @@ */
/*
   Copyright (c) 2008, 2010 by Attractive Chaos <attractor@live.co.uk>

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

#ifndef KMIN_H
#define KMIN_H

/* Suggested arguments for kmin_hj(): step-shrink factor `r`, stopping
 * radius `eps` and evaluation budget `max_calls`. */
#define KMIN_RADIUS  0.5
#define KMIN_EPS     1e-7
#define KMIN_MAXCALL 50000

/* Multivariate objective: (number of variables, point, user data) -> value. */
typedef double (*kmin_f)(int n, double *x, void *data);

/* Univariate objective: (abscissa, user data) -> value. */
typedef double (*kmin1_f)(double x, void *data);

#ifdef __cplusplus
extern "C" {
#endif

/* Hooke-Jeeves direct search over `n` variables starting from `x`. */
double kmin_hj(kmin_f func, int n, double *x, void *data,
               double r, double eps, int max_calls);

/* One-dimensional minimization starting from the abscissae `a` and `b`;
 * the location of the minimum is stored in *xmin. */
double kmin_brent(kmin1_f func, double a, double b, void *data,
                  double tol, double *xmin);

#ifdef __cplusplus
}
#endif

#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bcftools/main.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,191 @@ +#include <string.h> +#include <stdlib.h> +#include <sys/stat.h> +#include <unistd.h> +#include "knetfile.h" +#include "bcf.h" + +#include "kseq.h" +KSTREAM_INIT(gzFile, gzread, 0x10000) + +int bcfview(int argc, char *argv[]); +int bcf_main_index(int argc, char *argv[]); + +#define BUF_SIZE 0x10000 + +int bcf_cat(int n, char * const *fn) +{ + int i; + bcf_t *out; + uint8_t *buf; + buf = malloc(BUF_SIZE); + out = bcf_open("-", "w"); + for (i = 0; i < n; ++i) { + bcf_t *in; + bcf_hdr_t *h; + off_t end; + struct stat s; + in = bcf_open(fn[i], "r"); + h = bcf_hdr_read(in); + if (i == 0) bcf_hdr_write(out, h); + bcf_hdr_destroy(h); +#ifdef _USE_KNETFILE + fstat(knet_fileno((knetFile*)in->fp->fp), &s); + end = s.st_size - 28; + while (knet_tell((knetFile*)in->fp->fp) < end) { + int size = knet_tell((knetFile*)in->fp->fp) + BUF_SIZE < end? BUF_SIZE : end - knet_tell((knetFile*)in->fp->fp); + knet_read(in->fp->fp, buf, size); + fwrite(buf, 1, size, out->fp->fp); + } +#else + abort(); // FIXME: not implemented +#endif + bcf_close(in); + } + bcf_close(out); + free(buf); + return 0; +} + +extern double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]); + +int bcf_main_ldpair(int argc, char *argv[]) +{ + bcf_t *fp; + bcf_hdr_t *h; + bcf1_t *b0, *b1; + bcf_idx_t *idx; + kstring_t str; + void *str2id; + gzFile fplist; + kstream_t *ks; + int dret, lineno = 0; + if (argc < 3) { + fprintf(stderr, "Usage: bcftools ldpair <in.bcf> <in.list>\n"); + return 1; + } + fplist = gzopen(argv[2], "rb"); + ks = ks_init(fplist); + memset(&str, 0, sizeof(kstring_t)); + fp = bcf_open(argv[1], "rb"); + h = bcf_hdr_read(fp); + str2id = bcf_build_refhash(h); + idx = bcf_idx_load(argv[1]); + if (idx == 0) { + fprintf(stderr, "[%s] No bcf index is found. 
Abort!\n", __func__); + return 1; + } + b0 = calloc(1, sizeof(bcf1_t)); + b1 = calloc(1, sizeof(bcf1_t)); + while (ks_getuntil(ks, '\n', &str, &dret) >= 0) { + char *p, *q; + int k; + int tid0 = -1, tid1 = -1, pos0 = -1, pos1 = -1; + ++lineno; + for (p = q = str.s, k = 0; *p; ++p) { + if (*p == ' ' || *p == '\t') { + *p = '\0'; + if (k == 0) tid0 = bcf_str2id(str2id, q); + else if (k == 1) pos0 = atoi(q) - 1; + else if (k == 2) tid1 = strcmp(q, "=")? bcf_str2id(str2id, q) : tid0; + else if (k == 3) pos1 = atoi(q) - 1; + q = p + 1; + ++k; + } + } + if (k == 3) pos1 = atoi(q) - 1; + if (tid0 >= 0 && tid1 >= 0 && pos0 >= 0 && pos1 >= 0) { + uint64_t off; + double r, f[4]; + off = bcf_idx_query(idx, tid0, pos0); + bgzf_seek(fp->fp, off, SEEK_SET); + while (bcf_read(fp, h, b0) >= 0 && b0->pos != pos0); + off = bcf_idx_query(idx, tid1, pos1); + bgzf_seek(fp->fp, off, SEEK_SET); + while (bcf_read(fp, h, b1) >= 0 && b1->pos != pos1); + r = bcf_pair_freq(b0, b1, f); + r *= r; + printf("%s\t%d\t%s\t%d\t%.4g\t%.4g\t%.4g\t%.4g\t%.4g\n", h->ns[tid0], pos0+1, h->ns[tid1], pos1+1, + r, f[0], f[1], f[2], f[3]); + } //else fprintf(stderr, "[%s] Parse error at line %d.\n", __func__, lineno); + } + bcf_destroy(b0); bcf_destroy(b1); + bcf_idx_destroy(idx); + bcf_str2id_destroy(str2id); + bcf_hdr_destroy(h); + bcf_close(fp); + free(str.s); + ks_destroy(ks); + gzclose(fplist); + return 0; +} + +int bcf_main_ld(int argc, char *argv[]) +{ + bcf_t *fp; + bcf_hdr_t *h; + bcf1_t **b, *b0; + int i, j, m, n; + double f[4]; + if (argc == 1) { + fprintf(stderr, "Usage: bcftools ld <in.bcf>\n"); + return 1; + } + fp = bcf_open(argv[1], "rb"); + h = bcf_hdr_read(fp); + // read the entire BCF + m = n = 0; b = 0; + b0 = calloc(1, sizeof(bcf1_t)); + while (bcf_read(fp, h, b0) >= 0) { + if (m == n) { + m = m? 
m<<1 : 16; + b = realloc(b, sizeof(void*) * m); + } + b[n] = calloc(1, sizeof(bcf1_t)); + bcf_cpy(b[n++], b0); + } + bcf_destroy(b0); + // compute pair-wise r^2 + printf("%d\n", n); // the number of loci + for (i = 0; i < n; ++i) { + printf("%s:%d", h->ns[b[i]->tid], b[i]->pos + 1); + for (j = 0; j < i; ++j) { + double r = bcf_pair_freq(b[i], b[j], f); + printf("\t%.3f", r*r); + } + printf("\t1.000\n"); + } + // free + for (i = 0; i < n; ++i) bcf_destroy(b[i]); + free(b); + bcf_hdr_destroy(h); + bcf_close(fp); + return 0; +} + +int main(int argc, char *argv[]) +{ + if (argc == 1) { + fprintf(stderr, "\n"); + fprintf(stderr, "Program: bcftools (Tools for data in the VCF/BCF formats)\n"); + fprintf(stderr, "Version: %s\n\n", BCF_VERSION); + fprintf(stderr, "Usage: bcftools <command> <arguments>\n\n"); + fprintf(stderr, "Command: view print, extract, convert and call SNPs from BCF\n"); + fprintf(stderr, " index index BCF\n"); + fprintf(stderr, " cat concatenate BCFs\n"); + fprintf(stderr, " ld compute all-pair r^2\n"); + fprintf(stderr, " ldpair compute r^2 between requested pairs\n"); + fprintf(stderr, "\n"); + return 1; + } + if (strcmp(argv[1], "view") == 0) return bcfview(argc-1, argv+1); + else if (strcmp(argv[1], "index") == 0) return bcf_main_index(argc-1, argv+1); + else if (strcmp(argv[1], "ld") == 0) return bcf_main_ld(argc-1, argv+1); + else if (strcmp(argv[1], "ldpair") == 0) return bcf_main_ldpair(argc-1, argv+1); + else if (strcmp(argv[1], "cat") == 0) return bcf_cat(argc-2, argv+2); // cat is different ... + else { + fprintf(stderr, "[main] Unrecognized command.\n"); + return 1; + } + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bcftools/mut.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,127 @@ +#include <stdlib.h> +#include <stdint.h> +#include "bcf.h" + +#define MAX_GENO 359 + +int8_t seq_bitcnt[] = { 4, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; +char *seq_nt16rev = "XACMGRSVTWYHKDBN"; + +uint32_t *bcf_trio_prep(int is_x, int is_son) +{ + int i, j, k, n, map[10]; + uint32_t *ret; + ret = calloc(MAX_GENO, 4); + for (i = 0, k = 0; i < 4; ++i) + for (j = i; j < 4; ++j) + map[k++] = 1<<i|1<<j; + for (i = 0, n = 1; i < 10; ++i) { // father + if (is_x && seq_bitcnt[map[i]] != 1) continue; + if (is_x && is_son) { + for (j = 0; j < 10; ++j) // mother + for (k = 0; k < 10; ++k) // child + if (seq_bitcnt[map[k]] == 1 && (map[j]&map[k])) + ret[n++] = j<<16 | i<<8 | k; + } else { + for (j = 0; j < 10; ++j) // mother + for (k = 0; k < 10; ++k) // child + if ((map[i]&map[k]) && (map[j]&map[k]) && ((map[i]|map[j])&map[k]) == map[k]) + ret[n++] = j<<16 | i<<8 | k; + } + } + ret[0] = n - 1; + return ret; +} + + +int bcf_trio_call(const uint32_t *prep, const bcf1_t *b, int *llr, int64_t *gt) +{ + int i, j, k; + const bcf_ginfo_t *PL; + uint8_t *gl10; + int map[10]; + if (b->n_smpl != 3) return -1; // not a trio + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; + if (i == b->n_gi) return -1; // no PL + gl10 = alloca(10 * b->n_smpl); + if (bcf_gl10(b, gl10) < 0) { + if (bcf_gl10_indel(b, gl10) < 0) return -1; + } + PL = b->gi + i; + for (i = 0, k = 0; i < 4; ++i) + for (j = i; j < 4; ++j) + map[k++] = seq_nt16rev[1<<i|1<<j]; + for (j = 0; j < 3; ++j) // check if ref hom is the most probable in all members + if (((uint8_t*)PL->data)[j * PL->len] != 0) break; + if (j < 3) { // we need to go through the complex procedure + uint8_t *g[3]; + int minc = 1<<30, minc_j = -1, minf = 0, gtf = 0, gtc = 0; + g[0] = gl10; + g[1] = gl10 + 10; + g[2] = gl10 + 20; + for (j = 1; j <= (int)prep[0]; ++j) { // 
compute LK with constraint + int sum = g[0][prep[j]&0xff] + g[1][prep[j]>>8&0xff] + g[2][prep[j]>>16&0xff]; + if (sum < minc) minc = sum, minc_j = j; + } + gtc |= map[prep[minc_j]&0xff]; gtc |= map[prep[minc_j]>>8&0xff]<<8; gtc |= map[prep[minc_j]>>16]<<16; + for (j = 0; j < 3; ++j) { // compute LK without constraint + int min = 1<<30, min_k = -1; + for (k = 0; k < 10; ++k) + if (g[j][k] < min) min = g[j][k], min_k = k; + gtf |= map[min_k]<<(j*8); + minf += min; + } + *llr = minc - minf; *gt = (int64_t)gtc<<32 | gtf; + } else *llr = 0, *gt = -1; + return 0; +} + +int bcf_pair_call(const bcf1_t *b) +{ + int i, j, k; + const bcf_ginfo_t *PL; + if (b->n_smpl != 2) return -1; // not a pair + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; + if (i == b->n_gi) return -1; // no PL + PL = b->gi + i; + for (j = 0; j < 2; ++j) // check if ref hom is the most probable in all members + if (((uint8_t*)PL->data)[j * PL->len] != 0) break; + if (j < 2) { // we need to go through the complex procedure + uint8_t *g[2]; + int minc = 1<<30, minf = 0; + g[0] = PL->data; + g[1] = (uint8_t*)PL->data + PL->len; + for (j = 0; j < PL->len; ++j) // compute LK with constraint + minc = minc < g[0][j] + g[1][j]? minc : g[0][j] + g[1][j]; + for (j = 0; j < 2; ++j) { // compute LK without constraint + int min = 1<<30; + for (k = 0; k < PL->len; ++k) + min = min < g[j][k]? min : g[j][k]; + minf += min; + } + return minc - minf; + } else return 0; +} + +int bcf_min_diff(const bcf1_t *b) +{ + int i, min = 1<<30; + const bcf_ginfo_t *PL; + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; + if (i == b->n_gi) return -1; // no PL + PL = b->gi + i; + for (i = 0; i < b->n_smpl; ++i) { + int m1, m2, j; + const uint8_t *p = (uint8_t*)PL->data; + m1 = m2 = 1<<30; + for (j = 0; j < PL->len; ++j) { + if ((int)p[j] < m1) m2 = m1, m1 = p[j]; + else if ((int)p[j] < m2) m2 = p[j]; + } + min = min < m2 - m1? min : m2 - m1; + } + return min; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bcftools/prob1.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,988 @@ +#include <math.h> +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <errno.h> +#include <assert.h> +#include <limits.h> +#include <zlib.h> +#include "prob1.h" +#include "kstring.h" + +#include "kseq.h" +KSTREAM_INIT(gzFile, gzread, 16384) + +#define MC_MAX_EM_ITER 16 +#define MC_EM_EPS 1e-5 +#define MC_DEF_INDEL 0.15 + +gzFile bcf_p1_fp_lk; + +unsigned char seq_nt4_table[256] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +}; + +struct __bcf_p1aux_t { + int n, M, n1, is_indel; + uint8_t *ploidy; // haploid or diploid ONLY + double *q2p, *pdg; // pdg -> P(D|g) + double *phi, *phi_indel; + double *z, *zswap; // aux for afs + double *z1, *z2, *phi1, *phi2; // only calculated when n1 is set + double **hg; // hypergeometric distribution + double *lf; // log factorial + double t, t1, t2; + double *afs, *afs1; // afs: accumulative AFS; afs1: site posterior distribution + const uint8_t *PL; // point to PL + int PL_len; +}; + +void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x) +{ + int i; + for (i = 0; i < ma->M; ++i) + ma->phi_indel[i] = ma->phi[i] * x; + 
ma->phi_indel[ma->M] = 1. - ma->phi[ma->M] * x; +} + +static void init_prior(int type, double theta, int M, double *phi) +{ + int i; + if (type == MC_PTYPE_COND2) { + for (i = 0; i <= M; ++i) + phi[i] = 2. * (i + 1) / (M + 1) / (M + 2); + } else if (type == MC_PTYPE_FLAT) { + for (i = 0; i <= M; ++i) + phi[i] = 1. / (M + 1); + } else { + double sum; + for (i = 0, sum = 0.; i < M; ++i) + sum += (phi[i] = theta / (M - i)); + phi[M] = 1. - sum; + } +} + +void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta) +{ + init_prior(type, theta, ma->M, ma->phi); + bcf_p1_indel_prior(ma, MC_DEF_INDEL); +} + +void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta) +{ + if (ma->n1 <= 0 || ma->n1 >= ma->M) return; + init_prior(type, theta, 2*ma->n1, ma->phi1); + init_prior(type, theta, 2*(ma->n - ma->n1), ma->phi2); +} + +int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn) +{ + gzFile fp; + kstring_t s; + kstream_t *ks; + long double sum; + int dret, k; + memset(&s, 0, sizeof(kstring_t)); + fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + ks = ks_init(fp); + memset(ma->phi, 0, sizeof(double) * (ma->M + 1)); + while (ks_getuntil(ks, '\n', &s, &dret) >= 0) { + if (strstr(s.s, "[afs] ") == s.s) { + char *p = s.s + 6; + for (k = 0; k <= ma->M; ++k) { + int x; + double y; + x = strtol(p, &p, 10); + if (x != k && (errno == EINVAL || errno == ERANGE)) return -1; + ++p; + y = strtod(p, &p); + if (y == 0. 
&& (errno == EINVAL || errno == ERANGE)) return -1; + ma->phi[ma->M - k] += y; + } + } + } + ks_destroy(ks); + gzclose(fp); + free(s.s); + for (sum = 0., k = 0; k <= ma->M; ++k) sum += ma->phi[k]; + fprintf(stderr, "[prior]"); + for (k = 0; k <= ma->M; ++k) ma->phi[k] /= sum; + for (k = 0; k <= ma->M; ++k) fprintf(stderr, " %d:%.3lg", k, ma->phi[ma->M - k]); + fputc('\n', stderr); + for (sum = 0., k = 1; k < ma->M; ++k) sum += ma->phi[ma->M - k] * (2.* k * (ma->M - k) / ma->M / (ma->M - 1)); + fprintf(stderr, "[%s] heterozygosity=%lf, ", __func__, (double)sum); + for (sum = 0., k = 1; k <= ma->M; ++k) sum += k * ma->phi[ma->M - k] / ma->M; + fprintf(stderr, "theta=%lf\n", (double)sum); + bcf_p1_indel_prior(ma, MC_DEF_INDEL); + return 0; +} + +bcf_p1aux_t *bcf_p1_init(int n, uint8_t *ploidy) +{ + bcf_p1aux_t *ma; + int i; + ma = calloc(1, sizeof(bcf_p1aux_t)); + ma->n1 = -1; + ma->n = n; ma->M = 2 * n; + if (ploidy) { + ma->ploidy = malloc(n); + memcpy(ma->ploidy, ploidy, n); + for (i = 0, ma->M = 0; i < n; ++i) ma->M += ploidy[i]; + if (ma->M == 2 * n) { + free(ma->ploidy); + ma->ploidy = 0; + } + } + ma->q2p = calloc(256, sizeof(double)); + ma->pdg = calloc(3 * ma->n, sizeof(double)); + ma->phi = calloc(ma->M + 1, sizeof(double)); + ma->phi_indel = calloc(ma->M + 1, sizeof(double)); + ma->phi1 = calloc(ma->M + 1, sizeof(double)); + ma->phi2 = calloc(ma->M + 1, sizeof(double)); + ma->z = calloc(ma->M + 1, sizeof(double)); + ma->zswap = calloc(ma->M + 1, sizeof(double)); + ma->z1 = calloc(ma->M + 1, sizeof(double)); // actually we do not need this large + ma->z2 = calloc(ma->M + 1, sizeof(double)); + ma->afs = calloc(ma->M + 1, sizeof(double)); + ma->afs1 = calloc(ma->M + 1, sizeof(double)); + ma->lf = calloc(ma->M + 1, sizeof(double)); + for (i = 0; i < 256; ++i) + ma->q2p[i] = pow(10., -i / 10.); + for (i = 0; i <= ma->M; ++i) ma->lf[i] = lgamma(i + 1); + bcf_p1_init_prior(ma, MC_PTYPE_FULL, 1e-3); // the simplest prior + return ma; +} + +int 
// Wigginton 2005, PMID: 15789306
// written by Jan Wigginton
//
// Exact test of Hardy-Weinberg equilibrium.
//
//   obs_hom1, obs_hom2  observed counts of the two homozygous genotypes
//   obs_hets            observed count of heterozygotes
//
// The probability of every possible heterozygote count (given the observed
// number of rare-allele copies) is built by recurrence outwards from a
// starting point near the expected count, then normalised.  The returned
// p-value is the sum of the probabilities that are not larger than the
// probability of the observed heterozygote count, capped at 1.
// Returns 1 when all three counts are zero.  Counts must be non-negative.
// Exits with an error message if the work array cannot be allocated.
double calc_hwe(int obs_hom1, int obs_hom2, int obs_hets)
{
    // validate before the early return so that e.g. (1, -1, 0) is caught too
    assert(obs_hom1 >= 0 && obs_hom2 >= 0 && obs_hets >= 0);

    if (obs_hom1 + obs_hom2 + obs_hets == 0 ) return 1;

    int obs_homc = obs_hom1 < obs_hom2 ? obs_hom2 : obs_hom1;
    int obs_homr = obs_hom1 < obs_hom2 ? obs_hom1 : obs_hom2;

    int rare_copies = 2 * obs_homr + obs_hets;
    int genotypes   = obs_hets + obs_homc + obs_homr;

    double *het_probs = (double*) calloc(rare_copies+1, sizeof(double));
    if (het_probs == NULL)
    {
        fprintf(stderr, "[%s] out of memory\n", __func__);
        exit(1);
    }

    /* start at midpoint; the product is of the order of genotypes^2 and does
       not fit in an int for large cohorts, so it is evaluated in 64 bits.
       The quotient never exceeds rare_copies, hence fits in an int again. */
    int mid = (int)((long long)rare_copies * (2LL * genotypes - rare_copies) / (2LL * genotypes));

    /* check to ensure that midpoint and rare alleles have same parity */
    if ((rare_copies & 1) ^ (mid & 1)) mid++;

    int curr_hets = mid;
    int curr_homr = (rare_copies - mid) / 2;
    int curr_homc = genotypes - curr_hets - curr_homr;

    het_probs[mid] = 1.0;
    double sum = het_probs[mid];
    for (curr_hets = mid; curr_hets > 1; curr_hets -= 2)
    {
        het_probs[curr_hets - 2] = het_probs[curr_hets] * curr_hets * (curr_hets - 1.0) / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0));
        sum += het_probs[curr_hets - 2];

        /* 2 fewer heterozygotes for next iteration -> add one rare, one common homozygote */
        curr_homr++;
        curr_homc++;
    }

    /* restart from the midpoint and walk towards more heterozygotes */
    curr_hets = mid;
    curr_homr = (rare_copies - mid) / 2;
    curr_homc = genotypes - curr_hets - curr_homr;
    for (curr_hets = mid; curr_hets <= rare_copies - 2; curr_hets += 2)
    {
        het_probs[curr_hets + 2] = het_probs[curr_hets] * 4.0 * curr_homr * curr_homc /((curr_hets + 2.0) * (curr_hets + 1.0));
        sum += het_probs[curr_hets + 2];

        /* add 2 heterozygotes for next iteration -> subtract one rare, one common homozygote */
        curr_homr--;
        curr_homc--;
    }
    int i;
    for (i = 0; i <= rare_copies; i++) het_probs[i] /= sum;

    /* p-value calculation for p_hwe */
    double p_hwe = 0.0;
    for (i = 0; i <= rare_copies; i++)
    {
        if (het_probs[i] > het_probs[obs_hets])
            continue;
        p_hwe += het_probs[i];
    }

    /* rounding in the normalisation can push the total slightly above 1 */
    p_hwe = p_hwe > 1.0 ? 1.0 : p_hwe;
    free(het_probs);
    return p_hwe;

}
) break; + if ( p[0]==',' ) nals++; + } + if ( b->alt[0] && !*p ) nals++; + + if ( nals>4 ) + { + if ( *b->ref=='N' ) return 0; + fprintf(stderr,"Not ready for this, more than 4 alleles at %d: %s, %s\n", b->pos+1, b->ref,b->alt); + exit(1); + } + + // find PL, DV and DP FORMAT indexes + uint8_t *pl = NULL; + int i, npl = 0, idp = -1, idv = -1; + for (i = 0; i < b->n_gi; ++i) + { + if (b->gi[i].fmt == bcf_str2int("PL", 2)) + { + pl = (uint8_t*)b->gi[i].data; + npl = b->gi[i].len; + } + else if (b->gi[i].fmt == bcf_str2int("DP", 2)) idp=i; + else if (b->gi[i].fmt == bcf_str2int("DV", 2)) idv=i; + } + if ( nals==1 ) + { + if ( !var_only ) _bcf1_set_ref(b, idp); + return 1; + } + if ( !pl ) return -1; + + assert(ma->q2p[0] == 1); + + // Init P(D|G) + int npdg = nals*(nals+1)/2; + double *pdg,*_pdg; + _pdg = pdg = malloc(sizeof(double)*ma->n*npdg); + for (i=0; i<ma->n; i++) + { + int j; + double sum = 0; + for (j=0; j<npdg; j++) + { + //_pdg[j] = pow(10,-0.1*pl[j]); + _pdg[j] = ma->q2p[pl[j]]; + sum += _pdg[j]; + } + if ( sum ) + for (j=0; j<npdg; j++) _pdg[j] /= sum; + _pdg += npdg; + pl += npl; + } + + if ((p = strstr(b->info, "QS=")) == 0) { fprintf(stderr,"INFO/QS is required with -m, exiting\n"); exit(1); } + double qsum[4]; + if ( sscanf(p+3,"%lf,%lf,%lf,%lf",&qsum[0],&qsum[1],&qsum[2],&qsum[3])!=4 ) { fprintf(stderr,"Could not parse %s\n",p); exit(1); } + + + // Calculate the most likely combination of alleles, remembering the most and second most likely set + int ia,ib,ic, max_als=0, max_als2=0; + double ref_lk = 0, max_lk = INT_MIN, max_lk2 = INT_MIN, lk_sum = INT_MIN, lk_sums[3]; + for (ia=0; ia<nals; ia++) + { + double lk_tot = 0; + int iaa = (ia+1)*(ia+2)/2-1; + int isample; + for (isample=0; isample<ma->n; isample++) + { + double *p = pdg + isample*npdg; + // assert( log(p[iaa]) <= 0 ); + lk_tot += log(p[iaa]); + } + if ( ia==0 ) ref_lk = lk_tot; + if ( max_lk<lk_tot ) { max_lk2 = max_lk; max_als2 = max_als; max_lk = lk_tot; max_als = 1<<ia; } + else if ( 
max_lk2<lk_tot ) { max_lk2 = lk_tot; max_als2 = 1<<ia; } + lk_sum = lk_tot>lk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum)); + } + lk_sums[0] = lk_sum; + if ( nals>1 ) + { + for (ia=0; ia<nals; ia++) + { + if ( qsum[ia]==0 ) continue; + int iaa = (ia+1)*(ia+2)/2-1; + for (ib=0; ib<ia; ib++) + { + if ( qsum[ib]==0 ) continue; + double lk_tot = 0; + double fa = qsum[ia]/(qsum[ia]+qsum[ib]); + double fb = qsum[ib]/(qsum[ia]+qsum[ib]); + double fab = 2*fa*fb; fa *= fa; fb *= fb; + int isample, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib; + for (isample=0; isample<ma->n; isample++) + { + double *p = pdg + isample*npdg; + //assert( log(fa*p[iaa] + fb*p[ibb] + fab*p[iab]) <= 0 ); + if ( b->ploidy && b->ploidy[isample]==1 ) + lk_tot += log(fa*p[iaa] + fb*p[ibb]); + else + lk_tot += log(fa*p[iaa] + fb*p[ibb] + fab*p[iab]); + } + if ( max_lk<lk_tot ) { max_lk2 = max_lk; max_als2 = max_als; max_lk = lk_tot; max_als = 1<<ia|1<<ib; } + else if ( max_lk2<lk_tot ) { max_lk2 = lk_tot; max_als2 = 1<<ia|1<<ib; } + lk_sum = lk_tot>lk_sum ? 
lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum)); + } + } + lk_sums[1] = lk_sum; + } + if ( nals>2 ) + { + for (ia=0; ia<nals; ia++) + { + if ( qsum[ia]==0 ) continue; + int iaa = (ia+1)*(ia+2)/2-1; + for (ib=0; ib<ia; ib++) + { + if ( qsum[ib]==0 ) continue; + int ibb = (ib+1)*(ib+2)/2-1; + int iab = iaa - ia + ib; + for (ic=0; ic<ib; ic++) + { + if ( qsum[ic]==0 ) continue; + double lk_tot = 0; + double fa = qsum[ia]/(qsum[ia]+qsum[ib]+qsum[ic]); + double fb = qsum[ib]/(qsum[ia]+qsum[ib]+qsum[ic]); + double fc = qsum[ic]/(qsum[ia]+qsum[ib]+qsum[ic]); + double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc; fa *= fa; fb *= fb; fc *= fc; + int isample, icc = (ic+1)*(ic+2)/2-1; + int iac = iaa - ia + ic, ibc = ibb - ib + ic; + for (isample=0; isample<ma->n; isample++) + { + double *p = pdg + isample*npdg; + //assert( log(fa*p[iaa] + fb*p[ibb] + fc*p[icc] + fab*p[iab] + fac*p[iac] + fbc*p[ibc]) <= 0 ); + if ( b->ploidy && b->ploidy[isample]==1 ) + lk_tot += log(fa*p[iaa] + fb*p[ibb] + fc*p[icc]); + else + lk_tot += log(fa*p[iaa] + fb*p[ibb] + fc*p[icc] + fab*p[iab] + fac*p[iac] + fbc*p[ibc]); + } + if ( max_lk<lk_tot ) { max_lk2 = max_lk; max_als2 = max_als; max_lk = lk_tot; max_als = 1<<ia|1<<ib|1<<ic; } + else if ( max_lk2<lk_tot ) { max_lk2 = lk_tot; max_als2 = 1<<ia|1<<ib|1<<ic; } + lk_sum = lk_tot>lk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum)); + } + } + } + lk_sums[2] = lk_sum; + } + + // Should we add another allele, does it increase the likelihood significantly? 
+ int n1=0, n2=0; + for (i=0; i<nals; i++) if ( max_als&1<<i) n1++; + for (i=0; i<nals; i++) if ( max_als2&1<<i) n2++; + if ( n2<n1 && kf_gammap(1,2.0*(max_lk-max_lk2))<threshold ) + { + // the threshold not exceeded, use the second most likely set with fewer alleles + max_lk = max_lk2; + max_als = max_als2; + n1 = n2; + } + lk_sum = lk_sums[n1-1]; + + // Get the BCF record ready for GT and GQ + kstring_t s; + int old_n_gi = b->n_gi; + s.m = b->m_str; s.l = b->l_str - 1; s.s = b->str; + kputs(":GT:GQ", &s); kputc('\0', &s); + b->m_str = s.m; b->l_str = s.l; b->str = s.s; + bcf_sync(b); + + // Call GTs + int isample, gts=0, ac[4] = {0,0,0,0}; + int nRR = 0, nAA = 0, nRA = 0, max_dv = 0; + for (isample = 0; isample < b->n_smpl; isample++) + { + int ploidy = b->ploidy ? b->ploidy[isample] : 2; + double *p = pdg + isample*npdg; + int ia, als = 0; + double lk = 0, lk_s = 0; + for (ia=0; ia<nals; ia++) + { + if ( !(max_als&1<<ia) ) continue; + int iaa = (ia+1)*(ia+2)/2-1; + double _lk = p[iaa]*qsum[ia]*qsum[ia]; + if ( _lk > lk ) { lk = _lk; als = ia<<3 | ia; } + lk_s += _lk; + } + if ( ploidy==2 ) + { + for (ia=0; ia<nals; ia++) + { + if ( !(max_als&1<<ia) ) continue; + int iaa = (ia+1)*(ia+2)/2-1; + for (ib=0; ib<ia; ib++) + { + if ( !(max_als&1<<ib) ) continue; + int iab = iaa - ia + ib; + double _lk = 2*qsum[ia]*qsum[ib]*p[iab]; + if ( _lk > lk ) { lk = _lk; als = ib<<3 | ia; } + lk_s += _lk; + } + } + } + lk = -log(1-lk/lk_s)/0.2302585; + int dp = 0; + if ( idp>=0 && (dp=((uint16_t*)b->gi[idp].data)[isample])==0 ) + { + // no coverage + ((uint8_t*)b->gi[old_n_gi].data)[isample] = 1<<7; + ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = 0; + continue; + } + if ( lk>99 ) lk = 99; + ((uint8_t*)b->gi[old_n_gi].data)[isample] = als; + ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = (int)lk; + + // For MDV annotation + int dv; + if ( als && idv>=0 && (dv=((uint16_t*)b->gi[idv].data)[isample]) ) + { + if ( max_dv < dv ) max_dv = dv; + } + + // For HWE annotation; multiple ALT 
alleles treated as one + if ( !als ) nRR++; + else if ( !(als>>3&7) || !(als&7) ) nRA++; + else nAA++; + + gts |= 1<<(als>>3&7) | 1<<(als&7); + ac[ als>>3&7 ]++; + ac[ als&7 ]++; + } + free(pdg); + bcf_fit_alt(b,max_als); + + // The VCF spec is ambiguous about QUAL: is it the probability of anything else + // (that is QUAL(non-ref) = P(ref)+P(any non-ref other than ALT)) or is it + // QUAL(non-ref)=P(ref) and QUAL(ref)=1-P(ref)? Assuming the latter. + b->qual = gts>1 ? -4.343*(ref_lk - lk_sum) : -4.343*log(1-exp(ref_lk - lk_sum)); + if ( b->qual>999 ) b->qual = 999; + + // Prepare BCF for output: ref, alt, filter, info, format + memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s); + kputs(b->ref, &s); kputc('\0', &s); + kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s); + { + int an=0, nalts=0; + for (i=0; i<nals; i++) + { + an += ac[i]; + if ( i>0 && ac[i] ) nalts++; + } + ksprintf(&s, "AN=%d;", an); + if ( nalts ) + { + kputs("AC=", &s); + for (i=1; i<nals; i++) + { + if ( !(gts&1<<i) ) continue; + nalts--; + ksprintf(&s,"%d", ac[i]); + if ( nalts>0 ) kputc(',', &s); + } + kputc(';', &s); + } + kputs(b->info, &s); + anno16_t a; + int has_I16 = test16(b, &a) >= 0? 
1 : 0; + if (has_I16 ) + { + if ( a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); + ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); + ksprintf(&s, ";QBD=%e", b->qual/(a.d[0] + a.d[1] + a.d[2] + a.d[3])); + if ( max_dv ) ksprintf(&s, ";MDV=%d", max_dv); + } + if ( nAA+nRA ) + { + double hwe = calc_hwe(nAA, nRR, nRA); + ksprintf(&s, ";HWE=%e", hwe); + } + kputc('\0', &s); + rm_info(&s, "I16="); + rm_info(&s, "QS="); + } + kputs(b->fmt, &s); kputc('\0', &s); + free(b->str); + b->m_str = s.m; b->l_str = s.l; b->str = s.s; + bcf_sync(b); + + return gts; +} + +static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma) +{ + int i, j; + long *p, tmp; + p = alloca(b->n_alleles * sizeof(long)); + memset(p, 0, sizeof(long) * b->n_alleles); + for (j = 0; j < ma->n; ++j) { + const uint8_t *pi = ma->PL + j * ma->PL_len; + double *pdg = ma->pdg + j * 3; + pdg[0] = ma->q2p[pi[2]]; pdg[1] = ma->q2p[pi[1]]; pdg[2] = ma->q2p[pi[0]]; + for (i = 0; i < b->n_alleles; ++i) + p[i] += (int)pi[(i+1)*(i+2)/2-1]; + } + for (i = 0; i < b->n_alleles; ++i) p[i] = p[i]<<4 | i; + for (i = 1; i < b->n_alleles; ++i) // insertion sort + for (j = i; j > 0 && p[j] < p[j-1]; --j) + tmp = p[j], p[j] = p[j-1], p[j-1] = tmp; + for (i = b->n_alleles - 1; i >= 0; --i) + if ((p[i]&0xf) == 0) break; + return i; +} + + +int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k) +{ + double sum, g[3]; + double max, f3[3], *pdg = ma->pdg + k * 3; + int q, i, max_i, ploidy; + ploidy = ma->ploidy? ma->ploidy[k] : 2; + if (ploidy == 2) { + f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; + } else { + f3[0] = 1. - f0; f3[1] = 0; f3[2] = f0; + } + for (i = 0, sum = 0.; i < 3; ++i) + sum += (g[i] = pdg[i] * f3[i]); + for (i = 0, max = -1., max_i = 0; i < 3; ++i) { + g[i] /= sum; + if (g[i] > max) max = g[i], max_i = i; + } + max = 1. 
- max; + if (max < 1e-308) max = 1e-308; + q = (int)(-4.343 * log(max) + .499); + if (q > 99) q = 99; + return q<<2|max_i; +} + +#define TINY 1e-20 + +static void mc_cal_y_core(bcf_p1aux_t *ma, int beg) +{ + double *z[2], *tmp, *pdg; + int _j, last_min, last_max; + assert(beg == 0 || ma->M == ma->n*2); + z[0] = ma->z; + z[1] = ma->zswap; + pdg = ma->pdg; + memset(z[0], 0, sizeof(double) * (ma->M + 1)); + memset(z[1], 0, sizeof(double) * (ma->M + 1)); + z[0][0] = 1.; + last_min = last_max = 0; + ma->t = 0.; + if (ma->M == ma->n * 2) { + int M = 0; + for (_j = beg; _j < ma->n; ++_j) { + int k, j = _j - beg, _min = last_min, _max = last_max, M0; + double p[3], sum; + M0 = M; M += 2; + pdg = ma->pdg + _j * 3; + p[0] = pdg[0]; p[1] = 2. * pdg[1]; p[2] = pdg[2]; + for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.; + for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.; + _max += 2; + if (_min == 0) k = 0, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k]; + if (_min <= 1) k = 1, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1]; + for (k = _min < 2? 2 : _min; k <= _max; ++k) + z[1][k] = (M0-k+1)*(M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1] + k*(k-1)* p[2] * z[0][k-2]; + for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k]; + ma->t += log(sum / (M * (M - 1.))); + for (k = _min; k <= _max; ++k) z[1][k] /= sum; + if (_min >= 1) z[1][_min-1] = 0.; + if (_min >= 2) z[1][_min-2] = 0.; + if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.; + if (_j == ma->n1 - 1) { // set pop1; ma->n1==-1 when unset + ma->t1 = ma->t; + memcpy(ma->z1, z[1], sizeof(double) * (ma->n1 * 2 + 1)); + } + tmp = z[0]; z[0] = z[1]; z[1] = tmp; + last_min = _min; last_max = _max; + } + //for (_j = 0; _j < last_min; ++_j) z[0][_j] = 0.; // TODO: are these necessary? 
+ //for (_j = last_max + 1; _j < ma->M; ++_j) z[0][_j] = 0.; + } else { // this block is very similar to the block above; these two might be merged in future + int j, M = 0; + for (j = 0; j < ma->n; ++j) { + int k, M0, _min = last_min, _max = last_max; + double p[3], sum; + pdg = ma->pdg + j * 3; + for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.; + for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.; + M0 = M; + M += ma->ploidy[j]; + if (ma->ploidy[j] == 1) { + p[0] = pdg[0]; p[1] = pdg[2]; + _max++; + if (_min == 0) k = 0, z[1][k] = (M0+1-k) * p[0] * z[0][k]; + for (k = _min < 1? 1 : _min; k <= _max; ++k) + z[1][k] = (M0+1-k) * p[0] * z[0][k] + k * p[1] * z[0][k-1]; + for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k]; + ma->t += log(sum / M); + for (k = _min; k <= _max; ++k) z[1][k] /= sum; + if (_min >= 1) z[1][_min-1] = 0.; + if (j < ma->n - 1) z[1][_max+1] = 0.; + } else if (ma->ploidy[j] == 2) { + p[0] = pdg[0]; p[1] = 2 * pdg[1]; p[2] = pdg[2]; + _max += 2; + if (_min == 0) k = 0, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k]; + if (_min <= 1) k = 1, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1]; + for (k = _min < 2? 
/*
 * Chi-square test on the 2x2 table with cells a, b, c, d:
 *   chi2 = (a+b+c+d)(ad-bc)^2 / [(a+b)(c+d)(a+c)(b+d)]
 * Returns kf_gammaq(.5, chi2/2), or 1 when the product of the four marginal
 * sums is zero.
 *
 * The cells are converted to double before any arithmetic: evaluating
 * a*d - b*c or the marginal sums in int overflows once the counts reach a
 * few tens of thousands.
 */
static inline double chi2_test(int a, int b, int c, int d)
{
	const double da = a, db = b, dc = c, dd = d;
	double x, z;
	x = (da + db) * (dc + dd) * (db + dd) * (da + dc);
	if (x == 0.) return 1;
	z = da * dd - db * dc;
	return kf_gammaq(.5, .5 * z * z * (da + db + dc + dd) / x);
}
*/ + double tmp = lgamma(2*(n1+n2)+1) - (lgamma(2*n1+1) + lgamma(2*n2+1)); + p1->hg = calloc(2*n1+1, sizeof(void*)); + for (k1 = 0; k1 <= 2*n1; ++k1) { + p1->hg[k1] = calloc(2*n2+1, sizeof(double)); + for (k2 = 0; k2 <= 2*n2; ++k2) + p1->hg[k1][k2] = exp(lgamma(k1+k2+1) + lgamma(p1->M-k1-k2+1) - (lgamma(k1+1) + lgamma(k2+1) + lgamma(2*n1-k1+1) + lgamma(2*n2-k2+1) + tmp)); + } + } + { // compute + long double suml = 0; + for (k = 0; k <= p1->M; ++k) suml += p1->phi[k] * p1->z[k]; + sum = suml; + } + { // get the max k1 and k2 + double max; + int max_k; + for (k = 0, max = 0, max_k = -1; k <= 2*n1; ++k) { + double x = p1->phi1[k] * p1->z1[k]; + if (x > max) max = x, max_k = k; + } + k10 = max_k; + for (k = 0, max = 0, max_k = -1; k <= 2*n2; ++k) { + double x = p1->phi2[k] * p1->z2[k]; + if (x > max) max = x, max_k = k; + } + k20 = max_k; + } + { // We can do the following with one nested loop, but that is an O(N^2) thing. The following code block is much faster for large N. + double x[3], y; + long double z = 0., L[2]; + x[0] = x[1] = x[2] = 0; L[0] = L[1] = 0; + for (k1 = k10; k1 >= 0; --k1) { + for (k2 = k20; k2 >= 0; --k2) { + if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break; + else z += y; + } + for (k2 = k20 + 1; k2 <= 2*n2; ++k2) { + if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break; + else z += y; + } + } + ret[0] = x[0]; ret[1] = x[1]; ret[2] = x[2]; + x[0] = x[1] = x[2] = 0; + for (k1 = k10 + 1; k1 <= 2*n1; ++k1) { + for (k2 = k20; k2 >= 0; --k2) { + if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break; + else z += y; + } + for (k2 = k20 + 1; k2 <= 2*n2; ++k2) { + if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break; + else z += y; + } + } + ret[0] += x[0]; ret[1] += x[1]; ret[2] += x[2]; + if (ret[0] + ret[1] + ret[2] < 0.95) { // in case of bad things happened + ret[0] = ret[1] = ret[2] = 0; L[0] = L[1] = 0; + for (k1 = 0, z = 0.; k1 <= 2*n1; ++k1) + for (k2 = 0; k2 <= 2*n2; ++k2) + if ((y = contrast2_aux(p1, sum, k1, k2, ret)) >= 0) z += y; 
+ if (ret[0] + ret[1] + ret[2] < 0.95) // It seems that this may be caused by floating point errors. I do not really understand why... + z = 1.0, ret[0] = ret[1] = ret[2] = 1./3; + } + return (double)z; + } +} + +static double mc_cal_afs(bcf_p1aux_t *ma, double *p_ref_folded, double *p_var_folded) +{ + int k; + long double sum = 0., sum2; + double *phi = ma->is_indel? ma->phi_indel : ma->phi; + memset(ma->afs1, 0, sizeof(double) * (ma->M + 1)); + mc_cal_y(ma); + // compute AFS + for (k = 0, sum = 0.; k <= ma->M; ++k) + sum += (long double)phi[k] * ma->z[k]; + for (k = 0; k <= ma->M; ++k) { + ma->afs1[k] = phi[k] * ma->z[k] / sum; + if (isnan(ma->afs1[k]) || isinf(ma->afs1[k])) return -1.; + } + // compute folded variant probability + for (k = 0, sum = 0.; k <= ma->M; ++k) + sum += (long double)(phi[k] + phi[ma->M - k]) / 2. * ma->z[k]; + for (k = 1, sum2 = 0.; k < ma->M; ++k) + sum2 += (long double)(phi[k] + phi[ma->M - k]) / 2. * ma->z[k]; + *p_var_folded = sum2 / sum; + *p_ref_folded = (phi[k] + phi[ma->M - k]) / 2. 
* (ma->z[ma->M] + ma->z[0]) / sum; + // the expected frequency + for (k = 0, sum = 0.; k <= ma->M; ++k) { + ma->afs[k] += ma->afs1[k]; + sum += k * ma->afs1[k]; + } + return sum / ma->M; +} + +int bcf_p1_cal(const bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst) +{ + int i, k; + long double sum = 0.; + ma->is_indel = bcf_is_indel(b); + rst->perm_rank = -1; + // set PL and PL_len + for (i = 0; i < b->n_gi; ++i) { + if (b->gi[i].fmt == bcf_str2int("PL", 2)) { + ma->PL = (uint8_t*)b->gi[i].data; + ma->PL_len = b->gi[i].len; + break; + } + } + if (i == b->n_gi) return -1; // no PL + if (b->n_alleles < 2) return -1; // FIXME: find a better solution + // + rst->rank0 = cal_pdg(b, ma); + rst->f_exp = mc_cal_afs(ma, &rst->p_ref_folded, &rst->p_var_folded); + rst->p_ref = ma->afs1[ma->M]; + for (k = 0, sum = 0.; k < ma->M; ++k) + sum += ma->afs1[k]; + rst->p_var = (double)sum; + { // compute the allele count + double max = -1; + rst->ac = -1; + for (k = 0; k <= ma->M; ++k) + if (max < ma->z[k]) max = ma->z[k], rst->ac = k; + rst->ac = ma->M - rst->ac; + } + // calculate f_flat and f_em + for (k = 0, sum = 0.; k <= ma->M; ++k) + sum += (long double)ma->z[k]; + rst->f_flat = 0.; + for (k = 0; k <= ma->M; ++k) { + double p = ma->z[k] / sum; + rst->f_flat += k * p; + } + rst->f_flat /= ma->M; + { // estimate equal-tail credible interval (95% level) + int l, h; + double p; + for (i = 0, p = 0.; i <= ma->M; ++i) + if (p + ma->afs1[i] > 0.025) break; + else p += ma->afs1[i]; + l = i; + for (i = ma->M, p = 0.; i >= 0; --i) + if (p + ma->afs1[i] > 0.025) break; + else p += ma->afs1[i]; + h = i; + rst->cil = (double)(ma->M - h) / ma->M; rst->cih = (double)(ma->M - l) / ma->M; + } + if (ma->n1 > 0) { // compute LRT + double max0, max1, max2; + for (k = 0, max0 = -1; k <= ma->M; ++k) + if (max0 < ma->z[k]) max0 = ma->z[k]; + for (k = 0, max1 = -1; k <= ma->n1 * 2; ++k) + if (max1 < ma->z1[k]) max1 = ma->z1[k]; + for (k = 0, max2 = -1; k <= ma->M - ma->n1 * 2; ++k) + if 
(max2 < ma->z2[k]) max2 = ma->z2[k]; + rst->lrt = log(max1 * max2 / max0); + rst->lrt = rst->lrt < 0? 1 : kf_gammaq(.5, rst->lrt); + } else rst->lrt = -1.0; + rst->cmp[0] = rst->cmp[1] = rst->cmp[2] = rst->p_chi2 = -1.0; + if (do_contrast && rst->p_var > 0.5) // skip contrast2() if the locus is a strong non-variant + rst->p_chi2 = contrast2(ma, rst->cmp); + return 0; +} + +void bcf_p1_dump_afs(bcf_p1aux_t *ma) +{ + int k; + fprintf(stderr, "[afs]"); + for (k = 0; k <= ma->M; ++k) + fprintf(stderr, " %d:%.3lf", k, ma->afs[ma->M - k]); + fprintf(stderr, "\n"); + memset(ma->afs, 0, sizeof(double) * (ma->M + 1)); +}
/* Public interface of the prob1 caller (implemented in prob1.c). */
#ifndef BCF_PROB1_H
#define BCF_PROB1_H

#include "bcf.h"

/* Opaque handle: the fields are only visible inside prob1.c. */
struct __bcf_p1aux_t;
typedef struct __bcf_p1aux_t bcf_p1aux_t;

/* Per-site results filled in by bcf_p1_cal(). */
typedef struct {
	int rank0, perm_rank; // NB: perm_rank is always set to -1 by bcf_p1_cal()
	int ac; // ML alternative allele count
	// f_exp: value returned by the AFS computation (-1 if it hit NaN/inf);
	// f_flat: mean allele count under normalised z[] divided by M;
	// p_ref / p_var: afs1[M] and the sum of afs1[0..M-1]
	double f_exp, f_flat, p_ref_folded, p_ref, p_var_folded, p_var;
	// bounds of the 95% equal-tail credible interval
	double cil, cih;
	// cmp[] and p_chi2 stay at -1 unless the contrast is computed;
	// lrt is -1 when no n1 has been set
	double cmp[3], p_chi2, lrt; // used by contrast2()
} bcf_p1rst_t;

/* Annotation filled in by test16(); when printed by the callers in prob1.c,
   p[] becomes INFO/PV4 (only if is_tested), d[] INFO/DP4 and mq INFO/MQ. */
typedef struct {
	double p[4];
	int mq, depth, is_tested, d[4];
} anno16_t;

/* Prior types accepted by bcf_p1_init_prior() / bcf_p1_init_subprior(). */
#define MC_PTYPE_FULL  1
#define MC_PTYPE_COND2 2
#define MC_PTYPE_FLAT  3

#ifdef __cplusplus
extern "C" {
#endif

	// n samples; ploidy is an optional array of n per-sample ploidies
	bcf_p1aux_t *bcf_p1_init(int n, uint8_t *ploidy);
	void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta);
	void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta);
	// accepts NULL
	void bcf_p1_destroy(bcf_p1aux_t *ma);
	// points b->ploidy at the ploidy array held by ma (no copy is made)
	void bcf_p1_set_ploidy(bcf1_t *b, bcf_p1aux_t *ma);
	// returns -1 when the record has no PL field or fewer than 2 alleles, else 0
	int bcf_p1_cal(const bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst);
	// returns -1 without PL, 1 for a site with no ALT allele, 0 for a skipped
	// site with REF starting with 'N' and more than 4 alleles, otherwise a
	// bit mask of the alleles present in the called genotypes
	int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold, int var_only);
	// returns q<<2 | genotype index (0..2), with q capped at 99
	int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k);
	// prints the accumulated AFS to stderr and resets it
	void bcf_p1_dump_afs(bcf_p1aux_t *ma);
	int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn);
	// returns -1 if n1 is 0, n1 >= number of samples, or haploid samples are
	// present; 0 on success
	int bcf_p1_set_n1(bcf_p1aux_t *b, int n1);
	void bcf_p1_set_folded(bcf_p1aux_t *p1a); // only effective when set_n1() is not called

	int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10]);

#ifdef __cplusplus
}
#endif

#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bcftools/vcf.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,249 @@ +#include <zlib.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include "bcf.h" +#include "kstring.h" +#include "kseq.h" +KSTREAM_INIT(gzFile, gzread, 4096) + +typedef struct { + gzFile fp; + FILE *fpout; + kstream_t *ks; + void *refhash; + kstring_t line; + int max_ref; +} vcf_t; + +bcf_hdr_t *vcf_hdr_read(bcf_t *bp) +{ + kstring_t meta, smpl; + int dret; + vcf_t *v; + bcf_hdr_t *h; + if (!bp->is_vcf) return bcf_hdr_read(bp); + h = calloc(1, sizeof(bcf_hdr_t)); + v = (vcf_t*)bp->v; + v->line.l = 0; + memset(&meta, 0, sizeof(kstring_t)); + memset(&smpl, 0, sizeof(kstring_t)); + while (ks_getuntil(v->ks, '\n', &v->line, &dret) >= 0) { + if (v->line.l < 2) continue; + if (v->line.s[0] != '#') { + free(meta.s); + free(smpl.s); + free(h); + return 0; // no sample line + } + if (v->line.s[0] == '#' && v->line.s[1] == '#') { + kputsn(v->line.s, v->line.l, &meta); kputc('\n', &meta); + } else if (v->line.s[0] == '#') { + int k; + ks_tokaux_t aux; + char *p; + for (p = kstrtok(v->line.s, "\t\n", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) { + if (k >= 9) { + kputsn(p, aux.p - p, &smpl); + kputc('\0', &smpl); + } + } + break; + } + } + kputc('\0', &meta); + h->name = 0; + h->sname = smpl.s; h->l_smpl = smpl.l; + h->txt = meta.s; h->l_txt = meta.l; + bcf_hdr_sync(h); + return h; +} + +bcf_t *vcf_open(const char *fn, const char *mode) +{ + bcf_t *bp; + vcf_t *v; + if (strchr(mode, 'b')) return bcf_open(fn, mode); + bp = calloc(1, sizeof(bcf_t)); + v = calloc(1, sizeof(vcf_t)); + bp->is_vcf = 1; + bp->v = v; + v->refhash = bcf_str2id_init(); + if (strchr(mode, 'r')) { + v->fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + v->ks = ks_init(v->fp); + } else if (strchr(mode, 'w')) + v->fpout = strcmp(fn, "-")? 
fopen(fn, "w") : stdout; + return bp; +} + +int vcf_dictread(bcf_t *bp, bcf_hdr_t *h, const char *fn) +{ + vcf_t *v; + gzFile fp; + kstream_t *ks; + kstring_t s, rn; + int dret; + if (bp == 0) return -1; + if (!bp->is_vcf) return 0; + s.l = s.m = 0; s.s = 0; + rn.m = rn.l = h->l_nm; rn.s = h->name; + v = (vcf_t*)bp->v; + fp = gzopen(fn, "r"); + ks = ks_init(fp); + while (ks_getuntil(ks, 0, &s, &dret) >= 0) { + bcf_str2id_add(v->refhash, strdup(s.s)); + kputs(s.s, &rn); kputc('\0', &rn); + if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); + } + ks_destroy(ks); + gzclose(fp); + h->l_nm = rn.l; h->name = rn.s; + bcf_hdr_sync(h); + free(s.s); + return 0; +} + +int vcf_close(bcf_t *bp) +{ + vcf_t *v; + if (bp == 0) return -1; + if (!bp->is_vcf) return bcf_close(bp); + v = (vcf_t*)bp->v; + if (v->fp) { + ks_destroy(v->ks); + gzclose(v->fp); + } + if (v->fpout) fclose(v->fpout); + free(v->line.s); + bcf_str2id_thorough_destroy(v->refhash); + free(v); + free(bp); + return 0; +} + +int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h) +{ + vcf_t *v = (vcf_t*)bp->v; + int i, has_ver = 0; + if (!bp->is_vcf) return bcf_hdr_write(bp, h); + if (h->l_txt > 0) { + if (strstr(h->txt, "##fileformat=")) has_ver = 1; + if (has_ver == 0) fprintf(v->fpout, "##fileformat=VCFv4.1\n"); + fwrite(h->txt, 1, h->l_txt - 1, v->fpout); + } + if (h->l_txt == 0) fprintf(v->fpout, "##fileformat=VCFv4.1\n"); + fprintf(v->fpout, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"); + for (i = 0; i < h->n_smpl; ++i) + fprintf(v->fpout, "\t%s", h->sns[i]); + fputc('\n', v->fpout); + return 0; +} + +int vcf_write(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b) +{ + vcf_t *v = (vcf_t*)bp->v; + extern void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s); + if (!bp->is_vcf) return bcf_write(bp, h, b); + bcf_fmt_core(h, b, &v->line); + fwrite(v->line.s, 1, v->line.l, v->fpout); + fputc('\n', v->fpout); + return v->line.l + 1; +} + +int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b) +{ + int dret, k, i, sync = 
0; + vcf_t *v = (vcf_t*)bp->v; + char *p, *q; + kstring_t str, rn; + ks_tokaux_t aux, a2; + if (!bp->is_vcf) return bcf_read(bp, h, b); + v->line.l = 0; + str.l = 0; str.m = b->m_str; str.s = b->str; + rn.l = rn.m = h->l_nm; rn.s = h->name; + if (ks_getuntil(v->ks, '\n', &v->line, &dret) < 0) return -1; + b->n_smpl = h->n_smpl; + for (p = kstrtok(v->line.s, "\t", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) { + *(char*)aux.p = 0; + if (k == 0) { // ref + int tid = bcf_str2id(v->refhash, p); + if (tid < 0) { + tid = bcf_str2id_add(v->refhash, strdup(p)); + kputs(p, &rn); kputc('\0', &rn); + sync = 1; + } + b->tid = tid; + } else if (k == 1) { // pos + b->pos = atoi(p) - 1; + } else if (k == 5) { // qual + b->qual = (p[0] >= '0' && p[0] <= '9')? atof(p) : 0; + } else if (k <= 8) { // variable length strings + kputs(p, &str); kputc('\0', &str); + b->l_str = str.l; b->m_str = str.m; b->str = str.s; + if (k == 8) bcf_sync(b); + } else { // k > 9 + if (strncmp(p, "./.", 3) == 0) { + for (i = 0; i < b->n_gi; ++i) { + if (b->gi[i].fmt == bcf_str2int("GT", 2)) { + ((uint8_t*)b->gi[i].data)[k-9] = 1<<7; + } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) { + ((uint8_t*)b->gi[i].data)[k-9] = 0; + } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) { + ((int32_t*)b->gi[i].data)[k-9] = 0; + } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) { + ((uint16_t*)b->gi[i].data)[k-9] = 0; + } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) { + int y = b->n_alleles * (b->n_alleles + 1) / 2; + memset((uint8_t*)b->gi[i].data + (k - 9) * y, 0, y); + } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) { + int y = b->n_alleles * (b->n_alleles + 1) / 2; + memset((float*)b->gi[i].data + (k - 9) * y, 0, y * 4); + } + } + goto endblock; + } + for (q = kstrtok(p, ":", &a2), i = 0; q && i < b->n_gi; q = kstrtok(0, 0, &a2), ++i) { + if (b->gi[i].fmt == bcf_str2int("GT", 2)) { + ((uint8_t*)b->gi[i].data)[k-9] = (q[0] - '0')<<3 | (q[2] - '0') | (q[1] == '/'? 
0 : 1) << 6; + } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) { + double _x = strtod(q, &q); + int x = (int)(_x + .499); + if (x > 255) x = 255; + ((uint8_t*)b->gi[i].data)[k-9] = x; + } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) { + int x = strtol(q, &q, 10); + if (x > 0xffff) x = 0xffff; + ((uint32_t*)b->gi[i].data)[k-9] = x; + } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) { + int x = strtol(q, &q, 10); + if (x > 0xffff) x = 0xffff; + ((uint16_t*)b->gi[i].data)[k-9] = x; + } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) { + int x, y, j; + uint8_t *data = (uint8_t*)b->gi[i].data; + y = b->n_alleles * (b->n_alleles + 1) / 2; + for (j = 0; j < y; ++j) { + x = strtol(q, &q, 10); + if (x > 255) x = 255; + data[(k-9) * y + j] = x; + ++q; + } + } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) { + int j, y; + float x, *data = (float*)b->gi[i].data; + y = b->n_alleles * (b->n_alleles + 1) / 2; + for (j = 0; j < y; ++j) { + x = strtod(q, &q); + data[(k-9) * y + j] = x > 0? -x/10. : x; + ++q; + } + } + } + endblock: i = i; + } + } + h->l_nm = rn.l; h->name = rn.s; + if (sync) bcf_hdr_sync(h); + return v->line.l + 1; +}
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bcftools/vcfutils.pl Fri Mar 26 16:52:45 2021 +0000
# @@ -0,0 +1,567 @@
#!/usr/bin/perl -w

# Author: lh3
#
# Collection of small VCF utilities. The first command-line argument selects
# a sub-command (see &usage); each sub-command parses its own options and
# reads VCF-like text from the remaining files or from STDIN.

use strict;
use warnings;
use Getopt::Std;

&main;
exit;

# Dispatch to the sub-command named by the first argument.
sub main {
  &usage if (@ARGV < 1);
  my $command = shift(@ARGV);
  my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
              hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats,
              gapstats=>\&gapstats, splitchr=>\&splitchr, vcf2fq=>\&vcf2fq);
  die("Unknown command \"$command\".\n") if (!defined($func{$command}));
  &{$func{$command}};
}

# Open $fn for reading and return the handle; names ending in ".gz" are piped
# through "gzip -dc". The list form of the pipe open is used so that the file
# name is never interpreted by a shell (spaces or metacharacters are safe).
sub open_maybe_gz {
  my ($fn) = @_;
  my $fh;
  if ($fn =~ /\.gz$/) {
    open($fh, '-|', 'gzip', '-dc', $fn) || die("Fail to run 'gzip -dc' on '$fn': $!\n");
  } else {
    # two-argument open kept on purpose: "-" still means STDIN, as before
    open($fh, $fn) || die("Fail to open '$fn': $!\n");
  }
  return $fh;
}

# Split each sequence of a .fai index into regions of about -l bp and print
# them as "chr:beg-end". A trailing piece shorter than 10% of -l is merged
# into the preceding region.
sub splitchr {
  my %opts = (l=>5000000);
  getopts('l:', \%opts);
  my $l = $opts{l};
  die(qq/Usage: vcfutils.pl splitchr [-l $opts{l}] <in.fa.fai>\n/) if (@ARGV == 0 && -t STDIN);
  while (<>) {
    my @t = split;
    for (my $i = 0; $i < $t[1];) {
      my $e = ($t[1] - $i) / $l < 1.1? $t[1] : $i + $l;
      print "$t[0]:".($i+1)."-$e\n";
      $i = $e;
    }
  }
}

# Keep only the sample columns named on the command line. When none of the
# requested samples exists, the FORMAT column is dropped as well.
sub subsam {
  die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
  my %h;
  my $fn = shift(@ARGV);
  my @col;
  my $fh = open_maybe_gz($fn);
  $h{$_} = 1 for (@ARGV);
  while (<$fh>) {
    if (/^##/) {
      print;
    } elsif (/^#/) {
      my @t = split;
      my @s = @t[0..8]; # all fixed fields + FORMAT
      for (9 .. $#t) {
        if ($h{$t[$_]}) {
          push(@s, $t[$_]);
          push(@col, $_);
        }
      }
      pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field
      print join("\t", @s), "\n";
    } else {
      my @t = split;
      if (@col == 0) {
        print join("\t", @t[0..7]), "\n";
      } else {
        print join("\t", @t[0..8], map {$t[$_]} @col), "\n";
      }
    }
  }
  close($fh);
}

# Print the sample names found on the "#CHROM" header line, one per line.
sub listsam {
  die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/) if (@ARGV == 0 && -t STDIN);
  while (<>) {
    if (/^#/ && !/^##/) {
      my @t = split;
      print join("\n", @t[9..$#t]), "\n";
      exit;
    }
  }
}

# Recompute the AC and AN INFO tags from the GT field of every sample.
sub fillac {
  die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN);
  while (<>) {
    if (/^#/) {
      print;
    } else {
      my @t = split;
      my @c = (0, 0);
      my $n = 0;
      my $s = -1;
      @_ = split(":", $t[8]);
      for (0 .. $#_) {
        if ($_[$_] eq 'GT') { $s = $_; last; }
      }
      if ($s < 0) {
        print join("\t", @t), "\n";
        next;
      }
      for (9 .. $#t) {
        if ($t[$_] =~ /^0,0,0/) {
        } elsif ($t[$_] =~ /^([^\s:]+:){$s}(\d+).(\d+)/) {
          ++$c[$2]; ++$c[$3];
          $n += 2;
        }
      }
      # One count per ALT allele, comma separated: a TAB here would split the
      # INFO column. Alleles that were never observed count as 0.
      my $AC = "AC=" . join(",", map { defined($_)? $_ : 0 } @c[1..$#c]) . ";AN=$n";
      my $info = $t[7];
      # drop the previous AC/AN values (AC may be a comma-separated list)
      $info =~ s/(?:^|;)AC=[\d,]+//;
      $info =~ s/(?:^|;)AN=\d+//;
      $info =~ s/^;//; # removing the leading tag leaves a stray separator
      if ($info eq '.' || $info eq '') {
        $info = $AC;
      } else {
        $info .= ";$AC";
      }
      $t[7] = $info;
      print join("\t", @t), "\n";
    }
  }
}

# Summarise adjacent SNP pairs whose NEIR value exceeds the -t cutoff.
sub ldstats {
  my %opts = (t=>0.9);
  getopts('t:', \%opts);
  die("Usage: vcfutils.pl ldstats [-t $opts{t}] <in.vcf>\n") if (@ARGV == 0 && -t STDIN);
  my $cutoff = $opts{t};
  my ($last, $lastchr) = (0x7fffffff, '');
  my ($x, $y, $n) = (0, 0, 0);
  while (<>) {
    if (/^([^#\s]+)\s(\d+)/) {
      my ($chr, $pos) = ($1, $2);
      if (/NEIR=([\d\.]+)/) {
        ++$n;
        ++$y, $x += $pos - $last if ($lastchr eq $chr && $pos > $last && $1 > $cutoff);
      }
      $last = $pos; $lastchr = $chr;
    }
  }
  print "Number of SNP intervals in strong LD (r > $opts{t}): $y\n";
  print "Fraction: ", ($n? $y/$n : 0), "\n"; # no NEIR record at all: avoid dividing by zero
  print "Length: $x\n";
}

# SNP statistics stratified by QUAL, optionally compared to a reference set
# given with -r (a position list, or a VCF when -v is set).
sub qstats {
  my %opts = (r=>'', s=>0.02, v=>undef);
  getopts('r:s:v', \%opts);
  die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n
Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN);
  my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
  my %h = ();
  my $is_vcf = defined($opts{v})? 1 : 0;
  if ($opts{r}) { # read the reference positions
    my $fh;
    open($fh, $opts{r}) || die;
    while (<$fh>) {
      next if (/^#/);
      if ($is_vcf) {
        my @t = split;
        $h{$t[0],$t[1]} = $t[4];
      } else {
        $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/);
      }
    }
    close($fh);
  }
  my $hsize = scalar(keys %h);
  my @a;
  while (<>) {
    next if (/^#/);
    my @t = split;
    next if (length($t[3]) != 1 || uc($t[3]) eq 'N');
    $t[3] = uc($t[3]); $t[4] = uc($t[4]);
    my @s = split(',', $t[4]);
    $t[5] = 3 if ($t[5] eq '.' || $t[5] < 0);
    next if (length($s[0]) != 1);
    my $hit;
    if ($is_vcf) {
      $hit = 0;
      my $aa = $h{$t[0],$t[1]};
      if (defined($aa)) {
        my @aaa = split(",", $aa);
        for (@aaa) {
          $hit = 1 if ($_ eq $s[0]);
        }
      }
    } else {
      $hit = defined($h{$t[0],$t[1]})? 1 : 0;
    }
    push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]);
  }
  # must be tested before the end marker is appended, or it can never trigger
  die("[qstats] No SNP data!\n") if (@a == 0);
  push(@a, [-1, 0, 0, 0]); # end marker
  @a = sort {$b->[0]<=>$a->[0]} @a;
  my $next = $opts{s};
  my $last = $a[0][0]; # the QUAL value, not the record reference
  my @c = (0, 0, 0, 0);
  my @lc;
  $lc[1] = $lc[2] = 0;
  for my $p (@a) {
    if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) {
      my @x;
      $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100);
      $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0);
      $x[2] = sprintf("%.4f", $c[1]? $c[3] / $c[1] : 0); # no SNP counted so far
      my $a = $c[1] - $lc[1];
      my $b = $c[2] - $lc[2];
      $x[3] = sprintf("%.4f", $a-$b? $b / ($a-$b) : 100);
      print join("\t", $last, @c, @x), "\n";
      $next = $c[0]/@a + $opts{s};
      $lc[1] = $c[1]; $lc[2] = $c[2];
    }
    ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3];
    $last = $p->[0];
  }
}

# Filter short variants on depth, mapping quality, bias P-values and on the
# proximity of SNPs/indels to other indels. Records are held in @staging
# until no later record can affect them any more.
sub varFilter {
  my %opts = (d=>2, D=>10000000, a=>2, W=>10, Q=>10, w=>3, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, G=>0, S=>1000, e=>1e-4);
  getopts('pd:D:W:Q:w:a:1:2:3:4:G:S:e:', \%opts);
  die(qq/
Usage:   vcfutils.pl varFilter [options] <in.vcf>

Options: -Q INT    minimum RMS mapping quality for SNPs [$opts{Q}]
         -d INT    minimum read depth [$opts{d}]
         -D INT    maximum read depth [$opts{D}]
         -a INT    minimum number of alternate bases [$opts{a}]
         -w INT    SNP within INT bp around a gap to be filtered [$opts{w}]
         -W INT    window size for filtering adjacent gaps [$opts{W}]
         -1 FLOAT  min P-value for strand bias (given PV4) [$opts{1}]
         -2 FLOAT  min P-value for baseQ bias [$opts{2}]
         -3 FLOAT  min P-value for mapQ bias [$opts{3}]
         -4 FLOAT  min P-value for end distance bias [$opts{4}]
         -e FLOAT  min P-value for HWE (plus F<0) [$opts{e}]
         -p        print filtered variants

Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools.
\n/) if (@ARGV == 0 && -t STDIN);

  # calculate the window size
  my ($ol, $ow) = ($opts{W}, $opts{w});
  my $max_dist = $ol > $ow? $ol : $ow;
  # the core loop
  my @staging; # (indel_filtering_score, flt_tag, indel_span; chr, pos, ...)
  while (<>) {
    my @t = split;
    if (/^#/) {
      print; next;
    }
    next if ($t[4] eq '.'); # skip non-var sites
    next if ($t[3] eq 'N'); # skip sites with unknown ref ('N')
    # check if the site is a SNP
    my $type = 1; # SNP
    if (length($t[3]) > 1) {
      $type = 2; # MNP
      my @s = split(',', $t[4]);
      for (@s) {
        $type = 3 if (length != length($t[3]));
      }
    } else {
      my @s = split(',', $t[4]);
      for (@s) {
        $type = 3 if (length > 1);
      }
    }
    # clear the out-of-range elements
    while (@staging) {
      # Still on the same chromosome and the first element's window still affects this position?
      last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
      varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
    }
    my $flt = 0;
    # parse annotations
    my ($dp, $mq, $dp_alt) = (-1, -1, -1);
    if ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
      $dp = $1 + $2 + $3 + $4;
      $dp_alt = $3 + $4;
    }
    if ($t[7] =~ /DP=(\d+)/i) {
      $dp = $1;
    }
    $mq = $1 if ($t[7] =~ /MQ=(\d+)/i);
    # the depth and mapQ filter
    if ($dp >= 0) {
      if ($dp < $opts{d}) {
        $flt = 2;
      } elsif ($dp > $opts{D}) {
        $flt = 3;
      }
    }
    $flt = 4 if ($dp_alt >= 0 && $dp_alt < $opts{a});
    $flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q});
    $flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/
                 && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
    $flt = 8 if ($flt == 0 && ((/MXGQ=(\d+)/ && $1 < $opts{G}) || (/MXSP=(\d+)/ && $1 >= $opts{S})));
    # HWE filter
    if ($t[7] =~ /G3=([^;,]+),([^;,]+),([^;,]+).*HWE=([^;,]+)/ && $4 < $opts{e}) {
      my $p = 2*$1 + $2;
      my $f = ($p > 0 && $p < 1)? 1 - $2 / ($p * (1-$p)) : 0;
      $flt = 9 if ($f < 0);
    }

    my $score = $t[5] * 100 + $dp_alt;
    my $rlen = length($t[3]) - 1; # $indel_score<0 for SNPs
    if ($flt == 0) {
      if ($type == 3) { # an indel
        # filtering SNPs and MNPs
        for my $x (@staging) {
          next if (($x->[0]&3) == 3 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
          $x->[1] = 5;
        }
        # check the staging list for indel filtering
        for my $x (@staging) {
          next if (($x->[0]&3) != 3 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]);
          if ($x->[0]>>2 < $score) {
            $x->[1] = 6;
          } else {
            $flt = 6; last;
          }
        }
      } else { # SNP or MNP
        for my $x (@staging) {
          next if (($x->[0]&3) != 3 || $x->[4] + $x->[2] + $ow < $t[1]);
          if ($x->[4] + length($x->[7]) - 1 == $t[1] && substr($x->[7], -1, 1) eq substr($t[4], 0, 1)
              && length($x->[7]) - length($x->[6]) == 1) {
            $x->[1] = 5;
          } else { $flt = 5; }
          last;
        }
        # check MNP
        for my $x (@staging) {
          next if (($x->[0]&3) == 3 || $x->[4] + $x->[2] < $t[1]);
          if ($x->[0]>>2 < $score) {
            $x->[1] = 8;
          } else {
            $flt = 8; last;
          }
        }
      }
    }
    push(@staging, [$score<<2|$type, $flt, $rlen, @t]);
  }
  # output the last few elements in the staging list
  while (@staging) {
    varFilter_aux(shift @staging, $opts{p});
  }
}

# Emit one staged record: to STDOUT when it passed every filter, otherwise
# (only with -p) to STDERR, prefixed by a one-letter code for the filter.
sub varFilter_aux {
  my ($first, $is_print) = @_;
  if ($first->[1] == 0) {
    print join("\t", @$first[3 .. @$first-1]), "\n";
  } elsif ($is_print) {
    print STDERR join("\t", substr("UQdDaGgPMS", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
  }
}

# Histogram of indel lengths (offset by 5000 so deletions get a non-negative
# slot) and the fraction of indels whose length is a multiple of three.
sub gapstats {
  my (@c0, @c1);
  $c0[$_] = $c1[$_] = 0 for (0 .. 10000);
  while (<>) {
    next if (/^#/);
    my @t = split;
    next if (length($t[3]) == 1 && $t[4] =~ /^[A-Za-z](,[A-Za-z])*$/); # not an indel
    my @s = split(',', $t[4]);
    for my $x (@s) {
      my $l = length($x) - length($t[3]) + 5000;
      if ($x =~ /^-/) {
        $l = -(length($x) - 1) + 5000;
      } elsif ($x =~ /^\+/) {
        $l = length($x) - 1 + 5000;
      }
      # a deletion longer than 5000 bp would give a negative index, which Perl
      # counts from the END of the array and so would land in the wrong bucket
      next if ($l < 0);
      $c0[$l] += 1 / @s;
    }
  }
  for (my $i = 0; $i < 10000; ++$i) {
    next if ($c0[$i] == 0);
    $c1[0] += $c0[$i];
    $c1[1] += $c0[$i] if (($i-5000)%3 == 0);
    printf("C\t%d\t%.2f\n", ($i-5000), $c0[$i]);
  }
  printf("3\t%d\t%d\t%.3f\n", $c1[0], $c1[1], $c1[0]? $c1[1]/$c1[0] : 0); # no indel seen: avoid dividing by zero
}

# Convert a UCSC SNP SQL dump (tab-delimited) to a sites-only VCF.
sub ucscsnp2vcf {
  die("Usage: vcfutils.pl <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
  print "##fileformat=VCFv4.0\n";
  print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n";
  while (<>) {
    my @t = split("\t");
    my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1;
    my $pos = $t[2] + 1;
    my @alt;
    push(@alt, $t[7]);
    if ($t[6] eq '-') {
      $t[9] = reverse($t[9]);
      $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
    }
    my @a = split("/", $t[9]);
    for (@a) {
      push(@alt, $_) if ($_ ne $alt[0]);
    }
    if ($indel) {
      --$pos;
      for (0 .. $#alt) {
        $alt[$_] =~ tr/-//d;
        $alt[$_] = "N$alt[$_]";
      }
    }
    my $ref = shift(@alt);
    my $af = $t[13] > 0? ";AF=$t[13]" : '';
    my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]";
    my $info = "molType=$t[10];class=$t[11]$valid$af";
    print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n";
  }
}

# Convert HapMap genotypes to VCF, taking coordinates and the reference
# allele from a UCSC SNP dump given as the first argument.
sub hapmap2vcf {
  die("Usage: vcfutils.pl <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0);
  my $fn = shift(@ARGV);
  # parse UCSC SNP
  warn("Parsing UCSC SNPs...\n");
  my %map;
  my $fh = open_maybe_gz($fn);
  while (<$fh>) {
    my @t = split;
    next if ($t[3] - $t[2] != 1); # not SNP
    @{$map{$t[4]}} = @t[1,3,7];
  }
  close($fh);
  # write VCF
  warn("Writing VCF...\n");
  print "##fileformat=VCFv4.0\n";
  while (<>) {
    my @t = split;
    if ($t[0] eq 'rs#') { # the first line
      print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n";
    } else {
      next unless ($map{$t[0]});
      next if (length($t[1]) != 3); # skip non-SNPs
      my $a = \@{$map{$t[0]}};
      my $ref = $a->[2];
      my @u = split('/', $t[1]);
      if ($u[1] eq $ref) {
        $u[1] = $u[0]; $u[0] = $ref;
      } elsif ($u[0] ne $ref) { next; }
      my $alt = $u[1];
      my %w;
      $w{$u[0]} = 0; $w{$u[1]} = 1;
      my @s = (@$a[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT');
      my $is_tri = 0;
      for (@t[11..$#t]) {
        if ($_ eq 'NN') {
          push(@s, './.');
        } else {
          my @a = ($w{substr($_,0,1)}, $w{substr($_,1,1)});
          if (!defined($a[0]) || !defined($a[1])) {
            $is_tri = 1;
            last;
          }
          push(@s, "$a[0]/$a[1]");
        }
      }
      next if ($is_tri);
      print join("\t", @s), "\n";
    }
  }
}

# Build a consensus FASTQ from an all-site VCF; bases within -l bp of an
# indel, or failing the depth/mapQ thresholds, are written in lower case.
sub vcf2fq {
  my %opts = (d=>3, D=>100000, Q=>10, l=>5);
  getopts('d:D:Q:l:', \%opts);
  die(qq/
Usage:   vcfutils.pl vcf2fq [options] <all-site.vcf>

Options: -d INT    minimum depth          [$opts{d}]
         -D INT    maximum depth          [$opts{D}]
         -Q INT    min RMS mapQ           [$opts{Q}]
         -l INT    INDEL filtering window [$opts{l}]
\n/) if (@ARGV == 0 && -t STDIN);

  my ($last_chr, $seq, $qual, $last_pos, @gaps);
  my $_Q = $opts{Q};
  my $_d = $opts{d};
  my $_D = $opts{D};

  my %het = (AC=>'M', AG=>'R', AT=>'W', CA=>'M', CG=>'S', CT=>'Y',
             GA=>'R', GC=>'S', GT=>'K', TA=>'W', TC=>'Y', TG=>'K');

  $last_chr = '';
  while (<>) {
    next if (/^#/);
    my @t = split;
    if ($last_chr ne $t[0]) {
      &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr);
      ($last_chr, $last_pos) = ($t[0], 0);
      $seq = $qual = '';
      @gaps = ();
    }
    die("[vcf2fq] unsorted input\n") if ($t[1] - $last_pos < 0);
    if ($t[1] - $last_pos > 1) {
      $seq .= 'n' x ($t[1] - $last_pos - 1);
      $qual .= '!' x ($t[1] - $last_pos - 1);
    }
    if (length($t[3]) == 1 && $t[7] !~ /INDEL/ && $t[4] =~ /^([A-Za-z.])(,[A-Za-z])*$/) { # a SNP or reference
      my ($ref, $alt) = ($t[3], $1);
      my ($b, $q);
      $q = $1 if ($t[7] =~ /FQ=(-?[\d\.]+)/);
      if ($q < 0) {
        $_ = ($t[7] =~ /AF1=([\d\.]+)/)? $1 : 0;
        $b = ($_ < .5 || $alt eq '.')? $ref : $alt;
        $q = -$q;
      } else {
        $b = $het{"$ref$alt"};
        $b ||= 'N';
      }
      $b = lc($b);
      $b = uc($b) if (($t[7] =~ /MQ=(\d+)/ && $1 >= $_Q) && ($t[7] =~ /DP=(\d+)/ && $1 >= $_d && $1 <= $_D));
      $q = int($q + 33 + .499);
      $q = chr($q <= 126? $q : 126);
      $seq .= $b;
      $qual .= $q;
    } elsif ($t[4] ne '.') { # an INDEL
      push(@gaps, [$t[1], length($t[3])]);
    }
    $last_pos = $t[1];
  }
  # same guard as inside the loop: with no record there is nothing to flush
  &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr);
}

# Lower-case the bases around every recorded indel, then print one FASTQ
# record for the chromosome.
sub v2q_post_process {
  my ($chr, $seq, $qual, $gaps, $l) = @_;
  for my $g (@$gaps) {
    my $beg = $g->[0] > $l? $g->[0] - $l : 0;
    my $end = $g->[0] + $g->[1] + $l;
    $end = length($$seq) if ($end > length($$seq));
    substr($$seq, $beg, $end - $beg) = lc(substr($$seq, $beg, $end - $beg));
  }
  print "\@$chr\n"; &v2q_print_str($seq);
  print "+\n"; &v2q_print_str($qual);
}

# Print the referenced string wrapped at 60 characters per line.
sub v2q_print_str {
  my ($s) = @_;
  my $l = length($$s);
  for (my $i = 0; $i < $l; $i += 60) {
    print substr($$s, $i, 60), "\n";
  }
}

sub usage {
  die(qq/
Usage:   vcfutils.pl <command> [<arguments>]\n
Command: subsam       get a subset of samples
         listsam      list the samples
         fillac       fill the allele count field
         qstats       SNP stats stratified by QUAL

         hapmap2vcf   convert the hapmap format to VCF
         ucscsnp2vcf  convert UCSC SNP SQL dump to VCF

         varFilter    filtering short variants (*)
         vcf2fq       VCF->fastq (**)

Notes: Commands with description endting with (*) may need bcftools
       specific annotations.
\n/);
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bedcov.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,127 @@ +#include <zlib.h> +#include <stdio.h> +#include <ctype.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include "kstring.h" +#include "bgzf.h" +#include "bam.h" + +#include "kseq.h" +KSTREAM_INIT(gzFile, gzread, 16384) + +typedef struct { + bamFile fp; + bam_iter_t iter; + int min_mapQ; +} aux_t; + +static int read_bam(void *data, bam1_t *b) +{ + aux_t *aux = (aux_t*)data; + int ret = bam_iter_read(aux->fp, aux->iter, b); + if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP; + return ret; +} + +int main_bedcov(int argc, char *argv[]) +{ + extern void bam_init_header_hash(bam_header_t*); + gzFile fp; + kstring_t str; + kstream_t *ks; + bam_index_t **idx; + bam_header_t *h = 0; + aux_t **aux; + int *n_plp, dret, i, n, c, min_mapQ = 0; + int64_t *cnt; + const bam_pileup1_t **plp; + + while ((c = getopt(argc, argv, "Q:")) >= 0) { + switch (c) { + case 'Q': min_mapQ = atoi(optarg); break; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: samtools bedcov <in.bed> <in1.bam> [...]\n"); + return 1; + } + memset(&str, 0, sizeof(kstring_t)); + n = argc - optind - 1; + aux = calloc(n, sizeof(void*)); + idx = calloc(n, sizeof(void*)); + for (i = 0; i < n; ++i) { + aux[i] = calloc(1, sizeof(aux_t)); + aux[i]->min_mapQ = min_mapQ; + aux[i]->fp = bam_open(argv[i+optind+1], "r"); + idx[i] = bam_index_load(argv[i+optind+1]); + if (aux[i]->fp == 0 || idx[i] == 0) { + fprintf(stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]); + return 2; + } + bgzf_set_cache_size(aux[i]->fp, 20); + if (i == 0) h = bam_header_read(aux[0]->fp); + } + bam_init_header_hash(h); + cnt = calloc(n, 8); + + fp = gzopen(argv[optind], "rb"); + ks = ks_init(fp); + n_plp = calloc(n, sizeof(int)); + plp = calloc(n, sizeof(void*)); + while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) { + char *p, *q; + int 
tid, beg, end, pos; + bam_mplp_t mplp; + + for (p = q = str.s; *p && *p != '\t'; ++p); + if (*p != '\t') goto bed_error; + *p = 0; tid = bam_get_tid(h, q); *p = '\t'; + if (tid < 0) goto bed_error; + for (q = p = p + 1; isdigit(*p); ++p); + if (*p != '\t') goto bed_error; + *p = 0; beg = atoi(q); *p = '\t'; + for (q = p = p + 1; isdigit(*p); ++p); + if (*p == '\t' || *p == 0) { + int c = *p; + *p = 0; end = atoi(q); *p = c; + } else goto bed_error; + + for (i = 0; i < n; ++i) { + if (aux[i]->iter) bam_iter_destroy(aux[i]->iter); + aux[i]->iter = bam_iter_query(idx[i], tid, beg, end); + } + mplp = bam_mplp_init(n, read_bam, (void**)aux); + bam_mplp_set_maxcnt(mplp, 64000); + memset(cnt, 0, 8 * n); + while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) + if (pos >= beg && pos < end) + for (i = 0; i < n; ++i) cnt[i] += n_plp[i]; + for (i = 0; i < n; ++i) { + kputc('\t', &str); + kputl(cnt[i], &str); + } + puts(str.s); + bam_mplp_destroy(mplp); + continue; + +bed_error: + fprintf(stderr, "Errors in BED line '%s'\n", str.s); + } + free(n_plp); free(plp); + ks_destroy(ks); + gzclose(fp); + + free(cnt); + for (i = 0; i < n; ++i) { + if (aux[i]->iter) bam_iter_destroy(aux[i]->iter); + bam_index_destroy(idx[i]); + bam_close(aux[i]->fp); + free(aux[i]); + } + bam_header_destroy(h); + free(aux); free(idx); + free(str.s); + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bedidx.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,162 @@ +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <stdio.h> +#include <zlib.h> + +#ifdef _WIN32 +#define drand48() ((double)rand() / RAND_MAX) +#endif + +#include "ksort.h" +KSORT_INIT_GENERIC(uint64_t) + +#include "kseq.h" +KSTREAM_INIT(gzFile, gzread, 8192) + +typedef struct { + int n, m; + uint64_t *a; + int *idx; +} bed_reglist_t; + +#include "khash.h" +KHASH_MAP_INIT_STR(reg, bed_reglist_t) + +#define LIDX_SHIFT 13 + +typedef kh_reg_t reghash_t; + +int *bed_index_core(int n, uint64_t *a, int *n_idx) +{ + int i, j, m, *idx; + m = *n_idx = 0; idx = 0; + for (i = 0; i < n; ++i) { + int beg, end; + beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT; + if (m < end + 1) { + int oldm = m; + m = end + 1; + kroundup32(m); + idx = realloc(idx, m * sizeof(int)); + for (j = oldm; j < m; ++j) idx[j] = -1; + } + if (beg == end) { + if (idx[beg] < 0) idx[beg] = i; + } else { + for (j = beg; j <= end; ++j) + if (idx[j] < 0) idx[j] = i; + } + *n_idx = end + 1; + } + return idx; +} + +void bed_index(void *_h) +{ + reghash_t *h = (reghash_t*)_h; + khint_t k; + for (k = 0; k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + bed_reglist_t *p = &kh_val(h, k); + if (p->idx) free(p->idx); + ks_introsort(uint64_t, p->n, p->a); + p->idx = bed_index_core(p->n, p->a, &p->m); + } + } +} + +int bed_overlap_core(const bed_reglist_t *p, int beg, int end) +{ + int i, min_off; + if (p->n == 0) return 0; + min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT]; + if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here + int n = beg>>LIDX_SHIFT; + if (n > p->n) n = p->n; + for (i = n - 1; i >= 0; --i) + if (p->idx[i] >= 0) break; + min_off = i >= 0? 
p->idx[i] : 0; + } + for (i = min_off; i < p->n; ++i) { + if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed + if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end) + return 1; // find the overlap; return + } + return 0; +} + +int bed_overlap(const void *_h, const char *chr, int beg, int end) +{ + const reghash_t *h = (const reghash_t*)_h; + khint_t k; + if (!h) return 0; + k = kh_get(reg, h, chr); + if (k == kh_end(h)) return 0; + return bed_overlap_core(&kh_val(h, k), beg, end); +} + +void *bed_read(const char *fn) +{ + reghash_t *h = kh_init(reg); + gzFile fp; + kstream_t *ks; + int dret; + kstring_t *str; + // read the list + fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) return 0; + str = calloc(1, sizeof(kstring_t)); + ks = ks_init(fp); + while (ks_getuntil(ks, 0, str, &dret) >= 0) { // read the chr name + int beg = -1, end = -1; + bed_reglist_t *p; + khint_t k = kh_get(reg, h, str->s); + if (k == kh_end(h)) { // absent from the hash table + int ret; + char *s = strdup(str->s); + k = kh_put(reg, h, s, &ret); + memset(&kh_val(h, k), 0, sizeof(bed_reglist_t)); + } + p = &kh_val(h, k); + if (dret != '\n') { // if the lines has other characters + if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { + beg = atoi(str->s); // begin + if (dret != '\n') { + if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { + end = atoi(str->s); // end + if (end < beg) end = -1; + } + } + } + } + if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line + if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column + if (beg >= 0 && end > beg) { + if (p->n == p->m) { + p->m = p->m? 
p->m<<1 : 4; + p->a = realloc(p->a, p->m * 8); + } + p->a[p->n++] = (uint64_t)beg<<32 | end; + } + } + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + bed_index(h); + return h; +} + +void bed_destroy(void *_h) +{ + reghash_t *h = (reghash_t*)_h; + khint_t k; + for (k = 0; k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + free(kh_val(h, k).a); + free(kh_val(h, k).idx); + free((char*)kh_key(h, k)); + } + } + kh_destroy(reg, h); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bgzf.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,694 @@ +/* The MIT License + + Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology + 2011 Attractive Chaos <attractor@live.co.uk> + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+*/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <assert.h> +#include <pthread.h> +#include <sys/types.h> +#include "bgzf.h" + +#ifdef _USE_KNETFILE +#include "knetfile.h" +typedef knetFile *_bgzf_file_t; +#define _bgzf_open(fn, mode) knet_open(fn, mode) +#define _bgzf_dopen(fp, mode) knet_dopen(fp, mode) +#define _bgzf_close(fp) knet_close(fp) +#define _bgzf_fileno(fp) ((fp)->fd) +#define _bgzf_tell(fp) knet_tell(fp) +#define _bgzf_seek(fp, offset, whence) knet_seek(fp, offset, whence) +#define _bgzf_read(fp, buf, len) knet_read(fp, buf, len) +#define _bgzf_write(fp, buf, len) knet_write(fp, buf, len) +#else // ~defined(_USE_KNETFILE) +#if defined(_WIN32) || defined(_MSC_VER) +#define ftello(fp) ftell(fp) +#define fseeko(fp, offset, whence) fseek(fp, offset, whence) +#else // ~defined(_WIN32) +extern off_t ftello(FILE *stream); +extern int fseeko(FILE *stream, off_t offset, int whence); +#endif // ~defined(_WIN32) +typedef FILE *_bgzf_file_t; +#define _bgzf_open(fn, mode) fopen(fn, mode) +#define _bgzf_dopen(fp, mode) fdopen(fp, mode) +#define _bgzf_close(fp) fclose(fp) +#define _bgzf_fileno(fp) fileno(fp) +#define _bgzf_tell(fp) ftello(fp) +#define _bgzf_seek(fp, offset, whence) fseeko(fp, offset, whence) +#define _bgzf_read(fp, buf, len) fread(buf, 1, len, fp) +#define _bgzf_write(fp, buf, len) fwrite(buf, 1, len, fp) +#endif // ~define(_USE_KNETFILE) + +#define BLOCK_HEADER_LENGTH 18 +#define BLOCK_FOOTER_LENGTH 8 + + +/* BGZF/GZIP header (speciallized from RFC 1952; little endian): + +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ + | 31|139| 8| 4| 0| 0|255| 6| 66| 67| 2|BLK_LEN| + +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ +*/ +static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0"; + +#ifdef BGZF_CACHE +typedef struct { + int size; + uint8_t *block; + int64_t end_offset; +} cache_t; +#include "khash.h" 
+KHASH_MAP_INIT_INT64(cache, cache_t) +#endif + +static inline void packInt16(uint8_t *buffer, uint16_t value) +{ + buffer[0] = value; + buffer[1] = value >> 8; +} + +static inline int unpackInt16(const uint8_t *buffer) +{ + return buffer[0] | buffer[1] << 8; +} + +static inline void packInt32(uint8_t *buffer, uint32_t value) +{ + buffer[0] = value; + buffer[1] = value >> 8; + buffer[2] = value >> 16; + buffer[3] = value >> 24; +} + +static BGZF *bgzf_read_init() +{ + BGZF *fp; + fp = calloc(1, sizeof(BGZF)); + fp->is_write = 0; + fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE); + fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE); +#ifdef BGZF_CACHE + fp->cache = kh_init(cache); +#endif + return fp; +} + +static BGZF *bgzf_write_init(int compress_level) // compress_level==-1 for the default level +{ + BGZF *fp; + fp = calloc(1, sizeof(BGZF)); + fp->is_write = 1; + fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE); + fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE); + fp->compress_level = compress_level < 0? 
Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1 + if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION; + return fp; +} +// get the compress level from the mode string +static int mode2level(const char *__restrict mode) +{ + int i, compress_level = -1; + for (i = 0; mode[i]; ++i) + if (mode[i] >= '0' && mode[i] <= '9') break; + if (mode[i]) compress_level = (int)mode[i] - '0'; + if (strchr(mode, 'u')) compress_level = 0; + return compress_level; +} + +BGZF *bgzf_open(const char *path, const char *mode) +{ + BGZF *fp = 0; + assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE); + if (strchr(mode, 'r') || strchr(mode, 'R')) { + _bgzf_file_t fpr; + if ((fpr = _bgzf_open(path, "r")) == 0) return 0; + fp = bgzf_read_init(); + fp->fp = fpr; + } else if (strchr(mode, 'w') || strchr(mode, 'W')) { + FILE *fpw; + if ((fpw = fopen(path, "w")) == 0) return 0; + fp = bgzf_write_init(mode2level(mode)); + fp->fp = fpw; + } + return fp; +} + +BGZF *bgzf_dopen(int fd, const char *mode) +{ + BGZF *fp = 0; + assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE); + if (strchr(mode, 'r') || strchr(mode, 'R')) { + _bgzf_file_t fpr; + if ((fpr = _bgzf_dopen(fd, "r")) == 0) return 0; + fp = bgzf_read_init(); + fp->fp = fpr; + } else if (strchr(mode, 'w') || strchr(mode, 'W')) { + FILE *fpw; + if ((fpw = fdopen(fd, "w")) == 0) return 0; + fp = bgzf_write_init(mode2level(mode)); + fp->fp = fpw; + } + return fp; +} + +static int bgzf_compress(void *_dst, int *dlen, void *src, int slen, int level) +{ + uint32_t crc; + z_stream zs; + uint8_t *dst = (uint8_t*)_dst; + + // compress the body + zs.zalloc = NULL; zs.zfree = NULL; + zs.next_in = src; + zs.avail_in = slen; + zs.next_out = dst + BLOCK_HEADER_LENGTH; + zs.avail_out = *dlen - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; + if (deflateInit2(&zs, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) != Z_OK) return -1; // -15 to disable zlib header/footer + if (deflate(&zs, Z_FINISH) != 
Z_STREAM_END) return -1; + if (deflateEnd(&zs) != Z_OK) return -1; + *dlen = zs.total_out + BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; + // write the header + memcpy(dst, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block + packInt16(&dst[16], *dlen - 1); // write the compressed length; -1 to fit 2 bytes + // write the footer + crc = crc32(crc32(0L, NULL, 0L), src, slen); + packInt32((uint8_t*)&dst[*dlen - 8], crc); + packInt32((uint8_t*)&dst[*dlen - 4], slen); + return 0; +} + +// Deflate the block in fp->uncompressed_block into fp->compressed_block. Also adds an extra field that stores the compressed block length. +static int deflate_block(BGZF *fp, int block_length) +{ + int comp_size = BGZF_MAX_BLOCK_SIZE; + if (bgzf_compress(fp->compressed_block, &comp_size, fp->uncompressed_block, block_length, fp->compress_level) != 0) { + fp->errcode |= BGZF_ERR_ZLIB; + return -1; + } + fp->block_offset = 0; + return comp_size; +} + +// Inflate the block in fp->compressed_block into fp->uncompressed_block +static int inflate_block(BGZF* fp, int block_length) +{ + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = fp->compressed_block + 18; + zs.avail_in = block_length - 16; + zs.next_out = fp->uncompressed_block; + zs.avail_out = BGZF_MAX_BLOCK_SIZE; + + if (inflateInit2(&zs, -15) != Z_OK) { + fp->errcode |= BGZF_ERR_ZLIB; + return -1; + } + if (inflate(&zs, Z_FINISH) != Z_STREAM_END) { + inflateEnd(&zs); + fp->errcode |= BGZF_ERR_ZLIB; + return -1; + } + if (inflateEnd(&zs) != Z_OK) { + fp->errcode |= BGZF_ERR_ZLIB; + return -1; + } + return zs.total_out; +} + +static int check_header(const uint8_t *header) +{ + return (header[0] == 31 && header[1] == 139 && header[2] == 8 && (header[3] & 4) != 0 + && unpackInt16((uint8_t*)&header[10]) == 6 + && header[12] == 'B' && header[13] == 'C' + && unpackInt16((uint8_t*)&header[14]) == 2); +} + +#ifdef BGZF_CACHE +static void free_cache(BGZF *fp) +{ + khint_t k; + 
khash_t(cache) *h = (khash_t(cache)*)fp->cache; + if (fp->is_write) return; + for (k = kh_begin(h); k < kh_end(h); ++k) + if (kh_exist(h, k)) free(kh_val(h, k).block); + kh_destroy(cache, h); +} + +static int load_block_from_cache(BGZF *fp, int64_t block_address) +{ + khint_t k; + cache_t *p; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + k = kh_get(cache, h, block_address); + if (k == kh_end(h)) return 0; + p = &kh_val(h, k); + if (fp->block_length != 0) fp->block_offset = 0; + fp->block_address = block_address; + fp->block_length = p->size; + memcpy(fp->uncompressed_block, p->block, BGZF_MAX_BLOCK_SIZE); + _bgzf_seek((_bgzf_file_t)fp->fp, p->end_offset, SEEK_SET); + return p->size; +} + +static void cache_block(BGZF *fp, int size) +{ + int ret; + khint_t k; + cache_t *p; + khash_t(cache) *h = (khash_t(cache)*)fp->cache; + if (BGZF_MAX_BLOCK_SIZE >= fp->cache_size) return; + if ((kh_size(h) + 1) * BGZF_MAX_BLOCK_SIZE > fp->cache_size) { + /* A better way would be to remove the oldest block in the + * cache, but here we remove a random one for simplicity. This + * should not have a big impact on performance. */ + for (k = kh_begin(h); k < kh_end(h); ++k) + if (kh_exist(h, k)) break; + if (k < kh_end(h)) { + free(kh_val(h, k).block); + kh_del(cache, h, k); + } + } + k = kh_put(cache, h, fp->block_address, &ret); + if (ret == 0) return; // if this happens, a bug! 
+ p = &kh_val(h, k); + p->size = fp->block_length; + p->end_offset = fp->block_address + size; + p->block = malloc(BGZF_MAX_BLOCK_SIZE); + memcpy(kh_val(h, k).block, fp->uncompressed_block, BGZF_MAX_BLOCK_SIZE); +} +#else +static void free_cache(BGZF *fp) {} +static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;} +static void cache_block(BGZF *fp, int size) {} +#endif + +int bgzf_read_block(BGZF *fp) +{ + uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block; + int count, size = 0, block_length, remaining; + int64_t block_address; + block_address = _bgzf_tell((_bgzf_file_t)fp->fp); + if (fp->cache_size && load_block_from_cache(fp, block_address)) return 0; + count = _bgzf_read(fp->fp, header, sizeof(header)); + if (count == 0) { // no data read + fp->block_length = 0; + return 0; + } + if (count != sizeof(header) || !check_header(header)) { + fp->errcode |= BGZF_ERR_HEADER; + return -1; + } + size = count; + block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1" + compressed_block = (uint8_t*)fp->compressed_block; + memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); + remaining = block_length - BLOCK_HEADER_LENGTH; + count = _bgzf_read(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining); + if (count != remaining) { + fp->errcode |= BGZF_ERR_IO; + return -1; + } + size += count; + if ((count = inflate_block(fp, block_length)) < 0) return -1; + if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek. 
+ fp->block_address = block_address; + fp->block_length = count; + cache_block(fp, size); + return 0; +} + +ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length) +{ + ssize_t bytes_read = 0; + uint8_t *output = data; + if (length <= 0) return 0; + assert(fp->is_write == 0); + while (bytes_read < length) { + int copy_length, available = fp->block_length - fp->block_offset; + uint8_t *buffer; + if (available <= 0) { + if (bgzf_read_block(fp) != 0) return -1; + available = fp->block_length - fp->block_offset; + if (available <= 0) break; + } + copy_length = length - bytes_read < available? length - bytes_read : available; + buffer = fp->uncompressed_block; + memcpy(output, buffer + fp->block_offset, copy_length); + fp->block_offset += copy_length; + output += copy_length; + bytes_read += copy_length; + } + if (fp->block_offset == fp->block_length) { + fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp); + fp->block_offset = fp->block_length = 0; + } + return bytes_read; +} + +/***** BEGIN: multi-threading *****/ + +typedef struct { + BGZF *fp; + struct mtaux_t *mt; + void *buf; + int i, errcode, toproc; +} worker_t; + +typedef struct mtaux_t { + int n_threads, n_blks, curr, done; + volatile int proc_cnt; + void **blk; + int *len; + worker_t *w; + pthread_t *tid; + pthread_mutex_t lock; + pthread_cond_t cv; +} mtaux_t; + +static int worker_aux(worker_t *w) +{ + int i, tmp, stop = 0; + // wait for condition: to process or all done + pthread_mutex_lock(&w->mt->lock); + while (!w->toproc && !w->mt->done) + pthread_cond_wait(&w->mt->cv, &w->mt->lock); + if (w->mt->done) stop = 1; + w->toproc = 0; + pthread_mutex_unlock(&w->mt->lock); + if (stop) return 1; // to quit the thread + w->errcode = 0; + for (i = w->i; i < w->mt->curr; i += w->mt->n_threads) { + int clen = BGZF_MAX_BLOCK_SIZE; + if (bgzf_compress(w->buf, &clen, w->mt->blk[i], w->mt->len[i], w->fp->compress_level) != 0) + w->errcode |= BGZF_ERR_ZLIB; + memcpy(w->mt->blk[i], w->buf, clen); + w->mt->len[i] = clen; 
+ } + tmp = __sync_fetch_and_add(&w->mt->proc_cnt, 1); + return 0; +} + +static void *mt_worker(void *data) +{ + while (worker_aux(data) == 0); + return 0; +} + +int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks) +{ + int i; + mtaux_t *mt; + pthread_attr_t attr; + if (!fp->is_write || fp->mt || n_threads <= 1) return -1; + mt = calloc(1, sizeof(mtaux_t)); + mt->n_threads = n_threads; + mt->n_blks = n_threads * n_sub_blks; + mt->len = calloc(mt->n_blks, sizeof(int)); + mt->blk = calloc(mt->n_blks, sizeof(void*)); + for (i = 0; i < mt->n_blks; ++i) + mt->blk[i] = malloc(BGZF_MAX_BLOCK_SIZE); + mt->tid = calloc(mt->n_threads, sizeof(pthread_t)); // tid[0] is not used, as the worker 0 is launched by the master + mt->w = calloc(mt->n_threads, sizeof(worker_t)); + for (i = 0; i < mt->n_threads; ++i) { + mt->w[i].i = i; + mt->w[i].mt = mt; + mt->w[i].fp = fp; + mt->w[i].buf = malloc(BGZF_MAX_BLOCK_SIZE); + } + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + pthread_mutex_init(&mt->lock, 0); + pthread_cond_init(&mt->cv, 0); + for (i = 1; i < mt->n_threads; ++i) // worker 0 is effectively launched by the master thread + pthread_create(&mt->tid[i], &attr, mt_worker, &mt->w[i]); + fp->mt = mt; + return 0; +} + +static void mt_destroy(mtaux_t *mt) +{ + int i; + // signal all workers to quit + pthread_mutex_lock(&mt->lock); + mt->done = 1; mt->proc_cnt = 0; + pthread_cond_broadcast(&mt->cv); + pthread_mutex_unlock(&mt->lock); + for (i = 1; i < mt->n_threads; ++i) pthread_join(mt->tid[i], 0); // worker 0 is effectively launched by the master thread + // free other data allocated on heap + for (i = 0; i < mt->n_blks; ++i) free(mt->blk[i]); + for (i = 0; i < mt->n_threads; ++i) free(mt->w[i].buf); + free(mt->blk); free(mt->len); free(mt->w); free(mt->tid); + pthread_cond_destroy(&mt->cv); + pthread_mutex_destroy(&mt->lock); + free(mt); +} + +static void mt_queue(BGZF *fp) +{ + mtaux_t *mt = (mtaux_t*)fp->mt; + assert(mt->curr < 
mt->n_blks); // guaranteed by the caller + memcpy(mt->blk[mt->curr], fp->uncompressed_block, fp->block_offset); + mt->len[mt->curr] = fp->block_offset; + fp->block_offset = 0; + ++mt->curr; +} + +static int mt_flush(BGZF *fp) +{ + int i; + mtaux_t *mt = (mtaux_t*)fp->mt; + if (fp->block_offset) mt_queue(fp); // guaranteed that assertion does not fail + // signal all the workers to compress + pthread_mutex_lock(&mt->lock); + for (i = 0; i < mt->n_threads; ++i) mt->w[i].toproc = 1; + mt->proc_cnt = 0; + pthread_cond_broadcast(&mt->cv); + pthread_mutex_unlock(&mt->lock); + // worker 0 is doing things here + worker_aux(&mt->w[0]); + // wait for all the threads to complete + while (mt->proc_cnt < mt->n_threads); + // dump data to disk + for (i = 0; i < mt->n_threads; ++i) fp->errcode |= mt->w[i].errcode; + for (i = 0; i < mt->curr; ++i) + if (fwrite(mt->blk[i], 1, mt->len[i], fp->fp) != mt->len[i]) + fp->errcode |= BGZF_ERR_IO; + mt->curr = 0; + return 0; +} + +static int mt_lazy_flush(BGZF *fp) +{ + mtaux_t *mt = (mtaux_t*)fp->mt; + if (fp->block_offset) mt_queue(fp); + if (mt->curr == mt->n_blks) + return mt_flush(fp); + return -1; +} + +static ssize_t mt_write(BGZF *fp, const void *data, ssize_t length) +{ + const uint8_t *input = data; + ssize_t rest = length; + while (rest) { + int copy_length = BGZF_BLOCK_SIZE - fp->block_offset < rest? 
BGZF_BLOCK_SIZE - fp->block_offset : rest; + memcpy(fp->uncompressed_block + fp->block_offset, input, copy_length); + fp->block_offset += copy_length; input += copy_length; rest -= copy_length; + if (fp->block_offset == BGZF_BLOCK_SIZE) mt_lazy_flush(fp); + } + return length - rest; +} + +/***** END: multi-threading *****/ + +int bgzf_flush(BGZF *fp) +{ + if (!fp->is_write) return 0; + if (fp->mt) return mt_flush(fp); + while (fp->block_offset > 0) { + int block_length; + block_length = deflate_block(fp, fp->block_offset); + if (block_length < 0) return -1; + if (fwrite(fp->compressed_block, 1, block_length, fp->fp) != block_length) { + fp->errcode |= BGZF_ERR_IO; // possibly truncated file + return -1; + } + fp->block_address += block_length; + } + return 0; +} + +int bgzf_flush_try(BGZF *fp, ssize_t size) +{ + if (fp->block_offset + size > BGZF_BLOCK_SIZE) { + if (fp->mt) return mt_lazy_flush(fp); + else return bgzf_flush(fp); + } + return -1; +} + +ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length) +{ + const uint8_t *input = data; + int block_length = BGZF_BLOCK_SIZE, bytes_written = 0; + assert(fp->is_write); + if (fp->mt) return mt_write(fp, data, length); + while (bytes_written < length) { + uint8_t* buffer = fp->uncompressed_block; + int copy_length = block_length - fp->block_offset < length - bytes_written? 
block_length - fp->block_offset : length - bytes_written; + memcpy(buffer + fp->block_offset, input, copy_length); + fp->block_offset += copy_length; + input += copy_length; + bytes_written += copy_length; + if (fp->block_offset == block_length && bgzf_flush(fp)) break; + } + return bytes_written; +} + +int bgzf_close(BGZF* fp) +{ + int ret, count, block_length; + if (fp == 0) return -1; + if (fp->is_write) { + if (bgzf_flush(fp) != 0) return -1; + fp->compress_level = -1; + block_length = deflate_block(fp, 0); // write an empty block + count = fwrite(fp->compressed_block, 1, block_length, fp->fp); + if (fflush(fp->fp) != 0) { + fp->errcode |= BGZF_ERR_IO; + return -1; + } + if (fp->mt) mt_destroy(fp->mt); + } + ret = fp->is_write? fclose(fp->fp) : _bgzf_close(fp->fp); + if (ret != 0) return -1; + free(fp->uncompressed_block); + free(fp->compressed_block); + free_cache(fp); + free(fp); + return 0; +} + +void bgzf_set_cache_size(BGZF *fp, int cache_size) +{ + if (fp) fp->cache_size = cache_size; +} + +int bgzf_check_EOF(BGZF *fp) +{ + static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0"; + uint8_t buf[28]; + off_t offset; + offset = _bgzf_tell((_bgzf_file_t)fp->fp); + if (_bgzf_seek(fp->fp, -28, SEEK_END) < 0) return 0; + _bgzf_read(fp->fp, buf, 28); + _bgzf_seek(fp->fp, offset, SEEK_SET); + return (memcmp(magic, buf, 28) == 0)? 
1 : 0; +} + +int64_t bgzf_seek(BGZF* fp, int64_t pos, int where) +{ + int block_offset; + int64_t block_address; + + if (fp->is_write || where != SEEK_SET) { + fp->errcode |= BGZF_ERR_MISUSE; + return -1; + } + block_offset = pos & 0xFFFF; + block_address = pos >> 16; + if (_bgzf_seek(fp->fp, block_address, SEEK_SET) < 0) { + fp->errcode |= BGZF_ERR_IO; + return -1; + } + fp->block_length = 0; // indicates current block has not been loaded + fp->block_address = block_address; + fp->block_offset = block_offset; + return 0; +} + +int bgzf_is_bgzf(const char *fn) +{ + uint8_t buf[16]; + int n; + _bgzf_file_t fp; + if ((fp = _bgzf_open(fn, "r")) == 0) return 0; + n = _bgzf_read(fp, buf, 16); + _bgzf_close(fp); + if (n != 16) return 0; + return memcmp(g_magic, buf, 16) == 0? 1 : 0; +} + +int bgzf_getc(BGZF *fp) +{ + int c; + if (fp->block_offset >= fp->block_length) { + if (bgzf_read_block(fp) != 0) return -2; /* error */ + if (fp->block_length == 0) return -1; /* end-of-file */ + } + c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++]; + if (fp->block_offset == fp->block_length) { + fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp); + fp->block_offset = 0; + fp->block_length = 0; + } + return c; +} + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +int bgzf_getline(BGZF *fp, int delim, kstring_t *str) +{ + int l, state = 0; + unsigned char *buf = (unsigned char*)fp->uncompressed_block; + str->l = 0; + do { + if (fp->block_offset >= fp->block_length) { + if (bgzf_read_block(fp) != 0) { state = -2; break; } + if (fp->block_length == 0) { state = -1; break; } + } + for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l); + if (l < fp->block_length) state = 1; + l -= fp->block_offset; + if (str->l + l + 1 >= str->m) { + str->m = str->l + l + 2; + kroundup32(str->m); + str->s = (char*)realloc(str->s, str->m); + } + memcpy(str->s + str->l, buf + 
fp->block_offset, l); + str->l += l; + fp->block_offset += l + 1; + if (fp->block_offset >= fp->block_length) { + fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp); + fp->block_offset = 0; + fp->block_length = 0; + } + } while (state == 0); + if (str->l == 0 && state < 0) return state; + str->s[str->l] = 0; + return str->l; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bgzf.h Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,207 @@ +/* The MIT License + + Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology + 2011, 2012 Attractive Chaos <attractor@live.co.uk> + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +/* The BGZF library was originally written by Bob Handsaker from the Broad + * Institute. It was later improved by the SAMtools developers. 
*/ + +#ifndef __BGZF_H +#define __BGZF_H + +#include <stdint.h> +#include <stdio.h> +#include <zlib.h> +#include <sys/types.h> + +#define BGZF_BLOCK_SIZE 0xff00 // make sure compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE +#define BGZF_MAX_BLOCK_SIZE 0x10000 + +#define BGZF_ERR_ZLIB 1 +#define BGZF_ERR_HEADER 2 +#define BGZF_ERR_IO 4 +#define BGZF_ERR_MISUSE 8 + +typedef struct { + int errcode:16, is_write:2, compress_level:14; + int cache_size; + int block_length, block_offset; + int64_t block_address; + void *uncompressed_block, *compressed_block; + void *cache; // a pointer to a hash table + void *fp; // actual file handler; FILE* on writing; FILE* or knetFile* on reading + void *mt; // only used for multi-threading +} BGZF; + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +#ifdef __cplusplus +extern "C" { +#endif + + /****************** + * Basic routines * + ******************/ + + /** + * Open an existing file descriptor for reading or writing. + * + * @param fd file descriptor + * @param mode mode matching /[rwu0-9]+/: 'r' for reading, 'w' for writing and a digit specifies + * the zlib compression level; if both 'r' and 'w' are present, 'w' is ignored. + * @return BGZF file handler; 0 on error + */ + BGZF* bgzf_dopen(int fd, const char *mode); + + #define bgzf_fdopen(fd, mode) bgzf_dopen((fd), (mode)) // for backward compatibility + + /** + * Open the specified file for reading or writing. + */ + BGZF* bgzf_open(const char* path, const char *mode); + + /** + * Close the BGZF and free all associated resources. + * + * @param fp BGZF file handler + * @return 0 on success and -1 on error + */ + int bgzf_close(BGZF *fp); + + /** + * Read up to _length_ bytes from the file storing into _data_. 
+ * + * @param fp BGZF file handler + * @param data data array to read into + * @param length size of data to read + * @return number of bytes actually read; 0 on end-of-file and -1 on error + */ + ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length); + + /** + * Write _length_ bytes from _data_ to the file. + * + * @param fp BGZF file handler + * @param data data array to write + * @param length size of data to write + * @return number of bytes actually written; -1 on error + */ + ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length); + + /** + * Write the data in the buffer to the file. + */ + int bgzf_flush(BGZF *fp); + + /** + * Return a virtual file pointer to the current location in the file. + * No interpetation of the value should be made, other than a subsequent + * call to bgzf_seek can be used to position the file at the same point. + * Return value is non-negative on success. + */ + #define bgzf_tell(fp) ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)) + + /** + * Set the file to read from the location specified by _pos_. + * + * @param fp BGZF file handler + * @param pos virtual file offset returned by bgzf_tell() + * @param whence must be SEEK_SET + * @return 0 on success and -1 on error + */ + int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence); + + /** + * Check if the BGZF end-of-file (EOF) marker is present + * + * @param fp BGZF file handler opened for reading + * @return 1 if EOF is present; 0 if not or on I/O error + */ + int bgzf_check_EOF(BGZF *fp); + + /** + * Check if a file is in the BGZF format + * + * @param fn file name + * @return 1 if _fn_ is BGZF; 0 if not or on I/O error + */ + int bgzf_is_bgzf(const char *fn); + + /********************* + * Advanced routines * + *********************/ + + /** + * Set the cache size. Only effective when compiled with -DBGZF_CACHE. 
+ * + * @param fp BGZF file handler + * @param size size of cache in bytes; 0 to disable caching (default) + */ + void bgzf_set_cache_size(BGZF *fp, int size); + + /** + * Flush the file if the remaining buffer size is smaller than _size_ + */ + int bgzf_flush_try(BGZF *fp, ssize_t size); + + /** + * Read one byte from a BGZF file. It is faster than bgzf_read() + * @param fp BGZF file handler + * @return byte read; -1 on end-of-file or error + */ + int bgzf_getc(BGZF *fp); + + /** + * Read one line from a BGZF file. It is faster than bgzf_getc() + * + * @param fp BGZF file handler + * @param delim delimitor + * @param str string to write to; must be initialized + * @return length of the string; 0 on end-of-file; negative on error + */ + int bgzf_getline(BGZF *fp, int delim, kstring_t *str); + + /** + * Read the next BGZF block. + */ + int bgzf_read_block(BGZF *fp); + + /** + * Enable multi-threading (only effective on writing) + * + * @param fp BGZF file handler; must be opened for writing + * @param n_threads #threads used for writing + * @param n_sub_blks #blocks processed by each thread; a value 64-256 is recommended + */ + int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks); + +#ifdef __cplusplus +} +#endif + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/bgzip.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,206 @@ +/* The MIT License + + Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+*/ + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <fcntl.h> +#include <unistd.h> +#include <errno.h> +#include <sys/select.h> +#include <sys/stat.h> +#include "bgzf.h" + +static const int WINDOW_SIZE = 64 * 1024; + +static int bgzip_main_usage() +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bgzip [options] [file] ...\n\n"); + fprintf(stderr, "Options: -c write on standard output, keep original files unchanged\n"); + fprintf(stderr, " -d decompress\n"); + fprintf(stderr, " -f overwrite files without asking\n"); + fprintf(stderr, " -b INT decompress at virtual file pointer INT\n"); + fprintf(stderr, " -s INT decompress INT bytes in the uncompressed file\n"); + fprintf(stderr, " -h give this help\n"); + fprintf(stderr, "\n"); + return 1; +} + +static int write_open(const char *fn, int is_forced) +{ + int fd = -1; + char c; + if (!is_forced) { + if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0666)) < 0 && errno == EEXIST) { + fprintf(stderr, "[bgzip] %s already exists; do you wish to overwrite (y or n)? 
", fn); + scanf("%c", &c); + if (c != 'Y' && c != 'y') { + fprintf(stderr, "[bgzip] not overwritten\n"); + exit(1); + } + } + } + if (fd < 0) { + if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0) { + fprintf(stderr, "[bgzip] %s: Fail to write\n", fn); + exit(1); + } + } + return fd; +} + +static void fail(BGZF* fp) +{ + fprintf(stderr, "Error: %s\n", fp->error); + exit(1); +} + +int main(int argc, char **argv) +{ + int c, compress, pstdout, is_forced; + BGZF *fp; + void *buffer; + long start, end, size; + + compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; + while((c = getopt(argc, argv, "cdhfb:s:")) >= 0){ + switch(c){ + case 'h': return bgzip_main_usage(); + case 'd': compress = 0; break; + case 'c': pstdout = 1; break; + case 'b': start = atol(optarg); break; + case 's': size = atol(optarg); break; + case 'f': is_forced = 1; break; + } + } + if (size >= 0) end = start + size; + if (end >= 0 && end < start) { + fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end); + return 1; + } + if (compress == 1) { + struct stat sbuf; + int f_src = fileno(stdin); + int f_dst = fileno(stdout); + + if ( argc>optind ) + { + if ( stat(argv[optind],&sbuf)<0 ) + { + fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); + return 1; + } + + if ((f_src = open(argv[optind], O_RDONLY)) < 0) { + fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); + return 1; + } + + if (pstdout) + f_dst = fileno(stdout); + else + { + char *name = malloc(strlen(argv[optind]) + 5); + strcpy(name, argv[optind]); + strcat(name, ".gz"); + f_dst = write_open(name, is_forced); + if (f_dst < 0) return 1; + free(name); + } + } + else if (!pstdout && isatty(fileno((FILE *)stdout)) ) + return bgzip_main_usage(); + + fp = bgzf_fdopen(f_dst, "w"); + buffer = malloc(WINDOW_SIZE); + while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0) + if (bgzf_write(fp, buffer, c) < 0) fail(fp); + // f_dst will be closed here + if (bgzf_close(fp) < 0) 
fail(fp); + if (argc > optind && !pstdout) unlink(argv[optind]); + free(buffer); + close(f_src); + return 0; + } else { + struct stat sbuf; + int f_dst; + + if ( argc>optind ) + { + if ( stat(argv[optind],&sbuf)<0 ) + { + fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); + return 1; + } + char *name; + int len = strlen(argv[optind]); + if ( strcmp(argv[optind]+len-3,".gz") ) + { + fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]); + return 1; + } + fp = bgzf_open(argv[optind], "r"); + if (fp == NULL) { + fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]); + return 1; + } + + if (pstdout) { + f_dst = fileno(stdout); + } + else { + name = strdup(argv[optind]); + name[strlen(name) - 3] = '\0'; + f_dst = write_open(name, is_forced); + free(name); + } + } + else if (!pstdout && isatty(fileno((FILE *)stdin)) ) + return bgzip_main_usage(); + else + { + f_dst = fileno(stdout); + fp = bgzf_fdopen(fileno(stdin), "r"); + if (fp == NULL) { + fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno)); + return 1; + } + } + buffer = malloc(WINDOW_SIZE); + if (bgzf_seek(fp, start, SEEK_SET) < 0) fail(fp); + while (1) { + if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE); + else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start)); + if (c == 0) break; + if (c < 0) fail(fp); + start += c; + write(f_dst, buffer, c); + if (end >= 0 && start >= end) break; + } + free(buffer); + if (bgzf_close(fp) < 0) fail(fp); + if (!pstdout) unlink(argv[optind]); + return 0; + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/cut_target.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,193 @@ +#include <unistd.h> +#include <stdlib.h> +#include <string.h> +#include "bam.h" +#include "errmod.h" +#include "faidx.h" + +#define ERR_DEP 0.83f + +typedef struct { + int e[2][3], p[2][2]; +} score_param_t; + +/* Note that although the two matrics have 10 parameters in total, only 4 + * (probably 3) are free. Changing the scoring matrices in a sort of symmetric + * way will not change the result. */ +static score_param_t g_param = { {{0,0,0},{-4,1,6}}, {{0,-14000}, {0,0}} }; + +typedef struct { + int min_baseQ, tid, max_bases; + uint16_t *bases; + bamFile fp; + bam_header_t *h; + char *ref; + faidx_t *fai; + errmod_t *em; +} ct_t; + +static uint16_t gencns(ct_t *g, int n, const bam_pileup1_t *plp) +{ + int i, j, ret, tmp, k, sum[4], qual; + float q[16]; + if (n > g->max_bases) { // enlarge g->bases + g->max_bases = n; + kroundup32(g->max_bases); + g->bases = realloc(g->bases, g->max_bases * 2); + } + for (i = k = 0; i < n; ++i) { + const bam_pileup1_t *p = plp + i; + uint8_t *seq; + int q, baseQ, b; + if (p->is_refskip || p->is_del) continue; + baseQ = bam1_qual(p->b)[p->qpos]; + if (baseQ < g->min_baseQ) continue; + seq = bam1_seq(p->b); + b = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos)]; + if (b > 3) continue; + q = baseQ < p->b->core.qual? baseQ : p->b->core.qual; + if (q < 4) q = 4; + if (q > 63) q = 63; + g->bases[k++] = q<<5 | bam1_strand(p->b)<<4 | b; + } + if (k == 0) return 0; + errmod_cal(g->em, k, 4, g->bases, q); + for (i = 0; i < 4; ++i) sum[i] = (int)(q[i<<2|i] + .499) << 2 | i; + for (i = 1; i < 4; ++i) // insertion sort + for (j = i; j > 0 && sum[j] < sum[j-1]; --j) + tmp = sum[j], sum[j] = sum[j-1], sum[j-1] = tmp; + qual = (sum[1]>>2) - (sum[0]>>2); + k = k < 256? k : 255; + ret = (qual < 63? 
qual : 63) << 2 | (sum[0]&3); + return ret<<8|k; +} + +static void process_cns(bam_header_t *h, int tid, int l, uint16_t *cns) +{ + int i, f[2][2], *prev, *curr, *swap_tmp, s; + uint8_t *b; // backtrack array + b = calloc(l, 1); + f[0][0] = f[0][1] = 0; + prev = f[0]; curr = f[1]; + // fill the backtrack matrix + for (i = 0; i < l; ++i) { + int c = (cns[i] == 0)? 0 : (cns[i]>>8 == 0)? 1 : 2; + int tmp0, tmp1; + // compute f[0] + tmp0 = prev[0] + g_param.e[0][c] + g_param.p[0][0]; // (s[i+1],s[i])=(0,0) + tmp1 = prev[1] + g_param.e[0][c] + g_param.p[1][0]; // (0,1) + if (tmp0 > tmp1) curr[0] = tmp0, b[i] = 0; + else curr[0] = tmp1, b[i] = 1; + // compute f[1] + tmp0 = prev[0] + g_param.e[1][c] + g_param.p[0][1]; // (s[i+1],s[i])=(1,0) + tmp1 = prev[1] + g_param.e[1][c] + g_param.p[1][1]; // (1,1) + if (tmp0 > tmp1) curr[1] = tmp0, b[i] |= 0<<1; + else curr[1] = tmp1, b[i] |= 1<<1; + // swap + swap_tmp = prev; prev = curr; curr = swap_tmp; + } + // backtrack + s = prev[0] > prev[1]? 0 : 1; + for (i = l - 1; i > 0; --i) { + b[i] |= s<<2; + s = b[i]>>s&1; + } + // print + for (i = 0, s = -1; i <= l; ++i) { + if (i == l || ((b[i]>>2&3) == 0 && s >= 0)) { + if (s >= 0) { + int j; + printf("%s:%d-%d\t0\t%s\t%d\t60\t%dM\t*\t0\t0\t", h->target_name[tid], s+1, i, h->target_name[tid], s+1, i-s); + for (j = s; j < i; ++j) { + int c = cns[j]>>8; + if (c == 0) putchar('N'); + else putchar("ACGT"[c&3]); + } + putchar('\t'); + for (j = s; j < i; ++j) + putchar(33 + (cns[j]>>8>>2)); + putchar('\n'); + } + //if (s >= 0) printf("%s\t%d\t%d\t%d\n", h->target_name[tid], s, i, i - s); + s = -1; + } else if ((b[i]>>2&3) && s < 0) s = i; + } + free(b); +} + +static int read_aln(void *data, bam1_t *b) +{ + extern int bam_prob_realn_core(bam1_t *b, const char *ref, int flag); + ct_t *g = (ct_t*)data; + int ret, len; + ret = bam_read1(g->fp, b); + if (ret >= 0 && g->fai && b->core.tid >= 0 && (b->core.flag&4) == 0) { + if (b->core.tid != g->tid) { // then load the sequence + free(g->ref); + 
g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &len); + g->tid = b->core.tid; + } + bam_prob_realn_core(b, g->ref, 1<<1|1); + } + return ret; +} + +int main_cut_target(int argc, char *argv[]) +{ + int c, tid, pos, n, lasttid = -1, lastpos = -1, l, max_l; + const bam_pileup1_t *p; + bam_plp_t plp; + uint16_t *cns; + ct_t g; + + memset(&g, 0, sizeof(ct_t)); + g.min_baseQ = 13; g.tid = -1; + while ((c = getopt(argc, argv, "f:Q:i:o:0:1:2:")) >= 0) { + switch (c) { + case 'Q': g.min_baseQ = atoi(optarg); break; // quality cutoff + case 'i': g_param.p[0][1] = -atoi(optarg); break; // 0->1 transition (in) PENALTY + case '0': g_param.e[1][0] = atoi(optarg); break; // emission SCORE + case '1': g_param.e[1][1] = atoi(optarg); break; + case '2': g_param.e[1][2] = atoi(optarg); break; + case 'f': g.fai = fai_load(optarg); + if (g.fai == 0) fprintf(stderr, "[%s] fail to load the fasta index.\n", __func__); + break; + } + } + if (argc == optind) { + fprintf(stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] [-f ref] <in.bam>\n"); + return 1; + } + l = max_l = 0; cns = 0; + g.fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r"); + g.h = bam_header_read(g.fp); + g.em = errmod_init(1 - ERR_DEP); + plp = bam_plp_init(read_aln, &g); + while ((p = bam_plp_auto(plp, &tid, &pos, &n)) != 0) { + if (tid < 0) break; + if (tid != lasttid) { // change of chromosome + if (cns) process_cns(g.h, lasttid, l, cns); + if (max_l < g.h->target_len[tid]) { + max_l = g.h->target_len[tid]; + kroundup32(max_l); + cns = realloc(cns, max_l * 2); + } + l = g.h->target_len[tid]; + memset(cns, 0, max_l * 2); + lasttid = tid; + } + cns[pos] = gencns(&g, n, p); + lastpos = pos; + } + process_cns(g.h, lasttid, l, cns); + free(cns); + bam_header_destroy(g.h); + bam_plp_destroy(plp); + bam_close(g.fp); + if (g.fai) { + fai_destroy(g.fai); free(g.ref); + } + errmod_destroy(g.em); + free(g.bases); + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/errmod.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,130 @@ +#include <math.h> +#include "errmod.h" +#include "ksort.h" +KSORT_INIT_GENERIC(uint16_t) + +typedef struct __errmod_coef_t { + double *fk, *beta, *lhet; +} errmod_coef_t; + +typedef struct { + double fsum[16], bsum[16]; + uint32_t c[16]; +} call_aux_t; + +static errmod_coef_t *cal_coef(double depcorr, double eta) +{ + int k, n, q; + long double sum, sum1; + double *lC; + errmod_coef_t *ec; + + ec = calloc(1, sizeof(errmod_coef_t)); + // initialize ->fk + ec->fk = (double*)calloc(256, sizeof(double)); + ec->fk[0] = 1.0; + for (n = 1; n != 256; ++n) + ec->fk[n] = pow(1. - depcorr, n) * (1.0 - eta) + eta; + // initialize ->coef + ec->beta = (double*)calloc(256 * 256 * 64, sizeof(double)); + lC = (double*)calloc(256 * 256, sizeof(double)); + for (n = 1; n != 256; ++n) { + double lgn = lgamma(n+1); + for (k = 1; k <= n; ++k) + lC[n<<8|k] = lgn - lgamma(k+1) - lgamma(n-k+1); + } + for (q = 1; q != 64; ++q) { + double e = pow(10.0, -q/10.0); + double le = log(e); + double le1 = log(1.0 - e); + for (n = 1; n <= 255; ++n) { + double *beta = ec->beta + (q<<16|n<<8); + sum1 = sum = 0.0; + for (k = n; k >= 0; --k, sum1 = sum) { + sum = sum1 + expl(lC[n<<8|k] + k*le + (n-k)*le1); + beta[k] = -10. 
/ M_LN10 * logl(sum1 / sum); + } + } + } + // initialize ->lhet + ec->lhet = (double*)calloc(256 * 256, sizeof(double)); + for (n = 0; n < 256; ++n) + for (k = 0; k < 256; ++k) + ec->lhet[n<<8|k] = lC[n<<8|k] - M_LN2 * n; + free(lC); + return ec; +} + +errmod_t *errmod_init(float depcorr) +{ + errmod_t *em; + em = (errmod_t*)calloc(1, sizeof(errmod_t)); + em->depcorr = depcorr; + em->coef = cal_coef(depcorr, 0.03); + return em; +} + +void errmod_destroy(errmod_t *em) +{ + if (em == 0) return; + free(em->coef->lhet); free(em->coef->fk); free(em->coef->beta); + free(em->coef); free(em); +} +// qual:6, strand:1, base:4 +int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q) +{ + call_aux_t aux; + int i, j, k, w[32]; + + if (m > m) return -1; + memset(q, 0, m * m * sizeof(float)); + if (n == 0) return 0; + // calculate aux.esum and aux.fsum + if (n > 255) { // then sample 255 bases + ks_shuffle(uint16_t, n, bases); + n = 255; + } + ks_introsort(uint16_t, n, bases); + memset(w, 0, 32 * sizeof(int)); + memset(&aux, 0, sizeof(call_aux_t)); + for (j = n - 1; j >= 0; --j) { // calculate esum and fsum + uint16_t b = bases[j]; + int q = b>>5 < 4? 
4 : b>>5; + if (q > 63) q = 63; + k = b&0x1f; + aux.fsum[k&0xf] += em->coef->fk[w[k]]; + aux.bsum[k&0xf] += em->coef->fk[w[k]] * em->coef->beta[q<<16|n<<8|aux.c[k&0xf]]; + ++aux.c[k&0xf]; + ++w[k]; + } + // generate likelihood + for (j = 0; j != m; ++j) { + float tmp1, tmp3; + int tmp2, bar_e; + // homozygous + for (k = 0, tmp1 = tmp3 = 0.0, tmp2 = 0; k != m; ++k) { + if (k == j) continue; + tmp1 += aux.bsum[k]; tmp2 += aux.c[k]; tmp3 += aux.fsum[k]; + } + if (tmp2) { + bar_e = (int)(tmp1 / tmp3 + 0.499); + if (bar_e > 63) bar_e = 63; + q[j*m+j] = tmp1; + } + // heterozygous + for (k = j + 1; k < m; ++k) { + int cjk = aux.c[j] + aux.c[k]; + for (i = 0, tmp2 = 0, tmp1 = tmp3 = 0.0; i < m; ++i) { + if (i == j || i == k) continue; + tmp1 += aux.bsum[i]; tmp2 += aux.c[i]; tmp3 += aux.fsum[i]; + } + if (tmp2) { + bar_e = (int)(tmp1 / tmp3 + 0.499); + if (bar_e > 63) bar_e = 63; + q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]] + tmp1; + } else q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]]; // all the bases are either j or k + } + for (k = 0; k != m; ++k) if (q[j*m+k] < 0.0) q[j*m+k] = 0.0; + } + return 0; +}
/* File: PsiCLASS-1.0.2/samtools-0.1.19/errmod.h */
#ifndef ERRMOD_H
#define ERRMOD_H

#include <stdint.h>

/* Opaque table of precomputed coefficients; defined in errmod.c. */
struct __errmod_coef_t;

/* Error model handle created by errmod_init(). */
typedef struct {
    double depcorr;               /* value passed to errmod_init() */
    struct __errmod_coef_t *coef; /* lookup tables owned by this handle */
} errmod_t;

/* Create an error model for the given depcorr value. */
errmod_t *errmod_init(float depcorr);
/* Free a model returned by errmod_init(); a null pointer is ignored. */
void errmod_destroy(errmod_t *em);

/*
    n: number of bases
    m: maximum base
    bases[i]: qual:6, strand:1, base:4
    q[i*m+j]: phred-scaled likelihood of (i,j)
 */
int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q);

#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/examples/00README.txt Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,23 @@ +File ex1.fa contains two sequences cut from the human genome +build36. They were extracted with command: + + samtools faidx human_b36.fa 2:2043966-2045540 20:67967-69550 + +Sequence names were changed manually for simplicity. File ex1.sam.gz +contains MAQ alignments extracted with: + + (samtools view NA18507_maq.bam 2:2044001-2045500; + samtools view NA18507_maq.bam 20:68001-69500) + +and processed with `samtools fixmate' to make it self-consistent as a +standalone alignment. + +To try samtools, you may run the following commands: + + samtools faidx ex1.fa # index the reference FASTA + samtools import ex1.fa.fai ex1.sam.gz ex1.bam # SAM->BAM + samtools index ex1.bam # index BAM + samtools tview ex1.bam ex1.fa # view alignment + samtools pileup -cf ex1.fa ex1.bam # pileup and consensus + samtools pileup -cf ex1.fa -t ex1.fa.fai ex1.sam.gz +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/examples/Makefile Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,50 @@ +all:../libbam.a ../samtools ../bcftools/bcftools \ + ex1.glf ex1.pileup.gz ex1.bam.bai ex1f-rmduppe.bam ex1f-rmdupse.bam ex1.glfview.gz ex1.bcf calDepth + @echo; echo \# You can now launch the viewer with: \'samtools tview ex1.bam ex1.fa\'; echo; + +ex1.fa.fai:ex1.fa + ../samtools faidx ex1.fa +ex1.bam:ex1.sam.gz ex1.fa.fai + ../samtools import ex1.fa.fai ex1.sam.gz ex1.bam +ex1.bam.bai:ex1.bam + ../samtools index ex1.bam +ex1.pileup.gz:ex1.bam ex1.fa + ../samtools pileup -cf ex1.fa ex1.bam | gzip > ex1.pileup.gz +ex1.glf:ex1.bam ex1.fa + ../samtools pileup -gf ex1.fa ex1.bam > ex1.glf +ex1.glfview.gz:ex1.glf + ../samtools glfview ex1.glf | gzip > ex1.glfview.gz +ex1a.bam:ex1.bam + ../samtools view -h ex1.bam | awk 'BEGIN{FS=OFS="\t"}{if(/^@/)print;else{$$1=$$1"a";print}}' | ../samtools view -bS - > $@ +ex1b.bam:ex1.bam + ../samtools view -h ex1.bam | awk 'BEGIN{FS=OFS="\t"}{if(/^@/)print;else{$$1=$$1"b";print}}' | ../samtools view -bS - > $@ +ex1f.rg: + (echo "@RG ID:ex1 LB:ex1 SM:ex1"; echo "@RG ID:ex1a LB:ex1 SM:ex1"; echo "@RG ID:ex1b LB:ex1b SM:ex1b") > $@ +ex1f.bam:ex1.bam ex1a.bam ex1b.bam ex1f.rg + ../samtools merge -rh ex1f.rg $@ ex1.bam ex1a.bam ex1b.bam +ex1f-rmduppe.bam:ex1f.bam + ../samtools rmdup ex1f.bam $@ +ex1f-rmdupse.bam:ex1f.bam + ../samtools rmdup -S ex1f.bam $@ + +ex1.bcf:ex1.bam ex1.fa.fai + ../samtools mpileup -gf ex1.fa ex1.bam > $@ + +../bcftools/bcftools: + (cd ../bcftools; make bcftools) + +../samtools: + (cd ..; make samtools) + +../libbam.a: + (cd ..; make libbam.a) + +calDepth:../libbam.a calDepth.c + gcc -g -Wall -O2 -I.. calDepth.c -o $@ -L.. 
-lbam -lm -lz + +clean: + rm -fr *.bam *.bai *.glf* *.fai *.pileup* *~ calDepth *.dSYM ex1*.rg ex1.bcf + +# ../samtools pileup ex1.bam|perl -ape '$_=$F[4];s/(\d+)(??{".{$1}"})|\^.//g;@_=(tr/A-Z//,tr/a-z//);$_=join("\t",@F[0,1],@_)."\n"' + +# ../samtools pileup -cf ex1.fa ex1.bam|perl -ape '$_=$F[8];s/\^.//g;s/(\d+)(??{".{$1}"})|\^.//g;@_=(tr/A-Za-z//,tr/,.//);$_=join("\t",@F[0,1],@_)."\n"'
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/examples/bam2bed.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,51 @@ +#include <stdio.h> +#include "sam.h" +static int fetch_func(const bam1_t *b, void *data) +{ + samfile_t *fp = (samfile_t*)data; + uint32_t *cigar = bam1_cigar(b); + const bam1_core_t *c = &b->core; + int i, l; + if (b->core.tid < 0) return 0; + for (i = l = 0; i < c->n_cigar; ++i) { + int op = cigar[i]&0xf; + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) + l += cigar[i]>>4; + } + printf("%s\t%d\t%d\t%s\t%d\t%c\n", fp->header->target_name[c->tid], + c->pos, c->pos + l, bam1_qname(b), c->qual, (c->flag&BAM_FREVERSE)? '-' : '+'); + return 0; +} +int main(int argc, char *argv[]) +{ + samfile_t *fp; + if (argc == 1) { + fprintf(stderr, "Usage: bam2bed <in.bam> [region]\n"); + return 1; + } + if ((fp = samopen(argv[1], "rb", 0)) == 0) { + fprintf(stderr, "bam2bed: Fail to open BAM file %s\n", argv[1]); + return 1; + } + if (argc == 2) { /* if a region is not specified */ + bam1_t *b = bam_init1(); + while (samread(fp, b) >= 0) fetch_func(b, fp); + bam_destroy1(b); + } else { + int ref, beg, end; + bam_index_t *idx; + if ((idx = bam_index_load(argv[1])) == 0) { + fprintf(stderr, "bam2bed: BAM indexing file is not available.\n"); + return 1; + } + bam_parse_region(fp->header, argv[2], &ref, &beg, &end); + if (ref < 0) { + fprintf(stderr, "bam2bed: Invalid region %s\n", argv[2]); + return 1; + } + bam_fetch(fp->x.bam, idx, ref, beg, end, fp, fetch_func); + bam_index_destroy(idx); + } + samclose(fp); + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/examples/calDepth.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,62 @@ +#include <stdio.h> +#include "sam.h" + +typedef struct { + int beg, end; + samfile_t *in; +} tmpstruct_t; + +// callback for bam_fetch() +static int fetch_func(const bam1_t *b, void *data) +{ + bam_plbuf_t *buf = (bam_plbuf_t*)data; + bam_plbuf_push(b, buf); + return 0; +} +// callback for bam_plbuf_init() +static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) +{ + tmpstruct_t *tmp = (tmpstruct_t*)data; + if ((int)pos >= tmp->beg && (int)pos < tmp->end) + printf("%s\t%d\t%d\n", tmp->in->header->target_name[tid], pos + 1, n); + return 0; +} + +int main(int argc, char *argv[]) +{ + tmpstruct_t tmp; + if (argc == 1) { + fprintf(stderr, "Usage: calDepth <in.bam> [region]\n"); + return 1; + } + tmp.beg = 0; tmp.end = 0x7fffffff; + tmp.in = samopen(argv[1], "rb", 0); + if (tmp.in == 0) { + fprintf(stderr, "Fail to open BAM file %s\n", argv[1]); + return 1; + } + if (argc == 2) { // if a region is not specified + sampileup(tmp.in, -1, pileup_func, &tmp); + } else { + int ref; + bam_index_t *idx; + bam_plbuf_t *buf; + idx = bam_index_load(argv[1]); // load BAM index + if (idx == 0) { + fprintf(stderr, "BAM indexing file is not available.\n"); + return 1; + } + bam_parse_region(tmp.in->header, argv[2], &ref, &tmp.beg, &tmp.end); // parse the region + if (ref < 0) { + fprintf(stderr, "Invalid region %s\n", argv[2]); + return 1; + } + buf = bam_plbuf_init(pileup_func, &tmp); // initialize pileup + bam_fetch(tmp.in->x.bam, idx, ref, tmp.beg, tmp.end, buf, fetch_func); + bam_plbuf_push(0, buf); // finalize pileup + bam_index_destroy(idx); + bam_plbuf_destroy(buf); + } + samclose(tmp.in); + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/examples/chk_indel.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,83 @@ +/* To compile, copy this file to the samtools source code directory and compile with: + gcc -g -O2 -Wall chk_indel_rg.c -o chk_indel_rg -Wall -I. -L. -lbam -lz +*/ + +#include <string.h> +#include "bam.h" + +typedef struct { + long cnt[4]; // short:ins, short:del, long:ins, long:del +} rgcnt_t; + +#include "khash.h" +KHASH_MAP_INIT_STR(rgcnt, rgcnt_t) + +#define MAX_LEN 127 +#define Q_THRES 10 +#define L_THRES 6 // short: <=L_THRES; otherwise long + +int main(int argc, char *argv[]) +{ + bamFile fp; + bam1_t *b; + int i, x; + khash_t(rgcnt) *h; + khint_t k; + + if (argc == 1) { + fprintf(stderr, "Usage: chk_indel_rg <in.bam>\n\n"); + fprintf(stderr, "Output: filename, RG, #ins-in-short-homopolymer, #del-in-short, #ins-in-long, #del-in-long\n"); + return 1; + } + + h = kh_init(rgcnt); + fp = bam_open(argv[1], "r"); + bam_header_destroy(bam_header_read(fp)); // we do not need the header + b = bam_init1(); + + while (bam_read1(fp, b) >= 0) { + if (b->core.n_cigar >= 3 && b->core.qual >= Q_THRES) { + const uint8_t *seq; + const uint32_t *cigar = bam1_cigar(b); + char *rg; + for (i = 0; i < b->core.n_cigar; ++i) // check if there are 1bp indels + if (bam_cigar_oplen(cigar[i]) == 1 && (bam_cigar_op(cigar[i]) == BAM_CDEL || bam_cigar_op(cigar[i]) == BAM_CINS)) + break; + if (i == b->core.n_cigar) continue; // no 1bp ins or del + if ((rg = (char*)bam_aux_get(b, "RG")) == 0) continue; // no RG tag + seq = bam1_seq(b); + for (i = x = 0; i < b->core.n_cigar; ++i) { + int op = bam_cigar_op(cigar[i]); + if (bam_cigar_oplen(cigar[i]) == 1 && (op == BAM_CDEL || op == BAM_CINS)) { + int c, j, hrun, which; + c = bam1_seqi(seq, x); + for (j = x + 1, hrun = 0; j < b->core.l_qseq; ++j, ++hrun) // calculate the hompolymer run length + if (bam1_seqi(seq, j) != c) break; + k = kh_get(rgcnt, h, rg + 1); + if (k == kh_end(h)) { // absent + 
char *key = strdup(rg + 1); + k = kh_put(rgcnt, h, key, &c); + memset(&kh_val(h, k), 0, sizeof(rgcnt_t)); + } + which = (hrun <= L_THRES? 0 : 1)<<1 | (op == BAM_CINS? 0 : 1); + ++kh_val(h, k).cnt[which]; + } + if (bam_cigar_type(op)&1) ++x; + } + } + } + + for (k = 0; k != kh_end(h); ++k) { + if (!kh_exist(h, k)) continue; + printf("%s\t%s", argv[1], kh_key(h, k)); + for (i = 0; i < 4; ++i) + printf("\t%ld", kh_val(h, k).cnt[i]); + putchar('\n'); + free((char*)kh_key(h, k)); + } + + bam_destroy1(b); + bam_close(fp); + kh_destroy(rgcnt, h); + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/examples/ex1.fa Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,56 @@ +>seq1 +CACTAGTGGCTCATTGTAAATGTGTGGTTTAACTCGTCCATGGCCCAGCATTAGGGAGCT +GTGGACCCTGCAGCCTGGCTGTGGGGGCCGCAGTGGCTGAGGGGTGCAGAGCCGAGTCAC +GGGGTTGCCAGCACAGGGGCTTAACCTCTGGTGACTGCCAGAGCTGCTGGCAAGCTAGAG +TCCCATTTGGAGCCCCTCTAAGCCGTTCTATTTGTAATGAAAACTATATTTATGCTATTC +AGTTCTAAATATAGAAATTGAAACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCAA +CAACCTTGAGAACCCCAGGGAATTTGTCAATGTCAGGGAAGGAGCATTTTGTCAGTTACC +AAATGTGTTTATTACCAGAGGGATGGAGGGAAGAGGGACGCTGAAGAACTTTGATGCCCT +CTTCTTCCAAAGATGAAACGCGTAACTGCGCTCTCATTCACTCCAGCTCCCTGTCACCCA +ATGGACCTGTGATATCTGGATTCTGGGAAATTCTTCATCCTGGACCCTGAGAGATTCTGC +AGCCCAGCTCCAGATTGCTTGTGGTCTGACAGGCTGCAACTGTGAGCCATCACAATGAAC +AACAGGAAGAAAAGGTCTTTCAAAAGGTGATGTGTGTTCTCATCAACCTCATACACACAC +ATGGTTTAGGGGTATAATACCTCTACATGGCTGATTATGAAAACAATGTTCCCCAGATAC +CATCCCTGTCTTACTTCCAGCTCCCCAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTCT +TTTGGCATTTGCCTTCAGACCCTACACGAATGCGTCTCTACCACAGGGGGCTGCGCGGTT +TCCCATCATGAAGCACTGAACTTCCACGTCTCATCTAGGGGAACAGGGAGGTGCACTAAT +GCGCTCCACGCCCAAGCCCTTCTCACAGTTTCTGCCCCCAGCATGGTTGTACTGGGCAAT +ACATGAGATTATTAGGAAATGCTTTACTGTCATAACTATGAAGAGACTATTGCCAGATGA +ACCACACATTAATACTATGTTTCTTATCTGCACATTACTACCCTGCAATTAATATAATTG +TGTCCATGTACACACGCTGTCCTATGTACTTATCATGACTCTATCCCAAATTCCCAATTA +CGTCCTATCTTCTTCTTAGGGAAGAACAGCTTAGGTATCAATTTGGTGTTCTGTGTAAAG +TCTCAGGGAGCCGTCCGTGTCCTCCCATCTGGCCTCGTCCACACTGGTTCTCTTGAAAGC +TTGGGCTGTAATGATGCCCCTTGGCCATCACCCAGTCCCTGCCCCATCTCTTGTAATCTC +TCTCCTTTTTGCTGCATCCCTGTCTTCCTCTGTCTTGATTTACTTGTTGTTGGTTTTCTG +TTTCTTTGTTTGATTTGGTGGAAGACATAATCCCACGCTTCCTATGGAAAGGTTGTTGGG +AGATTTTTAATGATTCCTCAATGTTAAAATGTCTATTTTTGTCTTGACACCCAACTAATA +TTTGTCTGAGCAAAACAGTCTAGATGAGAGAGAACTTCCCTGGAGGTCTGATGGCGTTTC +TCCCTCGTCTTCTTA +>seq2 +TTCAAATGAACTTCTGTAATTGAAAAATTCATTTAAGAAATTACAAAATATAGTTGAAAG +CTCTAACAATAGACTAAACCAAGCAGAAGAAAGAGGTTCAGAACTTGAAGACAAGTCTCT +TATGAATTAACCCAGTCAGACAAAAATAAAGAAAAAAATTTTAAAAATGAACAGAGCTTT 
+CAAGAAGTATGAGATTATGTAAAGTAACTGAACCTATGAGTCACAGGTATTCCTGAGGAA +AAAGAAAAAGTGAGAAGTTTGGAAAAACTATTTGAGGAAGTAATTGGGGAAAACCTCTTT +AGTCTTGCTAGAGATTTAGACATCTAAATGAAAGAGGCTCAAAGAATGCCAGGAAGATAC +ATTGCAAGACAGACTTCATCAAGATATGTAGTCATCAGACTATCTAAAGTCAACATGAAG +GAAAAAAATTCTAAAATCAGCAAGAGAAAAGCATACAGTCATCTATAAAGGAAATCCCAT +CAGAATAACAATGGGCTTCTCAGCAGAAACCTTACAAGCCAGAAGAGATTGGATCTAATT +TTTGGACTTCTTAAAGAAAAAAAAACCTGTCAAACACGAATGTTATGCCCTGCTAAACTA +AGCATCATAAATGAAGGGGAAATAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGATA +ATTCATCATCACTAAACCAGTCCTATAAGAAATGCTCAAAAGAATTGTAAAAGTCAAAAT +TAAAGTTCAATACTCACCATCATAAATACACACAAAAGTACAAAACTCACAGGTTTTATA +AAACAATTGAGACTACAGAGCAACTAGGTAAAAAATTAACATTACAACAGGAACAAAACC +TCATATATCAATATTAACTTTGAATAAAAAGGGATTAAATTCCCCCACTTAAGAGATATA +GATTGGCAGAACAGATTTAAAAACATGAACTAACTATATGCTGTTTACAAGAAACTCATT +AATAAAGACATGAGTTCAGGTAAAGGGGTGGAAAAAGATGTTCTACGCAAACAGAAACCA +AATGAGAGAAGGAGTAGCTATACTTATATCAGATAAAGCACACTTTAAATCAACAACAGT +AAAATAAAACAAAGGAGGTCATCATACAATGATAAAAAGATCAATTCAGCAAGAAGATAT +AACCATCCTACTAAATACATATGCACCTAACACAAGACTACCCAGATTCATAAAACAAAT +ACTACTAGACCTAAGAGGGATGAGAAATTACCTAATTGGTACAATGTACAATATTCTGAT +GATGGTTACACTAAAAGCCCATACTTTACTGCTACTCAATATATCCATGTAACAAATCTG +CGCTTGTACTTCTAAATCTATAAAAAAATTAAAATTTAACAAAAGTAAATAAAACACATA +GCTAAAACTAAAAAAGCAAAAACAAAAACTATGCTAAGTATTGGTAAAGATGTGGGGAAA +AAAGTAAACTCTCAAATATTGCTAGTGGGAGTATAAATTGTTTTCCACTTTGGAAAACAA +TTTGGTAATTTCGTTTTTTTTTTTTTCTTTTCTCTTTTTTTTTTTTTTTTTTTTGCATGC +CAGAAAAAAATATTTACAGTAACT
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/examples/toy.fa Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,4 @@ +>ref +AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGCGCCAT +>ref2 +aggttttataaaacaattaagtctacagagcaactacgcg
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/examples/toy.sam Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,14 @@ +@SQ SN:ref LN:45 +@SQ SN:ref2 LN:40 +r001 163 ref 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112 +r002 0 ref 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA * +r003 0 ref 9 30 5H6M * 0 0 AGCTAA * +r004 0 ref 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC * +r003 16 ref 29 30 6H5M * 0 0 TAGGC * +r001 83 ref 37 30 9M = 7 -39 CAGCGCCAT * +x1 0 ref2 1 30 20M * 0 0 aggttttataaaacaaataa ???????????????????? +x2 0 ref2 2 30 21M * 0 0 ggttttataaaacaaataatt ????????????????????? +x3 0 ref2 6 30 9M4I13M * 0 0 ttataaaacAAATaattaagtctaca ?????????????????????????? +x4 0 ref2 10 30 25M * 0 0 CaaaTaattaagtctacagagcaac ????????????????????????? +x5 0 ref2 12 30 24M * 0 0 aaTaattaagtctacagagcaact ???????????????????????? +x6 0 ref2 14 30 23M * 0 0 Taattaagtctacagagcaacta ???????????????????????
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/faidx.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,437 @@ +#include <ctype.h> +#include <string.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include "faidx.h" +#include "khash.h" + +typedef struct { + int32_t line_len, line_blen; + int64_t len; + uint64_t offset; +} faidx1_t; +KHASH_MAP_INIT_STR(s, faidx1_t) + +#ifndef _NO_RAZF +#include "razf.h" +#else +#ifdef _WIN32 +#define ftello(fp) ftell(fp) +#define fseeko(fp, offset, whence) fseek(fp, offset, whence) +#else +extern off_t ftello(FILE *stream); +extern int fseeko(FILE *stream, off_t offset, int whence); +#endif +#define RAZF FILE +#define razf_read(fp, buf, size) fread(buf, 1, size, fp) +#define razf_open(fn, mode) fopen(fn, mode) +#define razf_close(fp) fclose(fp) +#define razf_seek(fp, offset, whence) fseeko(fp, offset, whence) +#define razf_tell(fp) ftello(fp) +#endif +#ifdef _USE_KNETFILE +#include "knetfile.h" +#endif + +struct __faidx_t { + RAZF *rz; + int n, m; + char **name; + khash_t(s) *hash; +}; + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset) +{ + khint_t k; + int ret; + faidx1_t t; + if (idx->n == idx->m) { + idx->m = idx->m? 
idx->m<<1 : 16; + idx->name = (char**)realloc(idx->name, sizeof(void*) * idx->m); + } + idx->name[idx->n] = strdup(name); + k = kh_put(s, idx->hash, idx->name[idx->n], &ret); + t.len = len; t.line_len = line_len; t.line_blen = line_blen; t.offset = offset; + kh_value(idx->hash, k) = t; + ++idx->n; +} + +faidx_t *fai_build_core(RAZF *rz) +{ + char c, *name; + int l_name, m_name, ret; + int line_len, line_blen, state; + int l1, l2; + faidx_t *idx; + uint64_t offset; + int64_t len; + + idx = (faidx_t*)calloc(1, sizeof(faidx_t)); + idx->hash = kh_init(s); + name = 0; l_name = m_name = 0; + len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0; + while (razf_read(rz, &c, 1)) { + if (c == '\n') { // an empty line + if (state == 1) { + offset = razf_tell(rz); + continue; + } else if ((state == 0 && len < 0) || state == 2) continue; + } + if (c == '>') { // fasta header + if (len >= 0) + fai_insert_index(idx, name, len, line_len, line_blen, offset); + l_name = 0; + while ((ret = razf_read(rz, &c, 1)) != 0 && !isspace(c)) { + if (m_name < l_name + 2) { + m_name = l_name + 2; + kroundup32(m_name); + name = (char*)realloc(name, m_name); + } + name[l_name++] = c; + } + name[l_name] = '\0'; + if (ret == 0) { + fprintf(stderr, "[fai_build_core] the last entry has no sequence\n"); + free(name); fai_destroy(idx); + return 0; + } + if (c != '\n') while (razf_read(rz, &c, 1) && c != '\n'); + state = 1; len = 0; + offset = razf_tell(rz); + } else { + if (state == 3) { + fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name); + free(name); fai_destroy(idx); + return 0; + } + if (state == 2) state = 3; + l1 = l2 = 0; + do { + ++l1; + if (isgraph(c)) ++l2; + } while ((ret = razf_read(rz, &c, 1)) && c != '\n'); + if (state == 3 && l2) { + fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name); + free(name); fai_destroy(idx); + return 0; + } + ++l1; len += l2; + if (state == 1) line_len = l1, line_blen = 
l2, state = 0; + else if (state == 0) { + if (l1 != line_len || l2 != line_blen) state = 2; + } + } + } + fai_insert_index(idx, name, len, line_len, line_blen, offset); + free(name); + return idx; +} + +void fai_save(const faidx_t *fai, FILE *fp) +{ + khint_t k; + int i; + for (i = 0; i < fai->n; ++i) { + faidx1_t x; + k = kh_get(s, fai->hash, fai->name[i]); + x = kh_value(fai->hash, k); +#ifdef _WIN32 + fprintf(fp, "%s\t%d\t%ld\t%d\t%d\n", fai->name[i], (int)x.len, (long)x.offset, (int)x.line_blen, (int)x.line_len); +#else + fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len); +#endif + } +} + +faidx_t *fai_read(FILE *fp) +{ + faidx_t *fai; + char *buf, *p; + int len, line_len, line_blen; +#ifdef _WIN32 + long offset; +#else + long long offset; +#endif + fai = (faidx_t*)calloc(1, sizeof(faidx_t)); + fai->hash = kh_init(s); + buf = (char*)calloc(0x10000, 1); + while (!feof(fp) && fgets(buf, 0x10000, fp)) { + for (p = buf; *p && isgraph(*p); ++p); + *p = 0; ++p; +#ifdef _WIN32 + sscanf(p, "%d%ld%d%d", &len, &offset, &line_blen, &line_len); +#else + sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len); +#endif + fai_insert_index(fai, buf, len, line_len, line_blen, offset); + } + free(buf); + return fai; +} + +void fai_destroy(faidx_t *fai) +{ + int i; + for (i = 0; i < fai->n; ++i) free(fai->name[i]); + free(fai->name); + kh_destroy(s, fai->hash); + if (fai->rz) razf_close(fai->rz); + free(fai); +} + +int fai_build(const char *fn) +{ + char *str; + RAZF *rz; + FILE *fp; + faidx_t *fai; + str = (char*)calloc(strlen(fn) + 5, 1); + sprintf(str, "%s.fai", fn); + rz = razf_open(fn, "r"); + if (rz == 0) { + fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn); + free(str); + return -1; + } + fai = fai_build_core(rz); + razf_close(rz); + fp = fopen(str, "wb"); + if (fp == 0) { + fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str); + fai_destroy(fai); free(str); + return 
-1; + } + fai_save(fai, fp); + fclose(fp); + free(str); + fai_destroy(fai); + return 0; +} + +#ifdef _USE_KNETFILE +FILE *download_and_open(const char *fn) +{ + const int buf_size = 1 * 1024 * 1024; + uint8_t *buf; + FILE *fp; + knetFile *fp_remote; + const char *url = fn; + const char *p; + int l = strlen(fn); + for (p = fn + l - 1; p >= fn; --p) + if (*p == '/') break; + fn = p + 1; + + // First try to open a local copy + fp = fopen(fn, "r"); + if (fp) + return fp; + + // If failed, download from remote and open + fp_remote = knet_open(url, "rb"); + if (fp_remote == 0) { + fprintf(stderr, "[download_from_remote] fail to open remote file %s\n",url); + return NULL; + } + if ((fp = fopen(fn, "wb")) == 0) { + fprintf(stderr, "[download_from_remote] fail to create file in the working directory %s\n",fn); + knet_close(fp_remote); + return NULL; + } + buf = (uint8_t*)calloc(buf_size, 1); + while ((l = knet_read(fp_remote, buf, buf_size)) != 0) + fwrite(buf, 1, l, fp); + free(buf); + fclose(fp); + knet_close(fp_remote); + + return fopen(fn, "r"); +} +#endif + +faidx_t *fai_load(const char *fn) +{ + char *str; + FILE *fp; + faidx_t *fai; + str = (char*)calloc(strlen(fn) + 5, 1); + sprintf(str, "%s.fai", fn); + +#ifdef _USE_KNETFILE + if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn) + { + fp = download_and_open(str); + if ( !fp ) + { + fprintf(stderr, "[fai_load] failed to open remote FASTA index %s\n", str); + free(str); + return 0; + } + } + else +#endif + fp = fopen(str, "rb"); + if (fp == 0) { + fprintf(stderr, "[fai_load] build FASTA index.\n"); + fai_build(fn); + fp = fopen(str, "rb"); + if (fp == 0) { + fprintf(stderr, "[fai_load] fail to open FASTA index.\n"); + free(str); + return 0; + } + } + + fai = fai_read(fp); + fclose(fp); + + fai->rz = razf_open(fn, "rb"); + free(str); + if (fai->rz == 0) { + fprintf(stderr, "[fai_load] fail to open FASTA file.\n"); + return 0; + } + return fai; +} + +char *fai_fetch(const faidx_t *fai, const char *str, int 
*len) +{ + char *s, c; + int i, l, k, name_end; + khiter_t iter; + faidx1_t val; + khash_t(s) *h; + int beg, end; + + beg = end = -1; + h = fai->hash; + name_end = l = strlen(str); + s = (char*)malloc(l+1); + // remove space + for (i = k = 0; i < l; ++i) + if (!isspace(str[i])) s[k++] = str[i]; + s[k] = 0; l = k; + // determine the sequence name + for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end + if (i >= 0) name_end = i; + if (name_end < l) { // check if this is really the end + int n_hyphen = 0; + for (i = name_end + 1; i < l; ++i) { + if (s[i] == '-') ++n_hyphen; + else if (!isdigit(s[i]) && s[i] != ',') break; + } + if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name + s[name_end] = 0; + iter = kh_get(s, h, s); + if (iter == kh_end(h)) { // cannot find the sequence name + iter = kh_get(s, h, str); // try str as the name + if (iter == kh_end(h)) { + *len = 0; + free(s); return 0; + } else s[name_end] = ':', name_end = l; + } + } else iter = kh_get(s, h, str); + if(iter == kh_end(h)) { + fprintf(stderr, "[fai_fetch] Warning - Reference %s not found in FASTA file, returning empty sequence\n", str); + free(s); + return 0; + }; + val = kh_value(h, iter); + // parse the interval + if (name_end < l) { + for (i = k = name_end + 1; i < l; ++i) + if (s[i] != ',') s[k++] = s[i]; + s[k] = 0; + beg = atoi(s + name_end + 1); + for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break; + end = i < k? 
atoi(s + i + 1) : val.len; + if (beg > 0) --beg; + } else beg = 0, end = val.len; + if (beg >= val.len) beg = val.len; + if (end >= val.len) end = val.len; + if (beg > end) beg = end; + free(s); + + // now retrieve the sequence + l = 0; + s = (char*)malloc(end - beg + 2); + razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET); + while (razf_read(fai->rz, &c, 1) == 1 && l < end - beg && !fai->rz->z_err) + if (isgraph(c)) s[l++] = c; + s[l] = '\0'; + *len = l; + return s; +} + +int faidx_main(int argc, char *argv[]) +{ + if (argc == 1) { + fprintf(stderr, "Usage: faidx <in.fasta> [<reg> [...]]\n"); + return 1; + } else { + if (argc == 2) fai_build(argv[1]); + else { + int i, j, k, l; + char *s; + faidx_t *fai; + fai = fai_load(argv[1]); + if (fai == 0) return 1; + for (i = 2; i != argc; ++i) { + printf(">%s\n", argv[i]); + s = fai_fetch(fai, argv[i], &l); + for (j = 0; j < l; j += 60) { + for (k = 0; k < 60 && k < l - j; ++k) + putchar(s[j + k]); + putchar('\n'); + } + free(s); + } + fai_destroy(fai); + } + } + return 0; +} + +int faidx_fetch_nseq(const faidx_t *fai) +{ + return fai->n; +} + +char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len) +{ + int l; + char c; + khiter_t iter; + faidx1_t val; + char *seq=NULL; + + // Adjust position + iter = kh_get(s, fai->hash, c_name); + if(iter == kh_end(fai->hash)) return 0; + val = kh_value(fai->hash, iter); + if(p_end_i < p_beg_i) p_beg_i = p_end_i; + if(p_beg_i < 0) p_beg_i = 0; + else if(val.len <= p_beg_i) p_beg_i = val.len - 1; + if(p_end_i < 0) p_end_i = 0; + else if(val.len <= p_end_i) p_end_i = val.len - 1; + + // Now retrieve the sequence + l = 0; + seq = (char*)malloc(p_end_i - p_beg_i + 2); + razf_seek(fai->rz, val.offset + p_beg_i / val.line_blen * val.line_len + p_beg_i % val.line_blen, SEEK_SET); + while (razf_read(fai->rz, &c, 1) == 1 && l < p_end_i - p_beg_i + 1) + if (isgraph(c)) seq[l++] = c; + seq[l] = '\0'; + *len = l; 
+ return seq; +} + +#ifdef FAIDX_MAIN +int main(int argc, char *argv[]) { return faidx_main(argc, argv); } +#endif
/* File: PsiCLASS-1.0.2/samtools-0.1.19/faidx.h */
/* The MIT License

   Copyright (c) 2008 Genome Research Ltd (GRL).

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

/* Contact: Heng Li <lh3@sanger.ac.uk> */

#ifndef FAIDX_H
#define FAIDX_H

/*!
  @header

  Index FASTA files and extract subsequence.

  @copyright The Wellcome Trust Sanger Institute.
 */

struct __faidx_t;
typedef struct __faidx_t faidx_t;

#ifdef __cplusplus
extern "C" {
#endif

    /*!
      @abstract   Build index for a FASTA or razip compressed FASTA file.
      @param  fn  FASTA file name
      @return     0 on success; or -1 on failure
      @discussion File "fn.fai" will be generated.
     */
    int fai_build(const char *fn);

    /*!
      @abstract    Destroy a faidx_t struct.
      @param  fai  Pointer to the struct to be destroyed
     */
    void fai_destroy(faidx_t *fai);

    /*!
      @abstract   Load index from "fn.fai".
      @param  fn  File name of the FASTA file
      @return     Pointer to the loaded index; null on failure
     */
    faidx_t *fai_load(const char *fn);

    /*!
      @abstract    Fetch the sequence in a region.
      @param  fai  Pointer to the faidx_t struct
      @param  reg  Region in the format "chr2:20,000-30,000"
      @param  len  Length of the region
      @return      Pointer to the sequence; null on failure

      @discussion The returned sequence is allocated by malloc family
      and should be destroyed by end users by calling free() on it.
     */
    char *fai_fetch(const faidx_t *fai, const char *reg, int *len);

    /*!
      @abstract    Fetch the number of sequences.
      @param  fai  Pointer to the faidx_t struct
      @return      The number of sequences
     */
    int faidx_fetch_nseq(const faidx_t *fai);

    /*!
      @abstract    Fetch the sequence in a region.
      @param  fai  Pointer to the faidx_t struct
      @param  c_name Region name
      @param  p_beg_i  Beginning position number (zero-based)
      @param  p_end_i  End position number (zero-based)
      @param  len  Length of the region
      @return      Pointer to the sequence; null on failure

      @discussion The returned sequence is allocated by malloc family
      and should be destroyed by end users by calling free() on it.
     */
    char *faidx_fetch_seq(const faidx_t *fai, char *c_name, int p_beg_i, int p_end_i, int *len);

#ifdef __cplusplus
}
#endif

#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/kaln.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,486 @@ +/* The MIT License + + Copyright (c) 2003-2006, 2008, 2009, by Heng Li <lh3lh3@gmail.com> + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <stdint.h> +#include <math.h> +#include "kaln.h" + +#define FROM_M 0 +#define FROM_I 1 +#define FROM_D 2 + +typedef struct { + int i, j; + unsigned char ctype; +} path_t; + +int aln_sm_blosum62[] = { +/* A R N D C Q E G H I L K M F P S T W Y V * X */ + 4,-1,-2,-2, 0,-1,-1, 0,-2,-1,-1,-1,-1,-2,-1, 1, 0,-3,-2, 0,-4, 0, + -1, 5, 0,-2,-3, 1, 0,-2, 0,-3,-2, 2,-1,-3,-2,-1,-1,-3,-2,-3,-4,-1, + -2, 0, 6, 1,-3, 0, 0, 0, 1,-3,-3, 0,-2,-3,-2, 1, 0,-4,-2,-3,-4,-1, + -2,-2, 1, 6,-3, 0, 2,-1,-1,-3,-4,-1,-3,-3,-1, 0,-1,-4,-3,-3,-4,-1, + 0,-3,-3,-3, 9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-4,-2, + -1, 1, 0, 0,-3, 5, 2,-2, 0,-3,-2, 1, 0,-3,-1, 0,-1,-2,-1,-2,-4,-1, + -1, 0, 0, 2,-4, 2, 5,-2, 0,-3,-3, 1,-2,-3,-1, 0,-1,-3,-2,-2,-4,-1, + 0,-2, 0,-1,-3,-2,-2, 6,-2,-4,-4,-2,-3,-3,-2, 0,-2,-2,-3,-3,-4,-1, + -2, 0, 1,-1,-3, 0, 0,-2, 8,-3,-3,-1,-2,-1,-2,-1,-2,-2, 2,-3,-4,-1, + -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, 2,-3, 1, 0,-3,-2,-1,-3,-1, 3,-4,-1, + -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,-2, 2, 0,-3,-2,-1,-2,-1, 1,-4,-1, + -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,-1,-3,-1, 0,-1,-3,-2,-2,-4,-1, + -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, 0,-2,-1,-1,-1,-1, 1,-4,-1, + -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,-4,-2,-2, 1, 3,-1,-4,-1, + -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,-1,-1,-4,-3,-2,-4,-2, + 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, 1,-3,-2,-2,-4, 0, + 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,-2,-2, 0,-4, 0, + -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, 2,-3,-4,-2, + -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,-1,-4,-1, + 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,-4,-1, + -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4, 1,-4, + 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1 +}; + +int aln_sm_blast[] = { + 1, -3, -3, -3, -2, + -3, 1, -3, -3, -2, + -3, -3, 1, -3, -2, + -3, -3, -3, 1, -2, + -2, -2, -2, -2, -2 +}; + +int aln_sm_qual[] 
= { + 0, -23, -23, -23, 0, + -23, 0, -23, -23, 0, + -23, -23, 0, -23, 0, + -23, -23, -23, 0, 0, + 0, 0, 0, 0, 0 +}; + +ka_param_t ka_param_blast = { 5, 2, 5, 2, aln_sm_blast, 5, 50 }; +ka_param_t ka_param_aa2aa = { 10, 2, 10, 2, aln_sm_blosum62, 22, 50 }; + +ka_param2_t ka_param2_qual = { 37, 11, 37, 11, 37, 11, 0, 0, aln_sm_qual, 5, 50 }; + +static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar) +{ + int i, n; + uint32_t *cigar; + unsigned char last_type; + + if (path_len == 0 || path == 0) { + *n_cigar = 0; + return 0; + } + + last_type = path->ctype; + for (i = n = 1; i < path_len; ++i) { + if (last_type != path[i].ctype) ++n; + last_type = path[i].ctype; + } + *n_cigar = n; + cigar = (uint32_t*)calloc(*n_cigar, 4); + + cigar[0] = 1u << 4 | path[path_len-1].ctype; + last_type = path[path_len-1].ctype; + for (i = path_len - 2, n = 0; i >= 0; --i) { + if (path[i].ctype == last_type) cigar[n] += 1u << 4; + else { + cigar[++n] = 1u << 4 | path[i].ctype; + last_type = path[i].ctype; + } + } + + return cigar; +} + +/***************************/ +/* START OF common_align.c */ +/***************************/ + +#define SET_INF(s) (s).M = (s).I = (s).D = MINOR_INF; + +#define set_M(MM, cur, p, sc) \ +{ \ + if ((p)->M >= (p)->I) { \ + if ((p)->M >= (p)->D) { \ + (MM) = (p)->M + (sc); (cur)->Mt = FROM_M; \ + } else { \ + (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \ + } \ + } else { \ + if ((p)->I > (p)->D) { \ + (MM) = (p)->I + (sc); (cur)->Mt = FROM_I; \ + } else { \ + (MM) = (p)->D + (sc); (cur)->Mt = FROM_D; \ + } \ + } \ +} +#define set_I(II, cur, p) \ +{ \ + if ((p)->M - gap_open > (p)->I) { \ + (cur)->It = FROM_M; \ + (II) = (p)->M - gap_open - gap_ext; \ + } else { \ + (cur)->It = FROM_I; \ + (II) = (p)->I - gap_ext; \ + } \ +} +#define set_end_I(II, cur, p) \ +{ \ + if (gap_end_ext >= 0) { \ + if ((p)->M - gap_end_open > (p)->I) { \ + (cur)->It = FROM_M; \ + (II) = (p)->M - gap_end_open - gap_end_ext; \ + } else { \ + (cur)->It = FROM_I; \ + 
(II) = (p)->I - gap_end_ext; \ + } \ + } else set_I(II, cur, p); \ +} +#define set_D(DD, cur, p) \ +{ \ + if ((p)->M - gap_open > (p)->D) { \ + (cur)->Dt = FROM_M; \ + (DD) = (p)->M - gap_open - gap_ext; \ + } else { \ + (cur)->Dt = FROM_D; \ + (DD) = (p)->D - gap_ext; \ + } \ +} +#define set_end_D(DD, cur, p) \ +{ \ + if (gap_end_ext >= 0) { \ + if ((p)->M - gap_end_open > (p)->D) { \ + (cur)->Dt = FROM_M; \ + (DD) = (p)->M - gap_end_open - gap_end_ext; \ + } else { \ + (cur)->Dt = FROM_D; \ + (DD) = (p)->D - gap_end_ext; \ + } \ + } else set_D(DD, cur, p); \ +} + +typedef struct { + uint8_t Mt:3, It:2, Dt:3; +} dpcell_t; + +typedef struct { + int M, I, D; +} dpscore_t; + +/*************************** + * banded global alignment * + ***************************/ +uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar) +{ + int i, j; + dpcell_t **dpcell, *q; + dpscore_t *curr, *last, *s; + int b1, b2, tmp_end; + int *mat, end, max = 0; + uint8_t type, ctype; + uint32_t *cigar = 0; + + int gap_open, gap_ext, gap_end_open, gap_end_ext, b; + int *score_matrix, N_MATRIX_ROW; + + /* initialize some align-related parameters. just for compatibility */ + gap_open = ap->gap_open; + gap_ext = ap->gap_ext; + gap_end_open = ap->gap_end_open; + gap_end_ext = ap->gap_end_ext; + b = ap->band_width; + score_matrix = ap->matrix; + N_MATRIX_ROW = ap->row; + + if (n_cigar) *n_cigar = 0; + if (len1 == 0 || len2 == 0) return 0; + + /* calculate b1 and b2 */ + if (len1 > len2) { + b1 = len1 - len2 + b; + b2 = b; + } else { + b1 = b; + b2 = len2 - len1 + b; + } + if (b1 > len1) b1 = len1; + if (b2 > len2) b2 = len2; + --seq1; --seq2; + + /* allocate memory */ + end = (b1 + b2 <= len1)? 
(b1 + b2 + 1) : (len1 + 1); + dpcell = (dpcell_t**)malloc(sizeof(dpcell_t*) * (len2 + 1)); + for (j = 0; j <= len2; ++j) + dpcell[j] = (dpcell_t*)malloc(sizeof(dpcell_t) * end); + for (j = b2 + 1; j <= len2; ++j) + dpcell[j] -= j - b2; + curr = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1)); + last = (dpscore_t*)malloc(sizeof(dpscore_t) * (len1 + 1)); + + /* set first row */ + SET_INF(*curr); curr->M = 0; + for (i = 1, s = curr + 1; i < b1; ++i, ++s) { + SET_INF(*s); + set_end_D(s->D, dpcell[0] + i, s - 1); + } + s = curr; curr = last; last = s; + + /* core dynamic programming, part 1 */ + tmp_end = (b2 < len2)? b2 : len2 - 1; + for (j = 1; j <= tmp_end; ++j) { + q = dpcell[j]; s = curr; SET_INF(*s); + set_end_I(s->I, q, last); + end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1; + mat = score_matrix + seq2[j] * N_MATRIX_ROW; + ++s; ++q; + for (i = 1; i != end; ++i, ++s, ++q) { + set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! */ + set_I(s->I, q, last + i); + set_D(s->D, q, s - 1); + } + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_D(s->D, q, s - 1); + if (j + b1 - 1 > len1) { /* bug fixed, 040227 */ + set_end_I(s->I, q, last + i); + } else s->I = MINOR_INF; + s = curr; curr = last; last = s; + } + /* last row for part 1, use set_end_D() instead of set_D() */ + if (j == len2 && b2 != len2 - 1) { + q = dpcell[j]; s = curr; SET_INF(*s); + set_end_I(s->I, q, last); + end = (j + b1 <= len1 + 1)? (j + b1 - 1) : len1; + mat = score_matrix + seq2[j] * N_MATRIX_ROW; + ++s; ++q; + for (i = 1; i != end; ++i, ++s, ++q) { + set_M(s->M, q, last + i - 1, mat[seq1[i]]); /* this will change s->M ! 
*/ + set_I(s->I, q, last + i); + set_end_D(s->D, q, s - 1); + } + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_end_D(s->D, q, s - 1); + if (j + b1 - 1 > len1) { /* bug fixed, 040227 */ + set_end_I(s->I, q, last + i); + } else s->I = MINOR_INF; + s = curr; curr = last; last = s; + ++j; + } + + /* core dynamic programming, part 2 */ + for (; j <= len2 - b2 + 1; ++j) { + SET_INF(curr[j - b2]); + mat = score_matrix + seq2[j] * N_MATRIX_ROW; + end = j + b1 - 1; + for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i != end; ++i, ++s, ++q) { + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_I(s->I, q, last + i); + set_D(s->D, q, s - 1); + } + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_D(s->D, q, s - 1); + s->I = MINOR_INF; + s = curr; curr = last; last = s; + } + + /* core dynamic programming, part 3 */ + for (; j < len2; ++j) { + SET_INF(curr[j - b2]); + mat = score_matrix + seq2[j] * N_MATRIX_ROW; + for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) { + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_I(s->I, q, last + i); + set_D(s->D, q, s - 1); + } + set_M(s->M, q, last + len1 - 1, mat[seq1[i]]); + set_end_I(s->I, q, last + i); + set_D(s->D, q, s - 1); + s = curr; curr = last; last = s; + } + /* last row */ + if (j == len2) { + SET_INF(curr[j - b2]); + mat = score_matrix + seq2[j] * N_MATRIX_ROW; + for (i = j - b2 + 1, q = dpcell[j] + i, s = curr + i; i < len1; ++i, ++s, ++q) { + set_M(s->M, q, last + i - 1, mat[seq1[i]]); + set_I(s->I, q, last + i); + set_end_D(s->D, q, s - 1); + } + set_M(s->M, q, last + len1 - 1, mat[seq1[i]]); + set_end_I(s->I, q, last + i); + set_end_D(s->D, q, s - 1); + s = curr; curr = last; last = s; + } + + *_score = last[len1].M; + if (n_cigar) { /* backtrace */ + path_t *p, *path = (path_t*)malloc(sizeof(path_t) * (len1 + len2 + 2)); + i = len1; j = len2; + q = dpcell[j] + i; + s = last + len1; + max = s->M; type = q->Mt; ctype = FROM_M; + if (s->I > max) { max = s->I; type = q->It; ctype = 
FROM_I; } + if (s->D > max) { max = s->D; type = q->Dt; ctype = FROM_D; } + + p = path; + p->ctype = ctype; p->i = i; p->j = j; /* bug fixed 040408 */ + ++p; + do { + switch (ctype) { + case FROM_M: --i; --j; break; + case FROM_I: --j; break; + case FROM_D: --i; break; + } + q = dpcell[j] + i; + ctype = type; + switch (type) { + case FROM_M: type = q->Mt; break; + case FROM_I: type = q->It; break; + case FROM_D: type = q->Dt; break; + } + p->ctype = ctype; p->i = i; p->j = j; + ++p; + } while (i || j); + cigar = ka_path2cigar32(path, p - path - 1, n_cigar); + free(path); + } + + /* free memory */ + for (j = b2 + 1; j <= len2; ++j) + dpcell[j] += j - b2; + for (j = 0; j <= len2; ++j) + free(dpcell[j]); + free(dpcell); + free(curr); free(last); + + return cigar; +} + +typedef struct { + int M, I, D; +} score_aux_t; + +#define MINUS_INF -0x40000000 + +// matrix: len2 rows and len1 columns +int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap) +{ + +#define __score_aux(_p, _q0, _sc, _io, _ie, _do, _de) { \ + int t1, t2; \ + score_aux_t *_q; \ + _q = _q0; \ + _p->M = _q->M >= _q->I? _q->M : _q->I; \ + _p->M = _p->M >= _q->D? _p->M : _q->D; \ + _p->M += (_sc); \ + ++_q; t1 = _q->M - _io - _ie; t2 = _q->I - _ie; _p->I = t1 >= t2? t1 : t2; \ + _q = _p-1; t1 = _q->M - _do - _de; t2 = _q->D - _de; _p->D = t1 >= t2? t1 : t2; \ + } + + int i, j, bw, scmat_size = ap->row, *scmat = ap->matrix, ret; + const uint8_t *seq1, *seq2; + score_aux_t *curr, *last, *swap; + bw = abs(len1 - len2) + ap->band_width; + i = len1 > len2? len1 : len2; + if (bw > i + 1) bw = i + 1; + seq1 = _seq1 - 1; seq2 = _seq2 - 1; + curr = calloc(len1 + 2, sizeof(score_aux_t)); + last = calloc(len1 + 2, sizeof(score_aux_t)); + { // the zero-th row + int x, end = len1; + score_aux_t *p; + j = 0; + x = j + bw; end = len1 < x? 
len1 : x; // band end + p = curr; + p->M = 0; p->I = p->D = MINUS_INF; + for (i = 1, p = &curr[1]; i <= end; ++i, ++p) + p->M = p->I = MINUS_INF, p->D = -(ap->edo + ap->ede * i); + p->M = p->I = p->D = MINUS_INF; + swap = curr; curr = last; last = swap; + } + for (j = 1; j < len2; ++j) { + int x, beg = 0, end = len1, *scrow, col_end; + score_aux_t *p; + x = j - bw; beg = 0 > x? 0 : x; // band start + x = j + bw; end = len1 < x? len1 : x; // band end + if (beg == 0) { // from zero-th column + p = curr; + p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j); + ++beg; // then beg = 1 + } + scrow = scmat + seq2[j] * scmat_size; + if (end == len1) col_end = 1, --end; + else col_end = 0; + for (i = beg, p = &curr[beg]; i <= end; ++i, ++p) + __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->ido, ap->ide); + if (col_end) { + __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->ido, ap->ide); + ++p; + } + p->M = p->I = p->D = MINUS_INF; +// for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n'); + swap = curr; curr = last; last = swap; + } + { // the last row + int x, beg = 0, *scrow; + score_aux_t *p; + j = len2; + x = j - bw; beg = 0 > x? 0 : x; // band start + if (beg == 0) { // from zero-th column + p = curr; + p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j); + ++beg; // then beg = 1 + } + scrow = scmat + seq2[j] * scmat_size; + for (i = beg, p = &curr[beg]; i < len1; ++i, ++p) + __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->edo, ap->ede); + __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->edo, ap->ede); +// for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n'); + } + ret = curr[len1].M >= curr[len1].I? curr[len1].M : curr[len1].I; + ret = ret >= curr[len1].D? 
ret : curr[len1].D; + free(curr); free(last); + return ret; +} + +#ifdef _MAIN +int main(int argc, char *argv[]) +{ +// int len1 = 35, len2 = 35; +// uint8_t *seq1 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\0\1"; +// uint8_t *seq2 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\1\0"; + int len1 = 4, len2 = 4; + uint8_t *seq1 = (uint8_t*)"\1\0\0\1"; + uint8_t *seq2 = (uint8_t*)"\1\0\1\0"; + int sc; +// ka_global_core(seq1, 2, seq2, 1, &ka_param_qual, &sc, 0); + sc = ka_global_score(seq1, len1, seq2, len2, &ka_param2_qual); + printf("%d\n", sc); + return 0; +} +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/kaln.h Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,67 @@ +/* The MIT License + + Copyright (c) 2003-2006, 2008, 2009 by Heng Li <lh3@live.co.uk> + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
+*/ + +#ifndef LH3_KALN_H_ +#define LH3_KALN_H_ + +#include <stdint.h> + +#define MINOR_INF -1073741823 + +typedef struct { + int gap_open; + int gap_ext; + int gap_end_open; + int gap_end_ext; + + int *matrix; + int row; + int band_width; +} ka_param_t; + +typedef struct { + int iio, iie, ido, ide; + int eio, eie, edo, ede; + int *matrix; + int row; + int band_width; +} ka_param2_t; + +#ifdef __cplusplus +extern "C" { +#endif + + uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, + int *_score, int *n_cigar); + int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap); +#ifdef __cplusplus +} +#endif + +extern ka_param_t ka_param_blast; /* = { 5, 2, 5, 2, aln_sm_blast, 5, 50 }; */ +extern ka_param_t ka_param_qual; // only use this for global alignment!!! +extern ka_param2_t ka_param2_qual; // only use this for global alignment!!! + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/khash.h Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,528 @@ +/* The MIT License + + Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk> + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + An example: + +#include "khash.h" +KHASH_MAP_INIT_INT(32, char) +int main() { + int ret, is_missing; + khiter_t k; + khash_t(32) *h = kh_init(32); + k = kh_put(32, h, 5, &ret); + if (!ret) kh_del(32, h, k); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; +} +*/ + +/* + 2011-02-14 (0.2.5): + + * Allow to declare global functions. 
+ + 2009-09-26 (0.2.4): + + * Improve portability + + 2008-09-19 (0.2.3): + + * Corrected the example + * Improved interfaces + + 2008-09-11 (0.2.2): + + * Improved speed a little in kh_put() + + 2008-09-10 (0.2.1): + + * Added kh_clear() + * Fixed a compiling error + + 2008-09-02 (0.2.0): + + * Changed to token concatenation which increases flexibility. + + 2008-08-31 (0.1.2): + + * Fixed a bug in kh_get(), which has not been tested previously. + + 2008-08-31 (0.1.1): + + * Added destructor +*/ + + +#ifndef __AC_KHASH_H +#define __AC_KHASH_H + +/*! + @header + + Generic hash table library. + + @copyright Heng Li + */ + +#define AC_VERSION_KHASH_H "0.2.5" + +#include <stdlib.h> +#include <string.h> +#include <limits.h> + +/* compipler specific configuration */ + +#if UINT_MAX == 0xffffffffu +typedef unsigned int khint32_t; +#elif ULONG_MAX == 0xffffffffu +typedef unsigned long khint32_t; +#endif + +#if ULONG_MAX == ULLONG_MAX +typedef unsigned long khint64_t; +#else +typedef unsigned long long khint64_t; +#endif + +#ifdef _MSC_VER +#define inline __inline +#endif + +typedef khint32_t khint_t; +typedef khint_t khiter_t; + +#define __ac_HASH_PRIME_SIZE 32 +static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = +{ + 0ul, 3ul, 11ul, 23ul, 53ul, + 97ul, 193ul, 389ul, 769ul, 1543ul, + 3079ul, 6151ul, 12289ul, 24593ul, 49157ul, + 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul, + 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul, + 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul, + 3221225473ul, 4294967291ul +}; + +#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) +#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) +#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) +#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) +#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) +#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) +#define 
__ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) + +static const double __ac_HASH_UPPER = 0.77; + +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; \ + extern kh_##name##_t *kh_init_##name(); \ + extern void kh_destroy_##name(kh_##name##_t *h); \ + extern void kh_clear_##name(kh_##name##_t *h); \ + extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ + extern void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ + extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ + extern void kh_del_##name(kh_##name##_t *h, khint_t x); + +#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; \ + SCOPE kh_##name##_t *kh_init_##name() { \ + return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ + } \ + SCOPE void kh_destroy_##name(kh_##name##_t *h) \ + { \ + if (h) { \ + free(h->keys); free(h->flags); \ + free(h->vals); \ + free(h); \ + } \ + } \ + SCOPE void kh_clear_##name(kh_##name##_t *h) \ + { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(khint32_t)); \ + h->size = h->n_occupied = 0; \ + } \ + } \ + SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + { \ + if (h->n_buckets) { \ + khint_t inc, k, i, last; \ + k = __hash_func(key); i = k % h->n_buckets; \ + inc = 1 + k % (h->n_buckets - 1); last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ + else i += inc; \ + if (i == last) return h->n_buckets; \ + } \ + return __ac_iseither(h->flags, i)? 
h->n_buckets : i; \ + } else return 0; \ + } \ + SCOPE void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { \ + khint32_t *new_flags = 0; \ + khint_t j = 1; \ + { \ + khint_t t = __ac_HASH_PRIME_SIZE - 1; \ + while (__ac_prime_list[t] > new_n_buckets) --t; \ + new_n_buckets = __ac_prime_list[t+1]; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \ + else { \ + new_flags = (khint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ + memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ + if (h->n_buckets < new_n_buckets) { \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) \ + h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + } \ + } \ + if (j) { \ + for (j = 0; j != h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + if (kh_is_map) val = h->vals[j]; \ + __ac_set_isdel_true(h->flags, j); \ + while (1) { \ + khint_t inc, k, i; \ + k = __hash_func(key); \ + i = k % new_n_buckets; \ + inc = 1 + k % (new_n_buckets - 1); \ + while (!__ac_isempty(new_flags, i)) { \ + if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \ + else i += inc; \ + } \ + __ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \ + { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ + if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ + __ac_set_isdel_true(h->flags, i); \ + } else { \ + h->keys[i] = key; \ + if (kh_is_map) h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } \ + if (h->n_buckets > new_n_buckets) { \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) \ + h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + free(h->flags); \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; \ + 
h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ + } \ + } \ + SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + { \ + khint_t x; \ + if (h->n_occupied >= h->upper_bound) { \ + if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \ + else kh_resize_##name(h, h->n_buckets + 1); \ + } \ + { \ + khint_t inc, k, i, site, last; \ + x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \ + if (__ac_isempty(h->flags, i)) x = i; \ + else { \ + inc = 1 + k % (h->n_buckets - 1); last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (__ac_isdel(h->flags, i)) site = i; \ + if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \ + else i += inc; \ + if (i == last) { x = site; break; } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ + else x = i; \ + } \ + } \ + } \ + if (__ac_isempty(h->flags, x)) { \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; ++h->n_occupied; \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + *ret = 2; \ + } else *ret = 0; \ + return x; \ + } \ + SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ + { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; \ + } \ + } + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + KHASH_INIT2(name, static inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +/* --- BEGIN OF HASH FUNCTIONS --- */ + +/*! @function + @abstract Integer hash function + @param key The integer [khint32_t] + @return The hash value [khint_t] + */ +#define kh_int_hash_func(key) (khint32_t)(key) +/*! @function + @abstract Integer comparison function + */ +#define kh_int_hash_equal(a, b) ((a) == (b)) +/*! 
@function + @abstract 64-bit integer hash function + @param key The integer [khint64_t] + @return The hash value [khint_t] + */ +#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) +/*! @function + @abstract 64-bit integer comparison function + */ +#define kh_int64_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract const char* hash function + @param s Pointer to a null terminated string + @return The hash value + */ +static inline khint_t __ac_X31_hash_string(const char *s) +{ + khint_t h = *s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; + return h; +} +/*! @function + @abstract Another interface to const char* hash function + @param key Pointer to a null terminated string [const char*] + @return The hash value [khint_t] + */ +#define kh_str_hash_func(key) __ac_X31_hash_string(key) +/*! @function + @abstract Const char* comparison function + */ +#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) + +/* --- END OF HASH FUNCTIONS --- */ + +/* Other necessary macros... */ + +/*! + @abstract Type of the hash table. + @param name Name of the hash table [symbol] + */ +#define khash_t(name) kh_##name##_t + +/*! @function + @abstract Initiate a hash table. + @param name Name of the hash table [symbol] + @return Pointer to the hash table [khash_t(name)*] + */ +#define kh_init(name) kh_init_##name() + +/*! @function + @abstract Destroy a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_destroy(name, h) kh_destroy_##name(h) + +/*! @function + @abstract Reset a hash table without deallocating memory. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_clear(name, h) kh_clear_##name(h) + +/*! @function + @abstract Resize a hash table. 
+ @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param s New size [khint_t] + */ +#define kh_resize(name, h, s) kh_resize_##name(h, s) + +/*! @function + @abstract Insert a key to the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @param r Extra return code: 0 if the key is present in the hash table; + 1 if the bucket is empty (never used); 2 if the element in + the bucket has been deleted [int*] + @return Iterator to the inserted element [khint_t] + */ +#define kh_put(name, h, k, r) kh_put_##name(h, k, r) + +/*! @function + @abstract Retrieve a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @return Iterator to the found element, or kh_end(h) is the element is absent [khint_t] + */ +#define kh_get(name, h, k) kh_get_##name(h, k) + +/*! @function + @abstract Remove a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Iterator to the element to be deleted [khint_t] + */ +#define kh_del(name, h, k) kh_del_##name(h, k) + + +/*! @function + @abstract Test whether a bucket contains data. + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return 1 if containing data; 0 otherwise [int] + */ +#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) + +/*! @function + @abstract Get key given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Key [type of keys] + */ +#define kh_key(h, x) ((h)->keys[x]) + +/*! 
@function + @abstract Get value given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Value [type of values] + @discussion For hash sets, calling this results in segfault. + */ +#define kh_val(h, x) ((h)->vals[x]) + +/*! @function + @abstract Alias of kh_val() + */ +#define kh_value(h, x) ((h)->vals[x]) + +/*! @function + @abstract Get the start iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The start iterator [khint_t] + */ +#define kh_begin(h) (khint_t)(0) + +/*! @function + @abstract Get the end iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The end iterator [khint_t] + */ +#define kh_end(h) ((h)->n_buckets) + +/*! @function + @abstract Get the number of elements in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of elements in the hash table [khint_t] + */ +#define kh_size(h) ((h)->size) + +/*! @function + @abstract Get the number of buckets in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of buckets in the hash table [khint_t] + */ +#define kh_n_buckets(h) ((h)->n_buckets) + +/* More conenient interfaces */ + +/*! @function + @abstract Instantiate a hash set containing integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT(name) \ + KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT(name, khval_t) \ + KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +/*! 
@function + @abstract Instantiate a hash set containing 64-bit integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT64(name) \ + KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT64(name, khval_t) \ + KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + +typedef const char *kh_cstr_t; +/*! @function + @abstract Instantiate a hash set containing const char* keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_STR(name) \ + KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_STR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) + +#endif /* __AC_KHASH_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/klist.h Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,96 @@ +#ifndef _LH3_KLIST_H +#define _LH3_KLIST_H + +#include <stdlib.h> + +#define KMEMPOOL_INIT(name, kmptype_t, kmpfree_f) \ + typedef struct { \ + size_t cnt, n, max; \ + kmptype_t **buf; \ + } kmp_##name##_t; \ + static inline kmp_##name##_t *kmp_init_##name() { \ + return calloc(1, sizeof(kmp_##name##_t)); \ + } \ + static inline void kmp_destroy_##name(kmp_##name##_t *mp) { \ + size_t k; \ + for (k = 0; k < mp->n; ++k) { \ + kmpfree_f(mp->buf[k]); free(mp->buf[k]); \ + } \ + free(mp->buf); free(mp); \ + } \ + static inline kmptype_t *kmp_alloc_##name(kmp_##name##_t *mp) { \ + ++mp->cnt; \ + if (mp->n == 0) return calloc(1, sizeof(kmptype_t)); \ + return mp->buf[--mp->n]; \ + } \ + static inline void kmp_free_##name(kmp_##name##_t *mp, kmptype_t *p) { \ + --mp->cnt; \ + if (mp->n == mp->max) { \ + mp->max = mp->max? mp->max<<1 : 16; \ + mp->buf = realloc(mp->buf, sizeof(void*) * mp->max); \ + } \ + mp->buf[mp->n++] = p; \ + } + +#define kmempool_t(name) kmp_##name##_t +#define kmp_init(name) kmp_init_##name() +#define kmp_destroy(name, mp) kmp_destroy_##name(mp) +#define kmp_alloc(name, mp) kmp_alloc_##name(mp) +#define kmp_free(name, mp, p) kmp_free_##name(mp, p) + +#define KLIST_INIT(name, kltype_t, kmpfree_t) \ + struct __kl1_##name { \ + kltype_t data; \ + struct __kl1_##name *next; \ + }; \ + typedef struct __kl1_##name kl1_##name; \ + KMEMPOOL_INIT(name, kl1_##name, kmpfree_t) \ + typedef struct { \ + kl1_##name *head, *tail; \ + kmp_##name##_t *mp; \ + size_t size; \ + } kl_##name##_t; \ + static inline kl_##name##_t *kl_init_##name() { \ + kl_##name##_t *kl = calloc(1, sizeof(kl_##name##_t)); \ + kl->mp = kmp_init(name); \ + kl->head = kl->tail = kmp_alloc(name, kl->mp); \ + kl->head->next = 0; \ + return kl; \ + } \ + static inline void kl_destroy_##name(kl_##name##_t *kl) { \ + kl1_##name *p; \ + for (p = kl->head; p != 
kl->tail; p = p->next) \ + kmp_free(name, kl->mp, p); \ + kmp_free(name, kl->mp, p); \ + kmp_destroy(name, kl->mp); \ + free(kl); \ + } \ + static inline kltype_t *kl_pushp_##name(kl_##name##_t *kl) { \ + kl1_##name *q, *p = kmp_alloc(name, kl->mp); \ + q = kl->tail; p->next = 0; kl->tail->next = p; kl->tail = p; \ + ++kl->size; \ + return &q->data; \ + } \ + static inline int kl_shift_##name(kl_##name##_t *kl, kltype_t *d) { \ + kl1_##name *p; \ + if (kl->head->next == 0) return -1; \ + --kl->size; \ + p = kl->head; kl->head = kl->head->next; \ + if (d) *d = p->data; \ + kmp_free(name, kl->mp, p); \ + return 0; \ + } + +#define kliter_t(name) kl1_##name +#define klist_t(name) kl_##name##_t +#define kl_val(iter) ((iter)->data) +#define kl_next(iter) ((iter)->next) +#define kl_begin(kl) ((kl)->head) +#define kl_end(kl) ((kl)->tail) + +#define kl_init(name) kl_init_##name() +#define kl_destroy(name, kl) kl_destroy_##name(kl) +#define kl_pushp(name, kl) kl_pushp_##name(kl) +#define kl_shift(name, kl, d) kl_shift_##name(kl, d) + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/knetfile.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,632 @@ +/* The MIT License + + Copyright (c) 2008 by Genome Research Ltd (GRL). + 2010 by Attractive Chaos <attractor@live.co.uk> + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Probably I will not do socket programming in the next few years and + therefore I decide to heavily annotate this file, for Linux and + Windows as well. -ac */ + +#include <time.h> +#include <stdio.h> +#include <ctype.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <sys/types.h> + +#ifndef _WIN32 +#include <netdb.h> +#include <arpa/inet.h> +#include <sys/socket.h> +#endif + +#include "knetfile.h" + +/* In winsock.h, the type of a socket is SOCKET, which is: "typedef + * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed + * integer -1. In knetfile.c, I use "int" for socket type + * throughout. 
This should be improved to avoid confusion. + * + * In Linux/Mac, recv() and read() do almost the same thing. You can see + * in the header file that netread() is simply an alias of read(). In + * Windows, however, they are different and using recv() is mandatory. + */ + +/* This function tests if the file handler is ready for reading (or + * writing if is_read==0). */ +static int socket_wait(int fd, int is_read) +{ + fd_set fds, *fdr = 0, *fdw = 0; + struct timeval tv; + int ret; + tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out + FD_ZERO(&fds); + FD_SET(fd, &fds); + if (is_read) fdr = &fds; + else fdw = &fds; + ret = select(fd+1, fdr, fdw, 0, &tv); +#ifndef _WIN32 + if (ret == -1) perror("select"); +#else + if (ret == 0) + fprintf(stderr, "select time-out\n"); + else if (ret == SOCKET_ERROR) + fprintf(stderr, "select: %d\n", WSAGetLastError()); +#endif + return ret; +} + +#ifndef _WIN32 +/* This function does not work with Windows due to the lack of + * getaddrinfo() in winsock. It is addapted from an example in "Beej's + * Guide to Network Programming" (http://beej.us/guide/bgnet/). */ +static int socket_connect(const char *host, const char *port) +{ +#define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0) + + int on = 1, fd; + struct linger lng = { 0, 0 }; + struct addrinfo hints, *res = 0; + memset(&hints, 0, sizeof(struct addrinfo)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + /* In Unix/Mac, getaddrinfo() is the most convenient way to get + * server information. */ + if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo"); + if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket"); + /* The following two setsockopt() are used by ftplib + * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they + * necessary. 
*/ + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt"); + if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt"); + if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect"); + freeaddrinfo(res); + return fd; +} +#else +/* MinGW's printf has problem with "%lld" */ +char *int64tostr(char *buf, int64_t x) +{ + int cnt; + int i = 0; + do { + buf[i++] = '0' + x % 10; + x /= 10; + } while (x); + buf[i] = 0; + for (cnt = i, i = 0; i < cnt/2; ++i) { + int c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c; + } + return buf; +} + +int64_t strtoint64(const char *buf) +{ + int64_t x; + for (x = 0; *buf != '\0'; ++buf) + x = x * 10 + ((int64_t) *buf - 48); + return x; +} +/* In windows, the first thing is to establish the TCP connection. */ +int knet_win32_init() +{ + WSADATA wsaData; + return WSAStartup(MAKEWORD(2, 2), &wsaData); +} +void knet_win32_destroy() +{ + WSACleanup(); +} +/* A slightly modfied version of the following function also works on + * Mac (and presummably Linux). However, this function is not stable on + * my Mac. It sometimes works fine but sometimes does not. Therefore for + * non-Windows OS, I do not use this one. 
*/ +static SOCKET socket_connect(const char *host, const char *port) +{ +#define __err_connect(func) \ + do { \ + fprintf(stderr, "%s: %d\n", func, WSAGetLastError()); \ + return -1; \ + } while (0) + + int on = 1; + SOCKET fd; + struct linger lng = { 0, 0 }; + struct sockaddr_in server; + struct hostent *hp = 0; + // open socket + if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket"); + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt"); + if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt"); + // get host info + if (isalpha(host[0])) hp = gethostbyname(host); + else { + struct in_addr addr; + addr.s_addr = inet_addr(host); + hp = gethostbyaddr((char*)&addr, 4, AF_INET); + } + if (hp == 0) __err_connect("gethost"); + // connect + server.sin_addr.s_addr = *((unsigned long*)hp->h_addr); + server.sin_family= AF_INET; + server.sin_port = htons(atoi(port)); + if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect"); + // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!) + return fd; +} +#endif + +static off_t my_netread(int fd, void *buf, off_t len) +{ + off_t rest = len, curr, l = 0; + /* recv() and read() may not read the required length of data with + * one call. They have to be called repeatedly. */ + while (rest) { + if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading + curr = netread(fd, buf + l, rest); + /* According to the glibc manual, section 13.2, a zero returned + * value indicates end-of-file (EOF), which should mean that + * read() will not return zero if EOF has not been met but data + * are not immediately available. 
*/ + if (curr == 0) break; + l += curr; rest -= curr; + } + return l; +} + +/************************* + * FTP specific routines * + *************************/ + +static int kftp_get_response(knetFile *ftp) +{ +#ifndef _WIN32 + unsigned char c; +#else + char c; +#endif + int n = 0; + char *p; + if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0; + while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O + //fputc(c, stderr); + if (n >= ftp->max_response) { + ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256; + ftp->response = realloc(ftp->response, ftp->max_response); + } + ftp->response[n++] = c; + if (c == '\n') { + if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2]) + && ftp->response[3] != '-') break; + n = 0; + continue; + } + } + if (n < 2) return -1; + ftp->response[n-2] = 0; + return strtol(ftp->response, &p, 0); +} + +static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get) +{ + if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing + netwrite(ftp->ctrl_fd, cmd, strlen(cmd)); + return is_get? 
kftp_get_response(ftp) : 0; +} + +static int kftp_pasv_prep(knetFile *ftp) +{ + char *p; + int v[6]; + kftp_send_cmd(ftp, "PASV\r\n", 1); + for (p = ftp->response; *p && *p != '('; ++p); + if (*p != '(') return -1; + ++p; + sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]); + memcpy(ftp->pasv_ip, v, 4 * sizeof(int)); + ftp->pasv_port = (v[4]<<8&0xff00) + v[5]; + return 0; +} + + +static int kftp_pasv_connect(knetFile *ftp) +{ + char host[80], port[10]; + if (ftp->pasv_port == 0) { + fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n"); + return -1; + } + sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]); + sprintf(port, "%d", ftp->pasv_port); + ftp->fd = socket_connect(host, port); + if (ftp->fd == -1) return -1; + return 0; +} + +int kftp_connect(knetFile *ftp) +{ + ftp->ctrl_fd = socket_connect(ftp->host, ftp->port); + if (ftp->ctrl_fd == -1) return -1; + kftp_get_response(ftp); + kftp_send_cmd(ftp, "USER anonymous\r\n", 1); + kftp_send_cmd(ftp, "PASS kftp@\r\n", 1); + kftp_send_cmd(ftp, "TYPE I\r\n", 1); + return 0; +} + +int kftp_reconnect(knetFile *ftp) +{ + if (ftp->ctrl_fd != -1) { + netclose(ftp->ctrl_fd); + ftp->ctrl_fd = -1; + } + netclose(ftp->fd); + ftp->fd = -1; + return kftp_connect(ftp); +} + +// initialize ->type, ->host, ->retr and ->size +knetFile *kftp_parse_url(const char *fn, const char *mode) +{ + knetFile *fp; + char *p; + int l; + if (strstr(fn, "ftp://") != fn) return 0; + for (p = (char*)fn + 6; *p && *p != '/'; ++p); + if (*p != '/') return 0; + l = p - fn - 6; + fp = calloc(1, sizeof(knetFile)); + fp->type = KNF_TYPE_FTP; + fp->fd = -1; + /* the Linux/Mac version of socket_connect() also recognizes a port + * like "ftp", but the Windows version does not. 
*/ + fp->port = strdup("21"); + fp->host = calloc(l + 1, 1); + if (strchr(mode, 'c')) fp->no_reconnect = 1; + strncpy(fp->host, fn + 6, l); + fp->retr = calloc(strlen(p) + 8, 1); + sprintf(fp->retr, "RETR %s\r\n", p); + fp->size_cmd = calloc(strlen(p) + 8, 1); + sprintf(fp->size_cmd, "SIZE %s\r\n", p); + fp->seek_offset = 0; + return fp; +} +// place ->fd at offset off +int kftp_connect_file(knetFile *fp) +{ + int ret; + long long file_size; + if (fp->fd != -1) { + netclose(fp->fd); + if (fp->no_reconnect) kftp_get_response(fp); + } + kftp_pasv_prep(fp); + kftp_send_cmd(fp, fp->size_cmd, 1); +#ifndef _WIN32 + if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 ) + { + fprintf(stderr,"[kftp_connect_file] %s\n", fp->response); + return -1; + } +#else + const char *p = fp->response; + while (*p != ' ') ++p; + while (*p < '0' || *p > '9') ++p; + file_size = strtoint64(p); +#endif + fp->file_size = file_size; + if (fp->offset>=0) { + char tmp[32]; +#ifndef _WIN32 + sprintf(tmp, "REST %lld\r\n", (long long)fp->offset); +#else + strcpy(tmp, "REST "); + int64tostr(tmp + 5, fp->offset); + strcat(tmp, "\r\n"); +#endif + kftp_send_cmd(fp, tmp, 1); + } + kftp_send_cmd(fp, fp->retr, 0); + kftp_pasv_connect(fp); + ret = kftp_get_response(fp); + if (ret != 150) { + fprintf(stderr, "[kftp_connect_file] %s\n", fp->response); + netclose(fp->fd); + fp->fd = -1; + return -1; + } + fp->is_ready = 1; + return 0; +} + + +/************************** + * HTTP specific routines * + **************************/ + +knetFile *khttp_parse_url(const char *fn, const char *mode) +{ + knetFile *fp; + char *p, *proxy, *q; + int l; + if (strstr(fn, "http://") != fn) return 0; + // set ->http_host + for (p = (char*)fn + 7; *p && *p != '/'; ++p); + l = p - fn - 7; + fp = calloc(1, sizeof(knetFile)); + fp->http_host = calloc(l + 1, 1); + strncpy(fp->http_host, fn + 7, l); + fp->http_host[l] = 0; + for (q = fp->http_host; *q && *q != ':'; ++q); + if (*q == ':') *q++ = 0; + // get http_proxy + proxy = 
getenv("http_proxy"); + // set ->host, ->port and ->path + if (proxy == 0) { + fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name. + fp->port = strdup(*q? q : "80"); + fp->path = strdup(*p? p : "/"); + } else { + fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy); + for (q = fp->host; *q && *q != ':'; ++q); + if (*q == ':') *q++ = 0; + fp->port = strdup(*q? q : "80"); + fp->path = strdup(fn); + } + fp->type = KNF_TYPE_HTTP; + fp->ctrl_fd = fp->fd = -1; + fp->seek_offset = 0; + return fp; +} + +int khttp_connect_file(knetFile *fp) +{ + int ret, l = 0; + char *buf, *p; + if (fp->fd != -1) netclose(fp->fd); + fp->fd = socket_connect(fp->host, fp->port); + buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough. + l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host); + l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset); + l += sprintf(buf + l, "\r\n"); + netwrite(fp->fd, buf, l); + l = 0; + while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency + if (buf[l] == '\n' && l >= 3) + if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break; + ++l; + } + buf[l] = 0; + if (l < 14) { // prematured header + netclose(fp->fd); + fp->fd = -1; + return -1; + } + ret = strtol(buf + 8, &p, 0); // HTTP return code + if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file + off_t rest = fp->offset; + while (rest) { + off_t l = rest < 0x10000? 
rest : 0x10000; + rest -= my_netread(fp->fd, buf, l); + } + } else if (ret != 206 && ret != 200) { + free(buf); + fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret); + netclose(fp->fd); + fp->fd = -1; + return -1; + } + free(buf); + fp->is_ready = 1; + return 0; +} + +/******************** + * Generic routines * + ********************/ + +knetFile *knet_open(const char *fn, const char *mode) +{ + knetFile *fp = 0; + if (mode[0] != 'r') { + fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n"); + return 0; + } + if (strstr(fn, "ftp://") == fn) { + fp = kftp_parse_url(fn, mode); + if (fp == 0) return 0; + if (kftp_connect(fp) == -1) { + knet_close(fp); + return 0; + } + kftp_connect_file(fp); + } else if (strstr(fn, "http://") == fn) { + fp = khttp_parse_url(fn, mode); + if (fp == 0) return 0; + khttp_connect_file(fp); + } else { // local file +#ifdef _WIN32 + /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may + * be undefined on some systems, although it is defined on my + * Mac and the Linux I have tested on. 
*/ + int fd = open(fn, O_RDONLY | O_BINARY); +#else + int fd = open(fn, O_RDONLY); +#endif + if (fd == -1) { + perror("open"); + return 0; + } + fp = (knetFile*)calloc(1, sizeof(knetFile)); + fp->type = KNF_TYPE_LOCAL; + fp->fd = fd; + fp->ctrl_fd = -1; + } + if (fp && fp->fd == -1) { + knet_close(fp); + return 0; + } + return fp; +} + +knetFile *knet_dopen(int fd, const char *mode) +{ + knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile)); + fp->type = KNF_TYPE_LOCAL; + fp->fd = fd; + return fp; +} + +off_t knet_read(knetFile *fp, void *buf, off_t len) +{ + off_t l = 0; + if (fp->fd == -1) return 0; + if (fp->type == KNF_TYPE_FTP) { + if (fp->is_ready == 0) { + if (!fp->no_reconnect) kftp_reconnect(fp); + kftp_connect_file(fp); + } + } else if (fp->type == KNF_TYPE_HTTP) { + if (fp->is_ready == 0) + khttp_connect_file(fp); + } + if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX + off_t rest = len, curr; + while (rest) { + do { + curr = read(fp->fd, buf + l, rest); + } while (curr < 0 && EINTR == errno); + if (curr < 0) return -1; + if (curr == 0) break; + l += curr; rest -= curr; + } + } else l = my_netread(fp->fd, buf, len); + fp->offset += l; + return l; +} + +off_t knet_seek(knetFile *fp, int64_t off, int whence) +{ + if (whence == SEEK_SET && off == fp->offset) return 0; + if (fp->type == KNF_TYPE_LOCAL) { + /* Be aware that lseek() returns the offset after seeking, + * while fseek() returns zero on success. 
*/ + off_t offset = lseek(fp->fd, off, whence); + if (offset == -1) { + // Be silent, it is OK for knet_seek to fail when the file is streamed + // fprintf(stderr,"[knet_seek] %s\n", strerror(errno)); + return -1; + } + fp->offset = offset; + return 0; + } + else if (fp->type == KNF_TYPE_FTP) + { + if (whence==SEEK_CUR) + fp->offset += off; + else if (whence==SEEK_SET) + fp->offset = off; + else if ( whence==SEEK_END) + fp->offset = fp->file_size+off; + fp->is_ready = 0; + return 0; + } + else if (fp->type == KNF_TYPE_HTTP) + { + if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future? + fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n"); + errno = ESPIPE; + return -1; + } + if (whence==SEEK_CUR) + fp->offset += off; + else if (whence==SEEK_SET) + fp->offset = off; + fp->is_ready = 0; + return 0; + } + errno = EINVAL; + fprintf(stderr,"[knet_seek] %s\n", strerror(errno)); + return -1; +} + +int knet_close(knetFile *fp) +{ + if (fp == 0) return 0; + if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific + if (fp->fd != -1) { + /* On Linux/Mac, netclose() is an alias of close(), but on + * Windows, it is an alias of closesocket(). 
*/ + if (fp->type == KNF_TYPE_LOCAL) close(fp->fd); + else netclose(fp->fd); + } + free(fp->host); free(fp->port); + free(fp->response); free(fp->retr); // FTP specific + free(fp->path); free(fp->http_host); // HTTP specific + free(fp); + return 0; +} + +#ifdef KNETFILE_MAIN +int main(void) +{ + char *buf; + knetFile *fp; + int type = 4, l; +#ifdef _WIN32 + knet_win32_init(); +#endif + buf = calloc(0x100000, 1); + if (type == 0) { + fp = knet_open("knetfile.c", "r"); + knet_seek(fp, 1000, SEEK_SET); + } else if (type == 1) { // NCBI FTP, large file + fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r"); + knet_seek(fp, 2500000000ll, SEEK_SET); + l = knet_read(fp, buf, 255); + } else if (type == 2) { + fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r"); + knet_seek(fp, 1000, SEEK_SET); + } else if (type == 3) { + fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r"); + knet_seek(fp, 1000, SEEK_SET); + } else if (type == 4) { + fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r"); + knet_read(fp, buf, 10000); + knet_seek(fp, 20000, SEEK_SET); + knet_seek(fp, 10000, SEEK_SET); + l = knet_read(fp, buf+10000, 10000000) + 10000; + } + if (type != 4 && type != 1) { + knet_read(fp, buf, 255); + buf[255] = 0; + printf("%s\n", buf); + } else write(fileno(stdout), buf, l); + knet_close(fp); + free(buf); + return 0; +} +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/knetfile.h Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,75 @@ +#ifndef KNETFILE_H +#define KNETFILE_H + +#include <stdint.h> +#include <fcntl.h> + +#ifndef _WIN32 +#define netread(fd, ptr, len) read(fd, ptr, len) +#define netwrite(fd, ptr, len) write(fd, ptr, len) +#define netclose(fd) close(fd) +#else +#include <winsock2.h> +#define netread(fd, ptr, len) recv(fd, ptr, len, 0) +#define netwrite(fd, ptr, len) send(fd, ptr, len, 0) +#define netclose(fd) closesocket(fd) +#endif + +// FIXME: currently I/O is unbuffered + +#define KNF_TYPE_LOCAL 1 +#define KNF_TYPE_FTP 2 +#define KNF_TYPE_HTTP 3 + +typedef struct knetFile_s { + int type, fd; + int64_t offset; + char *host, *port; + + // the following are for FTP only + int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; + char *response, *retr, *size_cmd; + int64_t seek_offset; // for lazy seek + int64_t file_size; + + // the following are for HTTP only + char *path, *http_host; +} knetFile; + +#define knet_tell(fp) ((fp)->offset) +#define knet_fileno(fp) ((fp)->fd) + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _WIN32 + int knet_win32_init(); + void knet_win32_destroy(); +#endif + + knetFile *knet_open(const char *fn, const char *mode); + + /* + This only works with local files. + */ + knetFile *knet_dopen(int fd, const char *mode); + + /* + If ->is_ready==0, this routine updates ->fd; otherwise, it simply + reads from ->fd. + */ + off_t knet_read(knetFile *fp, void *buf, off_t len); + + /* + This routine only sets ->offset and ->is_ready=0. It does not + communicate with the FTP server. + */ + off_t knet_seek(knetFile *fp, int64_t off, int whence); + int knet_close(knetFile *fp); + +#ifdef __cplusplus +} +#endif + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/kprobaln.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,280 @@ +/* The MIT License + + Copyright (c) 2003-2006, 2008-2010, by Heng Li <lh3lh3@live.co.uk> + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <stdint.h> +#include <math.h> +#include "kprobaln.h" + +/***************************************** + * Probabilistic banded glocal alignment * + *****************************************/ + +#define EI .25 +#define EM .33333333333 + +static float g_qual2prob[256]; + +#define set_u(u, b, i, k) { int x=(i)-(b); x=x>0?x:0; (u)=((k)-x+1)*3; } + +kpa_par_t kpa_par_def = { 0.001, 0.1, 10 }; +kpa_par_t kpa_par_alt = { 0.0001, 0.01, 10 }; + +/* + The topology of the profile HMM: + + /\ /\ /\ /\ + I[1] I[k-1] I[k] I[L] + ^ \ \ ^ \ ^ \ \ ^ + | \ \ | \ | \ \ | + M[0] M[1] -> ... -> M[k-1] -> M[k] -> ... 
-> M[L] M[L+1] + \ \/ \/ \/ / + \ /\ /\ /\ / + -> D[k-1] -> D[k] -> + + M[0] points to every {M,I}[k] and every {M,I}[k] points M[L+1]. + + On input, _ref is the reference sequence and _query is the query + sequence. Both are sequences of 0/1/2/3/4 where 4 stands for an + ambiguous residue. iqual is the base quality. c sets the gap open + probability, gap extension probability and band width. + + On output, state and q are arrays of length l_query. The higher 30 + bits give the reference position the query base is matched to and the + lower two bits can be 0 (an alignment match) or 1 (an + insertion). q[i] gives the phred scaled posterior probability of + state[i] being wrong. + */ +int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_query, const uint8_t *iqual, + const kpa_par_t *c, int *state, uint8_t *q) +{ + double **f, **b = 0, *s, m[9], sI, sM, bI, bM, pb; + float *qual, *_qual; + const uint8_t *ref, *query; + int bw, bw2, i, k, is_diff = 0, is_backward = 1, Pr; + + if ( l_ref<=0 || l_query<=0 ) return 0; // FIXME: this may not be an ideal fix, just prevents sefgault + + /*** initialization ***/ + is_backward = state && q? 1 : 0; + ref = _ref - 1; query = _query - 1; // change to 1-based coordinate + bw = l_ref > l_query? 
l_ref : l_query; + if (bw > c->bw) bw = c->bw; + if (bw < abs(l_ref - l_query)) bw = abs(l_ref - l_query); + bw2 = bw * 2 + 1; + // allocate the forward and backward matrices f[][] and b[][] and the scaling array s[] + f = calloc(l_query+1, sizeof(void*)); + if (is_backward) b = calloc(l_query+1, sizeof(void*)); + for (i = 0; i <= l_query; ++i) { // FIXME: this will lead in segfault for l_query==0 + f[i] = calloc(bw2 * 3 + 6, sizeof(double)); // FIXME: this is over-allocated for very short seqs + if (is_backward) b[i] = calloc(bw2 * 3 + 6, sizeof(double)); + } + s = calloc(l_query+2, sizeof(double)); // s[] is the scaling factor to avoid underflow + // initialize qual + _qual = calloc(l_query, sizeof(float)); + if (g_qual2prob[0] == 0) + for (i = 0; i < 256; ++i) + g_qual2prob[i] = pow(10, -i/10.); + for (i = 0; i < l_query; ++i) _qual[i] = g_qual2prob[iqual? iqual[i] : 30]; + qual = _qual - 1; + // initialize transition probability + sM = sI = 1. / (2 * l_query + 2); // the value here seems not to affect results; FIXME: need proof + m[0*3+0] = (1 - c->d - c->d) * (1 - sM); m[0*3+1] = m[0*3+2] = c->d * (1 - sM); + m[1*3+0] = (1 - c->e) * (1 - sI); m[1*3+1] = c->e * (1 - sI); m[1*3+2] = 0.; + m[2*3+0] = 1 - c->e; m[2*3+1] = 0.; m[2*3+2] = c->e; + bM = (1 - c->d) / l_ref; bI = c->d / l_ref; // (bM+bI)*l_ref==1 + /*** forward ***/ + // f[0] + set_u(k, bw, 0, 0); + f[0][k] = s[0] = 1.; + { // f[1] + double *fi = f[1], sum; + int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1, _beg, _end; + for (k = beg, sum = 0.; k <= end; ++k) { + int u; + double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. 
- qual[1] : qual[1] * EM; + set_u(u, bw, 1, k); + fi[u+0] = e * bM; fi[u+1] = EI * bI; + sum += fi[u] + fi[u+1]; + } + // rescale + s[1] = sum; + set_u(_beg, bw, 1, beg); set_u(_end, bw, 1, end); _end += 2; + for (k = _beg; k <= _end; ++k) fi[k] /= sum; + } + // f[2..l_query] + for (i = 2; i <= l_query; ++i) { + double *fi = f[i], *fi1 = f[i-1], sum, qli = qual[i]; + int beg = 1, end = l_ref, x, _beg, _end; + uint8_t qyi = query[i]; + x = i - bw; beg = beg > x? beg : x; // band start + x = i + bw; end = end < x? end : x; // band end + for (k = beg, sum = 0.; k <= end; ++k) { + int u, v11, v01, v10; + double e; + e = (ref[k] > 3 || qyi > 3)? 1. : ref[k] == qyi? 1. - qli : qli * EM; + set_u(u, bw, i, k); set_u(v11, bw, i-1, k-1); set_u(v10, bw, i-1, k); set_u(v01, bw, i, k-1); + fi[u+0] = e * (m[0] * fi1[v11+0] + m[3] * fi1[v11+1] + m[6] * fi1[v11+2]); + fi[u+1] = EI * (m[1] * fi1[v10+0] + m[4] * fi1[v10+1]); + fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2]; + sum += fi[u] + fi[u+1] + fi[u+2]; +// fprintf(stderr, "F (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, fi[u], fi[u+1], fi[u+2]); // DEBUG + } + // rescale + s[i] = sum; + set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2; + for (k = _beg, sum = 1./sum; k <= _end; ++k) fi[k] *= sum; + } + { // f[l_query+1] + double sum; + for (k = 1, sum = 0.; k <= l_ref; ++k) { + int u; + set_u(u, bw, l_query, k); + if (u < 3 || u >= bw2*3+3) continue; + sum += f[l_query][u+0] * sM + f[l_query][u+1] * sI; + } + s[l_query+1] = sum; // the last scaling factor + } + { // compute likelihood + double p = 1., Pr1 = 0.; + for (i = 0; i <= l_query + 1; ++i) { + p *= s[i]; + if (p < 1e-100) Pr1 += -4.343 * log(p), p = 1.; + } + Pr1 += -4.343 * log(p * l_ref * l_query); + Pr = (int)(Pr1 + .499); + if (!is_backward) { // skip backward and MAP + for (i = 0; i <= l_query; ++i) free(f[i]); + free(f); free(s); free(_qual); + return Pr; + } + } + /*** backward ***/ + // b[l_query] (b[l_query+1][0]=1 and thus \tilde{b}[][]=1/s[l_query+1]; this 
is where s[l_query+1] comes from) + for (k = 1; k <= l_ref; ++k) { + int u; + double *bi = b[l_query]; + set_u(u, bw, l_query, k); + if (u < 3 || u >= bw2*3+3) continue; + bi[u+0] = sM / s[l_query] / s[l_query+1]; bi[u+1] = sI / s[l_query] / s[l_query+1]; + } + // b[l_query-1..1] + for (i = l_query - 1; i >= 1; --i) { + int beg = 1, end = l_ref, x, _beg, _end; + double *bi = b[i], *bi1 = b[i+1], y = (i > 1), qli1 = qual[i+1]; + uint8_t qyi1 = query[i+1]; + x = i - bw; beg = beg > x? beg : x; + x = i + bw; end = end < x? end : x; + for (k = end; k >= beg; --k) { + int u, v11, v01, v10; + double e; + set_u(u, bw, i, k); set_u(v11, bw, i+1, k+1); set_u(v10, bw, i+1, k); set_u(v01, bw, i, k+1); + e = (k >= l_ref? 0 : (ref[k+1] > 3 || qyi1 > 3)? 1. : ref[k+1] == qyi1? 1. - qli1 : qli1 * EM) * bi1[v11]; + bi[u+0] = e * m[0] + EI * m[1] * bi1[v10+1] + m[2] * bi[v01+2]; // bi1[v11] has been foled into e. + bi[u+1] = e * m[3] + EI * m[4] * bi1[v10+1]; + bi[u+2] = (e * m[6] + m[8] * bi[v01+2]) * y; +// fprintf(stderr, "B (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, bi[u], bi[u+1], bi[u+2]); // DEBUG + } + // rescale + set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2; + for (k = _beg, y = 1./s[i]; k <= _end; ++k) bi[k] *= y; + } + { // b[0] + int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1; + double sum = 0.; + for (k = end; k >= beg; --k) { + int u; + double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM; + set_u(u, bw, 1, k); + if (u < 3 || u >= bw2*3+3) continue; + sum += e * b[1][u+0] * bM + EI * b[1][u+1] * bI; + } + set_u(k, bw, 0, 0); + pb = b[0][k] = sum / s[0]; // if everything works as is expected, pb == 1.0 + } + is_diff = fabs(pb - 1.) > 1e-7? 1 : 0; + /*** MAP ***/ + for (i = 1; i <= l_query; ++i) { + double sum = 0., *fi = f[i], *bi = b[i], max = 0.; + int beg = 1, end = l_ref, x, max_k = -1; + x = i - bw; beg = beg > x? beg : x; + x = i + bw; end = end < x? 
end : x; + for (k = beg; k <= end; ++k) { + int u; + double z; + set_u(u, bw, i, k); + z = fi[u+0] * bi[u+0]; if (z > max) max = z, max_k = (k-1)<<2 | 0; sum += z; + z = fi[u+1] * bi[u+1]; if (z > max) max = z, max_k = (k-1)<<2 | 1; sum += z; + } + max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0 + if (state) state[i-1] = max_k; + if (q) k = (int)(-4.343 * log(1. - max) + .499), q[i-1] = k > 100? 99 : k; +#ifdef _MAIN + fprintf(stderr, "(%.10lg,%.10lg) (%d,%d:%c,%c:%d) %lg\n", pb, sum, i-1, max_k>>2, + "ACGT"[query[i]], "ACGT"[ref[(max_k>>2)+1]], max_k&3, max); // DEBUG +#endif + } + /*** free ***/ + for (i = 0; i <= l_query; ++i) { + free(f[i]); free(b[i]); + } + free(f); free(b); free(s); free(_qual); + return Pr; +} + +#ifdef _MAIN +#include <unistd.h> +int main(int argc, char *argv[]) +{ + uint8_t conv[256], *iqual, *ref, *query; + int c, l_ref, l_query, i, q = 30, b = 10, P; + while ((c = getopt(argc, argv, "b:q:")) >= 0) { + switch (c) { + case 'b': b = atoi(optarg); break; + case 'q': q = atoi(optarg); break; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: %s [-q %d] [-b %d] <ref> <query>\n", argv[0], q, b); // example: acttc attc + return 1; + } + memset(conv, 4, 256); + conv['a'] = conv['A'] = 0; conv['c'] = conv['C'] = 1; + conv['g'] = conv['G'] = 2; conv['t'] = conv['T'] = 3; + ref = (uint8_t*)argv[optind]; query = (uint8_t*)argv[optind+1]; + l_ref = strlen((char*)ref); l_query = strlen((char*)query); + for (i = 0; i < l_ref; ++i) ref[i] = conv[ref[i]]; + for (i = 0; i < l_query; ++i) query[i] = conv[query[i]]; + iqual = malloc(l_query); + memset(iqual, q, l_query); + kpa_par_def.bw = b; + P = kpa_glocal(ref, l_ref, query, l_query, iqual, &kpa_par_alt, 0, 0); + fprintf(stderr, "%d\n", P); + free(iqual); + return 0; +} +#endif
/* File: PsiCLASS-1.0.2/samtools-0.1.19/kprobaln.h */
/* The MIT License

   Copyright (c) 2003-2006, 2008, 2009 by Heng Li <lh3@live.co.uk>

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

#ifndef LH3_KPROBALN_H_
#define LH3_KPROBALN_H_

#include <stdint.h>

/* Tunable parameters handed to kpa_glocal(). */
typedef struct {
	/* NOTE(review): presumably the gap-open (d) and gap-extension (e)
	 * probabilities, as in the companion HmmGlocal.java -- confirm against
	 * kprobaln.c. */
	float d, e;
	int bw; /* band width of the banded dynamic programming */
} kpa_par_t;

#ifdef __cplusplus
extern "C" {
#endif

	/*
	 * Banded forward(/backward) HMM alignment of a query against a reference.
	 *
	 * _ref/_query hold l_ref/l_query base codes; codes above 3 are treated as
	 * ambiguous.  iqual holds one quality value per query base.
	 *
	 * The return value is the rounded sum of -4.343*log(scale factor) terms
	 * accumulated over the forward pass.
	 *
	 * NOTE(review): the posterior pass appears to run only when state and/or q
	 * is supplied; when it runs, state[i] receives (ref_pos << 2 | hmm_state)
	 * for query base i and q[i] a quality capped at 99.  Confirm against the
	 * head of kpa_glocal() in kprobaln.c.
	 */
	int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_query, const uint8_t *iqual,
				   const kpa_par_t *c, int *state, uint8_t *q);

	/*
	 * Predefined parameter sets.  They are declared inside the extern "C"
	 * region so that C++ translation units refer to them with C language
	 * linkage, matching their definition in a C source file.
	 */
	extern kpa_par_t kpa_par_def, kpa_par_alt;

#ifdef __cplusplus
}
#endif

#endif
/* File: PsiCLASS-1.0.2/samtools-0.1.19/kseq.h */
/* The MIT License

   Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

/* Last Modified: 05MAR2012 */

/*
 * Macro-generated buffered stream reader (kstream_t) and FASTA/FASTQ parser
 * (kseq_t).  Instantiate with KSEQ_INIT(type_t, __read), where __read is
 * called as __read(handle, buffer, size) and returns the number of bytes
 * placed into the buffer.  A return value <= 0 is treated as end of input.
 */

#ifndef AC_KSEQ_H
#define AC_KSEQ_H

#include <ctype.h>
#include <string.h>
#include <stdlib.h>

#define KS_SEP_SPACE 0 /* isspace(): \t, \n, \v, \f, \r */
#define KS_SEP_TAB   1 /* isspace() && !' ' */
#define KS_SEP_LINE  2 /* line separator: "\n" (Unix) or "\r\n" (Windows) */
#define KS_SEP_MAX   2

/* The stream state: buf[begin..end) holds the bytes not yet consumed. */
#define __KS_TYPE(type_t) \
	typedef struct __kstream_t { \
		unsigned char *buf; \
		int begin, end, is_eof; \
		type_t f; \
	} kstream_t;

#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)

/* ks_init()/ks_destroy(): allocate and release a stream and its buffer. */
#define __KS_BASIC(type_t, __bufsize) \
	static inline kstream_t *ks_init(type_t f) \
	{ \
		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
		ks->f = f; \
		ks->buf = (unsigned char*)malloc(__bufsize); \
		return ks; \
	} \
	static inline void ks_destroy(kstream_t *ks) \
	{ \
		if (ks) { \
			free(ks->buf); \
			free(ks); \
		} \
	}

/*
 * ks_getc(): return the next byte, or -1 when no more input is available.
 * A short read marks the stream as finished.  A zero or negative read (an
 * I/O error reported by __read) empties the buffer and yields -1 instead of
 * handing back a stale byte.
 */
#define __KS_GETC(__read, __bufsize) \
	static inline int ks_getc(kstream_t *ks) \
	{ \
		if (ks->is_eof && ks->begin >= ks->end) return -1; \
		if (ks->begin >= ks->end) { \
			ks->begin = 0; \
			ks->end = __read(ks->f, ks->buf, __bufsize); \
			if (ks->end < __bufsize) ks->is_eof = 1; \
			if (ks->end <= 0) { /* EOF or read error: nothing buffered */ \
				ks->end = 0; \
				return -1; \
			} \
		} \
		return (int)ks->buf[ks->begin++]; \
	}

#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
	size_t l, m;
	char *s;
} kstring_t;
#endif

#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif

/*
 * ks_getuntil2(): read bytes into str until the delimiter is met.
 *   delimiter  one of KS_SEP_SPACE/KS_SEP_TAB/KS_SEP_LINE, or a literal
 *              character code greater than KS_SEP_MAX
 *   dret       if non-NULL, receives the delimiter byte that ended the read
 *              (0 when the input ran out first)
 *   append     non-zero keeps the current contents of str and appends
 * Returns the resulting length of str, or -1 if the stream was already
 * exhausted on entry.  ks_getuntil() is the non-appending shorthand.
 */
#define __KS_GETUNTIL(__read, __bufsize) \
	static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
	{ \
		if (dret) *dret = 0; \
		str->l = append? str->l : 0; \
		if (ks->begin >= ks->end && ks->is_eof) return -1; \
		for (;;) { \
			int i; \
			if (ks->begin >= ks->end) { \
				if (!ks->is_eof) { \
					ks->begin = 0; \
					ks->end = __read(ks->f, ks->buf, __bufsize); \
					if (ks->end < __bufsize) ks->is_eof = 1; \
					if (ks->end <= 0) { /* EOF or read error */ \
						ks->end = 0; \
						break; \
					} \
				} else break; \
			} \
			if (delimiter == KS_SEP_LINE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == '\n') break; \
			} else if (delimiter > KS_SEP_MAX) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == delimiter) break; \
			} else if (delimiter == KS_SEP_SPACE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i])) break; \
			} else if (delimiter == KS_SEP_TAB) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
			} else i = 0; /* never come to here! */ \
			if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
				str->m = str->l + (i - ks->begin) + 1; \
				kroundup32(str->m); \
				str->s = (char*)realloc(str->s, str->m); \
			} \
			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
			str->l = str->l + (i - ks->begin); \
			ks->begin = i + 1; \
			if (i < ks->end) { \
				if (dret) *dret = ks->buf[i]; \
				break; \
			} \
		} \
		if (str->s == 0) { \
			str->m = 1; \
			str->s = (char*)calloc(1, 1); \
		} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
		str->s[str->l] = '\0'; \
		return str->l; \
	} \
	static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
	{ return ks_getuntil2(ks, delimiter, str, dret, 0); }

#define KSTREAM_INIT(type_t, __read, __bufsize) \
	__KS_TYPE(type_t) \
	__KS_BASIC(type_t, __bufsize) \
	__KS_GETC(__read, __bufsize) \
	__KS_GETUNTIL(__read, __bufsize)

#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)

/* kseq_init()/kseq_destroy(): allocate and release a sequence reader. */
#define __KSEQ_BASIC(SCOPE, type_t) \
	SCOPE kseq_t *kseq_init(type_t fd) \
	{ \
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
		s->f = ks_init(fd); \
		return s; \
	} \
	SCOPE void kseq_destroy(kseq_t *ks) \
	{ \
		if (!ks) return; \
		free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
		ks_destroy(ks->f); \
		free(ks); \
	}

/* Return value:
   >=0  length of the sequence (normal)
   -1   end-of-file
   -2   truncated quality string
 */
#define __KSEQ_READ(SCOPE) \
	SCOPE int kseq_read(kseq_t *seq) \
	{ \
		int c; \
		kstream_t *ks = seq->f; \
		if (seq->last_char == 0) { /* then jump to the next header line */ \
			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
			if (c == -1) return -1; /* end of file */ \
			seq->last_char = c; \
		} /* else: the first header char has been read in the previous call */ \
		seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \
		if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
		if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
			seq->seq.m = 256; \
			seq->seq.s = (char*)malloc(seq->seq.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
			if (c == '\n') continue; /* skip empty lines */ \
			seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
			ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
		} \
		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
		if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
			seq->seq.m = seq->seq.l + 2; \
			kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \
			seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
		} \
		seq->seq.s[seq->seq.l] = 0;	/* null terminated string */ \
		if (c != '+') return seq->seq.l; /* FASTA */ \
		if (seq->qual.m < seq->seq.m) {	/* allocate memory for qual in case insufficient */ \
			seq->qual.m = seq->seq.m; \
			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
		if (c == -1) return -2; /* error: no quality string */ \
		while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \
		seq->last_char = 0;	/* we have not come to the next header line */ \
		if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
		return seq->seq.l; \
	}

/* One parsed record plus the reader state carried between kseq_read() calls. */
#define __KSEQ_TYPE(type_t) \
	typedef struct { \
		kstring_t name, comment, seq, qual; \
		int last_char; \
		kstream_t *f; \
	} kseq_t;

#define KSEQ_INIT2(SCOPE, type_t, __read) \
	KSTREAM_INIT(type_t, __read, 16384) \
	__KSEQ_TYPE(type_t) \
	__KSEQ_BASIC(SCOPE, type_t) \
	__KSEQ_READ(SCOPE)

#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)

#define KSEQ_DECLARE(type_t) \
	__KS_TYPE(type_t) \
	__KSEQ_TYPE(type_t) \
	extern kseq_t *kseq_init(type_t fd); \
	void kseq_destroy(kseq_t *ks); \
	int kseq_read(kseq_t *seq);

#endif
/* File: PsiCLASS-1.0.2/samtools-0.1.19/ksort.h */
/* The MIT License

   Copyright (c) 2008 Genome Research Ltd (GRL).

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

/* Contact: Heng Li <lh3@sanger.ac.uk> */

/*
  2012-12-11 (0.1.4):

    * Defined __ks_insertsort_##name as static to compile with C99.

  2008-11-16 (0.1.4):

    * Fixed a bug in introsort() that happens in rare cases.

  2008-11-05 (0.1.3):

    * Fixed a bug in introsort() for complex comparisons.

	* Fixed a bug in mergesort(). The previous version is not stable.

  2008-09-15 (0.1.2):

	* Accelerated introsort. On my Mac (not on another Linux machine),
	  my implementation is as fast as std::sort on random input.

	* Added combsort and in introsort, switch to combsort if the
	  recursion is too deep.

  2008-09-13 (0.1.1):

	* Added k-small algorithm

  2008-09-05 (0.1.0):

	* Initial version

*/

/*
 * Macro-generated sorting routines.  KSORT_INIT(name, type_t, __sort_lt)
 * defines, for element type type_t and strict "less than" predicate
 * __sort_lt(a, b):
 *
 *   ks_mergesort_<name>(n, array, temp)  merge sort; temp is scratch space
 *                                        for n elements, or NULL to have it
 *                                        allocated internally
 *   ks_heapadjust_<name>(i, n, l)        sift l[i] down in a max-heap of size n
 *   ks_heapmake_<name>(n, l)             turn l[0..n) into a max-heap
 *   ks_heapsort_<name>(n, l)             sort an array that is already a heap
 *   ks_combsort_<name>(n, a)             comb sort finished by insertion sort
 *   ks_introsort_<name>(n, a)            quicksort falling back to comb sort
 *   ks_ksmall_<name>(n, arr, kk)         kk-th smallest (0 <= kk < n); reorders arr
 *   ks_shuffle_<name>(n, a)              random permutation driven by drand48()
 */

#ifndef AC_KSORT_H
#define AC_KSORT_H

#include <stdlib.h>
#include <string.h>

/* One pending sub-range of ks_introsort together with its depth budget. */
typedef struct {
	void *left, *right;
	int depth;
} ks_isort_stack_t;

/* No `register` here: the specifier is ill-formed from C++17 on. */
#define KSORT_SWAP(type_t, a, b) { type_t t=(a); (a)=(b); (b)=t; }

#define KSORT_INIT(name, type_t, __sort_lt) \
	void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \
	{ \
		type_t *a2[2], *a, *b; \
		int curr, shift; \
 \
		a2[0] = array; \
		a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n); \
		for (curr = 0, shift = 0; ((size_t)1<<shift) < n; ++shift) { \
			a = a2[curr]; b = a2[1-curr]; \
			if (shift == 0) { \
				type_t *p = b, *i, *eb = a + n; \
				for (i = a; i < eb; i += 2) { \
					if (i == eb - 1) *p++ = *i; \
					else { \
						if (__sort_lt(*(i+1), *i)) { \
							*p++ = *(i+1); *p++ = *i; \
						} else { \
							*p++ = *i; *p++ = *(i+1); \
						} \
					} \
				} \
			} else { \
				size_t i, step = (size_t)1<<shift; \
				for (i = 0; i < n; i += step<<1) { \
					type_t *p, *j, *k, *ea, *eb; \
					if (n < i + step) { \
						ea = a + n; eb = a; \
					} else { \
						ea = a + i + step; \
						eb = a + (n < i + (step<<1)? n : i + (step<<1)); \
					} \
					j = a + i; k = a + i + step; p = b + i; \
					while (j < ea && k < eb) { \
						if (__sort_lt(*k, *j)) *p++ = *k++; \
						else *p++ = *j++; \
					} \
					while (j < ea) *p++ = *j++; \
					while (k < eb) *p++ = *k++; \
				} \
			} \
			curr = 1 - curr; \
		} \
		if (curr == 1) { /* the sorted run ended up in the scratch buffer */ \
			type_t *p = a2[0], *i = a2[1], *eb = array + n; \
			for (; p < eb; ++i) *p++ = *i; \
		} \
		if (temp == 0) free(a2[1]); \
	} \
	void ks_heapadjust_##name(size_t i, size_t n, type_t l[]) \
	{ \
		size_t k = i; \
		type_t tmp = l[i]; \
		while ((k = (k << 1) + 1) < n) { \
			if (k != n - 1 && __sort_lt(l[k], l[k+1])) ++k; \
			if (__sort_lt(l[k], tmp)) break; \
			l[i] = l[k]; i = k; \
		} \
		l[i] = tmp; \
	} \
	void ks_heapmake_##name(size_t lsize, type_t l[]) \
	{ \
		size_t i; \
		for (i = (lsize >> 1) - 1; i != (size_t)(-1); --i) \
			ks_heapadjust_##name(i, lsize, l); \
	} \
	void ks_heapsort_##name(size_t lsize, type_t l[]) \
	{ \
		size_t i; \
		if (lsize < 2) return; /* also keeps lsize - 1 from wrapping around at 0 */ \
		for (i = lsize - 1; i > 0; --i) { \
			type_t tmp; \
			tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \
		} \
	} \
	static inline void __ks_insertsort_##name(type_t *s, type_t *t) \
	{ \
		type_t *i, *j, swap_tmp; \
		for (i = s + 1; i < t; ++i) \
			for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \
				swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \
			} \
	} \
	void ks_combsort_##name(size_t n, type_t a[]) \
	{ \
		const double shrink_factor = 1.2473309501039786540366528676643; \
		int do_swap; \
		size_t gap = n; \
		type_t tmp, *i, *j; \
		do { \
			if (gap > 2) { \
				gap = (size_t)(gap / shrink_factor); \
				if (gap == 9 || gap == 10) gap = 11; \
			} \
			do_swap = 0; \
			for (i = a; i < a + n - gap; ++i) { \
				j = i + gap; \
				if (__sort_lt(*j, *i)) { \
					tmp = *i; *i = *j; *j = tmp; \
					do_swap = 1; \
				} \
			} \
		} while (do_swap || gap > 2); \
		if (gap != 1) __ks_insertsort_##name(a, a + n); \
	} \
	void ks_introsort_##name(size_t n, type_t a[]) \
	{ \
		int d; \
		ks_isort_stack_t *top, *stack; \
		type_t rp, swap_tmp; \
		type_t *s, *t, *i, *j, *k; \
 \
		if (n < 1) return; \
		else if (n == 2) { \
			if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \
			return; \
		} \
		for (d = 2; ((size_t)1<<d) < n; ++d); \
		stack = (ks_isort_stack_t*)malloc(sizeof(ks_isort_stack_t) * ((sizeof(size_t)*d)+2)); \
		top = stack; s = a; t = a + (n-1); d <<= 1; \
		while (1) { \
			if (s < t) { \
				if (--d == 0) { /* depth budget used up: finish this range with comb sort */ \
					ks_combsort_##name(t - s + 1, s); \
					t = s; \
					continue; \
				} \
				i = s; j = t; k = i + ((j-i)>>1) + 1; \
				if (__sort_lt(*k, *i)) { \
					if (__sort_lt(*k, *j)) k = j; \
				} else k = __sort_lt(*j, *i)? i : j; \
				rp = *k; \
				if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \
				for (;;) { \
					do ++i; while (__sort_lt(*i, rp)); \
					do --j; while (i <= j && __sort_lt(rp, *j)); \
					if (j <= i) break; \
					swap_tmp = *i; *i = *j; *j = swap_tmp; \
				} \
				swap_tmp = *i; *i = *t; *t = swap_tmp; \
				if (i-s > t-i) { \
					if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \
					s = t-i > 16? i+1 : t; \
				} else { \
					if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \
					t = i-s > 16? i-1 : s; \
				} \
			} else { \
				if (top == stack) { /* ranges of <= 17 elements were skipped above */ \
					free(stack); \
					__ks_insertsort_##name(a, a+n); \
					return; \
				} else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \
			} \
		} \
	} \
	/* This function is adapted from: http://ndevilla.free.fr/median/ */ \
	/* 0 <= kk < n */ \
	type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \
	{ \
		type_t *low, *high, *k, *ll, *hh, *mid; \
		low = arr; high = arr + n - 1; k = arr + kk; \
		for (;;) { \
			if (high <= low) return *k; \
			if (high == low + 1) { \
				if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
				return *k; \
			} \
			mid = low + (high - low) / 2; \
			if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \
			if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
			if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \
			KSORT_SWAP(type_t, *mid, *(low+1)); \
			ll = low + 1; hh = high; \
			for (;;) { \
				do ++ll; while (__sort_lt(*ll, *low)); \
				do --hh; while (__sort_lt(*low, *hh)); \
				if (hh < ll) break; \
				KSORT_SWAP(type_t, *ll, *hh); \
			} \
			KSORT_SWAP(type_t, *low, *hh); \
			if (hh <= k) low = ll; \
			if (hh >= k) high = hh - 1; \
		} \
	} \
	void ks_shuffle_##name(size_t n, type_t a[]) \
	{ \
		size_t i, j; /* size_t so that n is not truncated to int */ \
		for (i = n; i > 1; --i) { \
			type_t tmp; \
			j = (size_t)(drand48() * i); \
			tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; \
		} \
	}

#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t)
#define ks_introsort(name, n, a) ks_introsort_##name(n, a)
#define ks_combsort(name, n, a) ks_combsort_##name(n, a)
#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a)
#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a)
#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a)
#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)
#define ks_shuffle(name, n, a) ks_shuffle_##name(n, a)

#define ks_lt_generic(a, b) ((a) < (b))
#define ks_lt_str(a, b) (strcmp((a), (b)) < 0)

typedef const char *ksstr_t;

#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic)
#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str)

#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/kstring.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,212 @@ +#include <stdarg.h> +#include <stdio.h> +#include <ctype.h> +#include <string.h> +#include <stdint.h> +#include "kstring.h" + +int ksprintf(kstring_t *s, const char *fmt, ...) +{ + va_list ap; + int l; + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); // This line does not work with glibc 2.0. See `man snprintf'. + va_end(ap); + if (l + 1 > s->m - s->l) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + va_start(ap, fmt); + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); + } + va_end(ap); + s->l += l; + return l; +} + +char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux) +{ + const char *p, *start; + if (sep) { // set up the table + if (str == 0 && (aux->tab[0]&1)) return 0; // no need to set up if we have finished + aux->finished = 0; + if (sep[1]) { + aux->sep = -1; + aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0; + for (p = sep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f); + } else aux->sep = sep[0]; + } + if (aux->finished) return 0; + else if (str) aux->p = str - 1, aux->finished = 0; + if (aux->sep < 0) { + for (p = start = aux->p + 1; *p; ++p) + if (aux->tab[*p>>6]>>(*p&0x3f)&1) break; + } else { + for (p = start = aux->p + 1; *p; ++p) + if (*p == aux->sep) break; + } + aux->p = p; // end of token + if (*p == 0) aux->finished = 1; // no more tokens + return (char*)start; +} + +// s MUST BE a null terminated string; l = strlen(s) +int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) +{ + int i, n, max, last_char, last_start, *offsets, l; + n = 0; max = *_max; offsets = *_offsets; + l = strlen(s); + +#define __ksplit_aux do { \ + if (_offsets) { \ + s[i] = 0; \ + if (n == max) { \ + max = max? 
max<<1 : 2; \ + offsets = (int*)realloc(offsets, sizeof(int) * max); \ + } \ + offsets[n++] = last_start; \ + } else ++n; \ + } while (0) + + for (i = 0, last_char = last_start = 0; i <= l; ++i) { + if (delimiter == 0) { + if (isspace(s[i]) || s[i] == 0) { + if (isgraph(last_char)) __ksplit_aux; // the end of a field + } else { + if (isspace(last_char) || last_char == 0) last_start = i; + } + } else { + if (s[i] == delimiter || s[i] == 0) { + if (last_char != 0 && last_char != delimiter) __ksplit_aux; // the end of a field + } else { + if (last_char == delimiter || last_char == 0) last_start = i; + } + } + last_char = s[i]; + } + *_max = max; *_offsets = offsets; + return n; +} + +/********************** + * Boyer-Moore search * + **********************/ + +typedef unsigned char ubyte_t; + +// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html +static int *ksBM_prep(const ubyte_t *pat, int m) +{ + int i, *suff, *prep, *bmGs, *bmBc; + prep = (int*)calloc(m + 256, sizeof(int)); + bmGs = prep; bmBc = prep + m; + { // preBmBc() + for (i = 0; i < 256; ++i) bmBc[i] = m; + for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1; + } + suff = (int*)calloc(m, sizeof(int)); + { // suffixes() + int f = 0, g; + suff[m - 1] = m; + g = m - 1; + for (i = m - 2; i >= 0; --i) { + if (i > g && suff[i + m - 1 - f] < i - g) + suff[i] = suff[i + m - 1 - f]; + else { + if (i < g) g = i; + f = i; + while (g >= 0 && pat[g] == pat[g + m - 1 - f]) --g; + suff[i] = f - g; + } + } + } + { // preBmGs() + int j = 0; + for (i = 0; i < m; ++i) bmGs[i] = m; + for (i = m - 1; i >= 0; --i) + if (suff[i] == i + 1) + for (; j < m - 1 - i; ++j) + if (bmGs[j] == m) + bmGs[j] = m - 1 - i; + for (i = 0; i <= m - 2; ++i) + bmGs[m - 1 - suff[i]] = m - 1 - i; + } + free(suff); + return prep; +} + +void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep) +{ + int i, j, *prep = 0, *bmGs, *bmBc; + const ubyte_t *str, *pat; + str = (const ubyte_t*)_str; pat = (const ubyte_t*)_pat; 
+ prep = (_prep == 0 || *_prep == 0)? ksBM_prep(pat, m) : *_prep; + if (_prep && *_prep == 0) *_prep = prep; + bmGs = prep; bmBc = prep + m; + j = 0; + while (j <= n - m) { + for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i); + if (i >= 0) { + int max = bmBc[str[i+j]] - m + 1 + i; + if (max < bmGs[i]) max = bmGs[i]; + j += max; + } else return (void*)(str + j); + } + if (_prep == 0) free(prep); + return 0; +} + +char *kstrstr(const char *str, const char *pat, int **_prep) +{ + return (char*)kmemmem(str, strlen(str), pat, strlen(pat), _prep); +} + +char *kstrnstr(const char *str, const char *pat, int n, int **_prep) +{ + return (char*)kmemmem(str, n, pat, strlen(pat), _prep); +} + +/*********************** + * The main() function * + ***********************/ + +#ifdef KSTRING_MAIN +#include <stdio.h> +int main() +{ + kstring_t *s; + int *fields, n, i; + ks_tokaux_t aux; + char *p; + s = (kstring_t*)calloc(1, sizeof(kstring_t)); + // test ksprintf() + ksprintf(s, " abcdefg: %d ", 100); + printf("'%s'\n", s->s); + // test ksplit() + fields = ksplit(s, 0, &n); + for (i = 0; i < n; ++i) + printf("field[%d] = '%s'\n", i, s->s + fields[i]); + // test kstrtok() + s->l = 0; + for (p = kstrtok("ab:cde:fg/hij::k", ":/", &aux); p; p = kstrtok(0, 0, &aux)) { + kputsn(p, aux.p - p, s); + kputc('\n', s); + } + printf("%s", s->s); + // free + free(s->s); free(s); free(fields); + + { + static char *str = "abcdefgcdgcagtcakcdcd"; + static char *pat = "cd"; + char *ret, *s = str; + int *prep = 0; + while ((ret = kstrstr(s, pat, &prep)) != 0) { + printf("match: %s\n", ret); + s = ret + prep[0]; + } + free(prep); + } + return 0; +} +#endif
/* File: PsiCLASS-1.0.2/samtools-0.1.19/kstring.h */
/* The MIT License

   Copyright (c) by Attractive Chaos <attractor@live.co.uk>

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

#ifndef KSTRING_H
#define KSTRING_H

#include <stdlib.h>
#include <string.h>
#include <stdint.h>

/* Round x up, in place, to the next power of two (32-bit bit-smearing). */
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif

/* Growable string: l is the current length, m the allocated size of s. */
#ifndef KSTRING_T
#define KSTRING_T kstring_t
typedef struct __kstring_t {
	size_t l, m;
	char *s;
} kstring_t;
#endif

/* Iteration state of kstrtok(). */
typedef struct {
	uint64_t tab[4];
	int sep, finished;
	const char *p; // end of the current token
} ks_tokaux_t;

#ifdef __cplusplus
extern "C" {
#endif

	int ksprintf(kstring_t *s, const char *fmt, ...);
	int ksplit_core(char *s, int delimiter, int *_max, int **_offsets);
	char *kstrstr(const char *str, const char *pat, int **_prep);
	char *kstrnstr(const char *str, const char *pat, int n, int **_prep);
	void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep);

	/* kstrtok() is similar to strtok_r() except that str is not
	 * modified and both str and sep can be NULL. For efficiency, it is
	 * actually recommended to set both to NULL in the subsequent calls
	 * if sep is not changed. */
	char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux);

#ifdef __cplusplus
}
#endif

/* Make sure at least `size` bytes are allocated; never shrinks. */
static inline void ks_resize(kstring_t *s, size_t size)
{
	if (s->m < size) {
		s->m = size;
		kroundup32(s->m);
		s->s = (char*)realloc(s->s, s->m);
	}
}

/* Append the first l bytes of p and NUL-terminate.  Returns l. */
static inline int kputsn(const char *p, int l, kstring_t *s)
{
	if (s->l + l + 1 >= s->m) {
		s->m = s->l + l + 2;
		kroundup32(s->m);
		s->s = (char*)realloc(s->s, s->m);
	}
	memcpy(s->s + s->l, p, l);
	s->l += l;
	s->s[s->l] = 0;
	return l;
}

/* Append the NUL-terminated string p.  Returns its length. */
static inline int kputs(const char *p, kstring_t *s)
{
	return kputsn(p, strlen(p), s);
}

/* Append one character and NUL-terminate.  Returns c. */
static inline int kputc(int c, kstring_t *s)
{
	if (s->l + 1 >= s->m) {
		s->m = s->l + 2;
		kroundup32(s->m);
		s->s = (char*)realloc(s->s, s->m);
	}
	s->s[s->l++] = c;
	s->s[s->l] = 0;
	return c;
}

/*
 * Append the decimal form of a signed int.  Returns 0, except for c == 0
 * where the result of kputc('0', s) is passed through.
 */
static inline int kputw(int c, kstring_t *s)
{
	char buf[16];
	int l, x;
	if (c == 0) return kputc('0', s);
	/* negative values are converted digit by digit without negating c, so
	 * INT_MIN is handled as well */
	if(c < 0) for (l = 0, x = c; x < 0; x /= 10) buf[l++] = '0' - (x%10);
	else for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0';
	if (c < 0) buf[l++] = '-';
	if (s->l + l + 1 >= s->m) {
		s->m = s->l + l + 2;
		kroundup32(s->m);
		s->s = (char*)realloc(s->s, s->m);
	}
	for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x];
	s->s[s->l] = 0;
	return 0;
}

/* Append the decimal form of an unsigned int.  Return value as for kputw(). */
static inline int kputuw(unsigned c, kstring_t *s)
{
	char buf[16];
	int l, i;
	unsigned x;
	if (c == 0) return kputc('0', s);
	for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0';
	if (s->l + l + 1 >= s->m) {
		s->m = s->l + l + 2;
		kroundup32(s->m);
		s->s = (char*)realloc(s->s, s->m);
	}
	for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
	s->s[s->l] = 0;
	return 0;
}

/* Append the decimal form of a long.  Return value as for kputw(). */
static inline int kputl(long c, kstring_t *s)
{
	char buf[32];
	int l = 0, i;
	unsigned long x;
	if (c == 0) return kputc('0', s);
	/* take the magnitude in unsigned arithmetic: -c overflows for LONG_MIN */
	x = c < 0? 0UL - (unsigned long)c : (unsigned long)c;
	for (; x > 0; x /= 10) buf[l++] = (char)(x%10 + '0');
	if (c < 0) buf[l++] = '-';
	if (s->l + l + 1 >= s->m) {
		s->m = s->l + l + 2;
		kroundup32(s->m);
		s->s = (char*)realloc(s->s, s->m);
	}
	for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i];
	s->s[s->l] = 0;
	return 0;
}

/*
 * Split s->s in place with ksplit_core().  Stores the number of fields in *n
 * and returns the array of field start offsets, which the caller frees.
 */
static inline int *ksplit(kstring_t *s, int delimiter, int *n)
{
	int max = 0, *offsets = 0;
	*n = ksplit_core(s->s, delimiter, &max, &offsets);
	return offsets;
}

#endif
// PsiCLASS-1.0.2/samtools-0.1.19/misc/HmmGlocal.java
import java.io.*;
import java.lang.*;

/**
 * Banded forward-backward computation over a three-state HMM aligning a query
 * against a reference (the "glocal" HMM).
 *
 * For every query position the posterior-maximising (reference position, state)
 * pair is reported together with a Phred-scaled confidence.
 *
 * Bases are given as small integers; any value greater than 3 is treated as
 * ambiguous (emission probability 1).
 */
public class HmmGlocal
{
    /** qual2prob[q] = 10^(-q/10): error probability for Phred quality q (0..255). */
    private double[] qual2prob;
    private double cd, ce; // gap open probability [1e-3], gap extension probability [0.1]
    private int cb; // band width [7]

    /**
     * @param d gap open probability
     * @param e gap extension probability
     * @param b maximum band width
     */
    public HmmGlocal(final double d, final double e, final int b) {
        cd = d; ce = e; cb = b;
        qual2prob = new double[256];
        for (int i = 0; i < 256; ++i)
            qual2prob[i] = Math.pow(10, -i/10.);
    }

    /**
     * Maps (query row i, reference column k) to the offset of the first of the
     * three state cells inside a banded row; b is the band width.
     */
    private static int set_u(final int b, final int i, final int k) {
        int x = i - b;
        x = x > 0? x : 0;
        return (k + 1 - x) * 3;
    }

    /**
     * Runs the scaled forward and backward passes and extracts, per query
     * position, the cell with the largest posterior.
     *
     * @param _ref    reference bases (0-based)
     * @param _query  query bases (0-based)
     * @param _iqual  Phred base qualities of the query, interpreted as unsigned bytes
     * @param state   if non-null, state[i] receives (refPos &lt;&lt; 2 | stateType) for query
     *                position i, or -1 when no cell had positive posterior; must hold
     *                at least _query.length entries
     * @param q       if non-null, q[i] receives the Phred-scaled posterior confidence;
     *                must hold at least _query.length entries
     * @return always 0
     */
    public int hmm_glocal(final byte[] _ref, final byte[] _query, final byte[] _iqual, int[] state, byte[] q) {
        int i, k;
        /*** initialization ***/
        // change coordinates (switch to 1-based arrays)
        int l_ref = _ref.length;
        byte[] ref = new byte[l_ref+1];
        for (i = 0; i < l_ref; ++i) ref[i+1] = _ref[i]; // FIXME: this is silly...
        int l_query = _query.length;
        byte[] query = new byte[l_query+1];
        double[] qual = new double[l_query+1];
        for (i = 0; i < l_query; ++i) {
            query[i+1] = _query[i];
            // Java bytes are signed: mask so that qualities 128..255 index the
            // 256-entry table instead of throwing ArrayIndexOutOfBoundsException.
            qual[i+1] = qual2prob[_iqual[i] & 0xff];
        }
        // set band width
        int bw2, bw = l_ref > l_query? l_ref : l_query;
        if (bw > cb) bw = cb;
        if (bw < Math.abs(l_ref - l_query)) bw = Math.abs(l_ref - l_query);
        bw2 = bw * 2 + 1;
        // allocate the forward and backward matrices f[][] and b[][] and the scaling array s[]
        double[][] f = new double[l_query+1][bw2*3 + 6];
        double[][] b = new double[l_query+1][bw2*3 + 6];
        double[] s = new double[l_query+2];
        // initialize transition probabilities
        double sM, sI, bM, bI;
        sM = sI = 1. / (2 * l_query + 2);
        bM = (1 - cd) / l_query; bI = cd / l_query; // (bM+bI)*l_query==1
        // m[from*3 + to]: 3x3 transition matrix between the three states
        double[] m = new double[9];
        m[0*3+0] = (1 - cd - cd) * (1 - sM); m[0*3+1] = m[0*3+2] = cd * (1 - sM);
        m[1*3+0] = (1 - ce) * (1 - sI); m[1*3+1] = ce * (1 - sI); m[1*3+2] = 0.;
        m[2*3+0] = 1 - ce; m[2*3+1] = 0.; m[2*3+2] = ce;
        /*** forward ***/
        // f[0]
        f[0][set_u(bw, 0, 0)] = s[0] = 1.;
        { // f[1]
            double[] fi = f[1];
            double sum;
            int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1, _beg, _end;
            for (k = beg, sum = 0.; k <= end; ++k) {
                int u;
                double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] / 3.;
                u = set_u(bw, 1, k);
                fi[u+0] = e * bM; fi[u+1] = .25 * bI;
                sum += fi[u] + fi[u+1];
            }
            // rescale
            s[1] = sum;
            _beg = set_u(bw, 1, beg); _end = set_u(bw, 1, end); _end += 2;
            for (k = _beg; k <= _end; ++k) fi[k] /= sum;
        }
        // f[2..l_query]
        for (i = 2; i <= l_query; ++i) {
            double[] fi = f[i], fi1 = f[i-1];
            double sum, qli = qual[i];
            int beg = 1, end = l_ref, x, _beg, _end;
            byte qyi = query[i];
            x = i - bw; beg = beg > x? beg : x; // band start
            x = i + bw; end = end < x? end : x; // band end
            for (k = beg, sum = 0.; k <= end; ++k) {
                int u, v11, v01, v10;
                double e;
                e = (ref[k] > 3 || qyi > 3)? 1. : ref[k] == qyi? 1. - qli : qli / 3.;
                u = set_u(bw, i, k); v11 = set_u(bw, i-1, k-1); v10 = set_u(bw, i-1, k); v01 = set_u(bw, i, k-1);
                fi[u+0] = e * (m[0] * fi1[v11+0] + m[3] * fi1[v11+1] + m[6] * fi1[v11+2]);
                fi[u+1] = .25 * (m[1] * fi1[v10+0] + m[4] * fi1[v10+1]);
                fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2];
                sum += fi[u] + fi[u+1] + fi[u+2];
                //System.out.println("("+i+","+k+";"+u+"): "+fi[u]+","+fi[u+1]+","+fi[u+2]);
            }
            // rescale
            s[i] = sum;
            _beg = set_u(bw, i, beg); _end = set_u(bw, i, end); _end += 2;
            for (k = _beg, sum = 1./sum; k <= _end; ++k) fi[k] *= sum;
        }
        { // f[l_query+1]
            double sum;
            for (k = 1, sum = 0.; k <= l_ref; ++k) {
                int u = set_u(bw, l_query, k);
                if (u < 3 || u >= bw2*3+3) continue; // outside the band
                sum += f[l_query][u+0] * sM + f[l_query][u+1] * sI;
            }
            s[l_query+1] = sum; // the last scaling factor
        }
        /*** backward ***/
        // b[l_query] (b[l_query+1][0]=1 and thus \tilde{b}[][]=1/s[l_query+1]; this is where s[l_query+1] comes from)
        for (k = 1; k <= l_ref; ++k) {
            int u = set_u(bw, l_query, k);
            double[] bi = b[l_query];
            if (u < 3 || u >= bw2*3+3) continue; // outside the band
            bi[u+0] = sM / s[l_query] / s[l_query+1]; bi[u+1] = sI / s[l_query] / s[l_query+1];
        }
        // b[l_query-1..1]
        for (i = l_query - 1; i >= 1; --i) {
            int beg = 1, end = l_ref, x, _beg, _end;
            double[] bi = b[i], bi1 = b[i+1];
            double y = (i > 1)? 1. : 0., qli1 = qual[i+1];
            byte qyi1 = query[i+1];
            x = i - bw; beg = beg > x? beg : x;
            x = i + bw; end = end < x? end : x;
            for (k = end; k >= beg; --k) {
                int u, v11, v01, v10;
                double e;
                u = set_u(bw, i, k); v11 = set_u(bw, i+1, k+1); v10 = set_u(bw, i+1, k); v01 = set_u(bw, i, k+1);
                e = (k >= l_ref? 0 : (ref[k+1] > 3 || qyi1 > 3)? 1. : ref[k+1] == qyi1? 1. - qli1 : qli1 / 3.) * bi1[v11];
                bi[u+0] = e * m[0] + .25 * m[1] * bi1[v10+1] + m[2] * bi[v01+2]; // bi1[v11] has been folded into e.
                bi[u+1] = e * m[3] + .25 * m[4] * bi1[v10+1];
                bi[u+2] = (e * m[6] + m[8] * bi[v01+2]) * y;
            }
            // rescale
            _beg = set_u(bw, i, beg); _end = set_u(bw, i, end); _end += 2;
            for (k = _beg, y = 1./s[i]; k <= _end; ++k) bi[k] *= y;
        }
        double pb;
        { // b[0]
            int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1;
            double sum = 0.;
            for (k = end; k >= beg; --k) {
                int u = set_u(bw, 1, k);
                double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] / 3.;
                if (u < 3 || u >= bw2*3+3) continue;
                sum += e * b[1][u+0] * bM + .25 * b[1][u+1] * bI;
            }
            pb = b[0][set_u(bw, 0, 0)] = sum / s[0]; // if everything works as is expected, pb == 1.0
        }
        // Consistency indicator only; it is computed but not used further.
        int is_diff = Math.abs(pb - 1.) > 1e-7? 1 : 0;
        /*** MAP ***/
        for (i = 1; i <= l_query; ++i) {
            double sum = 0., max = 0.;
            double[] fi = f[i], bi = b[i];
            int beg = 1, end = l_ref, x, max_k = -1;
            x = i - bw; beg = beg > x? beg : x;
            x = i + bw; end = end < x? end : x;
            for (k = beg; k <= end; ++k) {
                int u = set_u(bw, i, k);
                double z;
                // only the first two state cells (u+0, u+1) compete for the maximum
                sum += (z = fi[u+0] * bi[u+0]); if (z > max) { max = z; max_k = (k-1)<<2 | 0; }
                sum += (z = fi[u+1] * bi[u+1]); if (z > max) { max = z; max_k = (k-1)<<2 | 1; }
            }
            max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0
            if (state != null) state[i-1] = max_k;
            if (q != null) {
                // 4.343 ~= 10/ln(10): Phred-scale the probability that the best cell is wrong
                k = (int)(-4.343 * Math.log(1. - max) + .499);
                // NOTE(review): k == 100 is stored as 100 while larger values become 99;
                // kept exactly as in the original.
                q[i-1] = (byte)(k > 100? 99 : k);
            }
            //System.out.println("("+pb+","+sum+")"+" ("+(i-1)+","+(max_k>>2)+","+(max_k&3)+","+max+")");
        }
        return 0;
    }

    /** Smoke test: aligns a 4-base query against a 5-base reference. */
    public static void main(String[] args) {
        byte[] ref = {'\0', '\1', '\3', '\3', '\1'};
        byte[] query = {'\0', '\3', '\3', '\1'};
        byte[] qual = new byte[4];
        qual[0] = qual[1] = qual[2] = qual[3] = (byte)20;
        HmmGlocal hg = new HmmGlocal(1e-3, 0.1, 7);
        hg.hmm_glocal(ref, query, qual, null, null);
    }
}
# PsiCLASS-1.0.2/samtools-0.1.19/misc/Makefile
#
# Builds the helper programs listed in PROG. Headers are taken from the parent
# directory (INCLUDES=-I..), and bamcheck links against libbam found there (-L..).

CC=		gcc
CXX=		g++
CFLAGS=		-g -Wall -O2 #-m64 #-arch ppc
CXXFLAGS=	$(CFLAGS)
DFLAGS=		-D_FILE_OFFSET_BITS=64
OBJS=
PROG=		md5sum-lite md5fa maq2sam-short maq2sam-long ace2sam wgsim bamcheck
INCLUDES=	-I..
SUBDIRS=	.

.SUFFIXES:.c .o

# Generic C compile rule; several objects below override it with explicit rules.
.c.o:
	$(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@

all:$(PROG)

# These targets name actions, not files: declare them phony so that a stray
# file called e.g. "all" or "clean" cannot make them appear up to date.
.PHONY: all lib clean cleanlocal lib-recur all-recur clean-recur cleanlocal-recur install-recur

# "<target>-recur" runs "<target>" in every directory listed in SUBDIRS.
lib-recur all-recur clean-recur cleanlocal-recur install-recur:
	@target=`echo $@ | sed s/-recur//`; \
	wdir=`pwd`; \
	list='$(SUBDIRS)'; for subdir in $$list; do \
		cd $$subdir; \
		$(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \
			INCLUDES="$(INCLUDES)" $$target || exit 1; \
		cd $$wdir; \
	done;

# Nothing to build as a library in this directory.
lib:

bamcheck:bamcheck.o
	$(CC) $(CFLAGS) -o $@ bamcheck.o -L.. -lm -lbam -lpthread -lz

bamcheck.o:bamcheck.c ../faidx.h ../khash.h ../sam.h ../razf.h
	$(CC) $(CFLAGS) -c -I.. -o $@ bamcheck.c

ace2sam:ace2sam.o
	$(CC) $(CFLAGS) -o $@ ace2sam.o -lz

wgsim:wgsim.o
	$(CC) $(CFLAGS) -o $@ wgsim.o -lm -lz

md5fa:md5.o md5fa.o md5.h ../kseq.h
	$(CC) $(CFLAGS) -o $@ md5.o md5fa.o -lz

md5sum-lite:md5sum-lite.o
	$(CC) $(CFLAGS) -o $@ md5sum-lite.o

# md5sum-lite.o is md5.c compiled with its own main() enabled.
md5sum-lite.o:md5.c md5.h
	$(CC) -c $(CFLAGS) -DMD5SUM_MAIN -o $@ md5.c

# maq2sam.c is built twice: once as is, once with -DMAQ_LONGREADS.
maq2sam-short:maq2sam.c
	$(CC) $(CFLAGS) -o $@ maq2sam.c -lz

maq2sam-long:maq2sam.c
	$(CC) $(CFLAGS) -DMAQ_LONGREADS -o $@ maq2sam.c -lz

md5fa.o:md5.h md5fa.c
	$(CC) $(CFLAGS) -c -I.. -o $@ md5fa.c

wgsim.o:wgsim.c ../kseq.h
	$(CC) $(CFLAGS) -c -I.. -o $@ wgsim.c

ace2sam.o:ace2sam.c ../kstring.h ../kseq.h
	$(CC) $(CFLAGS) -c -I.. -o $@ ace2sam.c

cleanlocal:
	rm -fr gmon.out *.o a.out *.exe *.dSYM $(PROG) *~ *.a

clean:cleanlocal-recur
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/misc/ace2sam.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,249 @@ +/* The MIT License + + Copyright (c) 2011 Heng Li <lh3@live.co.uk> + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <zlib.h> +#include "kstring.h" +#include "kseq.h" +KSTREAM_INIT(gzFile, gzread, 16384) + +#define N_TMPSTR 5 +#define LINE_LEN 60 + +// append a CIGAR operation plus length +#define write_cigar(_c, _n, _m, _v) do { \ + if (_n == _m) { \ + _m = _m? 
_m<<1 : 4; \ + _c = realloc(_c, _m * sizeof(unsigned)); \ + } \ + _c[_n++] = (_v); \ + } while (0) + +// a fatal error +static void fatal(const char *msg) +{ + fprintf(stderr, "E %s\n", msg); + exit(1); +} +// remove pads +static void remove_pads(const kstring_t *src, kstring_t *dst) +{ + int i, j; + dst->l = 0; + kputsn(src->s, src->l, dst); + for (i = j = 0; i < dst->l; ++i) + if (dst->s[i] != '*') dst->s[j++] = dst->s[i]; + dst->s[j] = 0; + dst->l = j; +} + +int main(int argc, char *argv[]) +{ + gzFile fp; + kstream_t *ks; + kstring_t s, t[N_TMPSTR]; + int dret, i, k, af_n, af_max, af_i, c, is_padded = 0, write_cns = 0, *p2u = 0; + long m_cigar = 0, n_cigar = 0; + unsigned *af, *cigar = 0; + + while ((c = getopt(argc, argv, "pc")) >= 0) { + switch (c) { + case 'p': is_padded = 1; break; + case 'c': write_cns = 1; break; + } + } + if (argc == optind) { + fprintf(stderr, "\nUsage: ace2sam [-pc] <in.ace>\n\n"); + fprintf(stderr, "Options: -p output padded SAM\n"); + fprintf(stderr, " -c write the contig sequence in SAM\n\n"); + fprintf(stderr, "Notes: 1. Fields must appear in the following order: (CO->[BQ]->(AF)->(RD->QA))\n"); + fprintf(stderr, " 2. The order of reads in AF and in RD must be identical\n"); + fprintf(stderr, " 3. Except in BQ, words and numbers must be separated by a single SPACE or TAB\n"); + fprintf(stderr, " 4. This program writes the headerless SAM to stdout and header to stderr\n\n"); + return 1; + } + + s.l = s.m = 0; s.s = 0; + af_n = af_max = af_i = 0; af = 0; + for (i = 0; i < N_TMPSTR; ++i) t[i].l = t[i].m = 0, t[i].s = 0; + fp = strcmp(argv[1], "-")? 
gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + ks = ks_init(fp); + while (ks_getuntil(ks, 0, &s, &dret) >= 0) { + if (strcmp(s.s, "CO") == 0) { // contig sequence + kstring_t *cns; + t[0].l = t[1].l = t[2].l = t[3].l = t[4].l = 0; // 0: name; 1: padded ctg; 2: unpadded ctg/padded read; 3: unpadded read; 4: SAM line + af_n = af_i = 0; // reset the af array + ks_getuntil(ks, 0, &s, &dret); kputs(s.s, &t[0]); // contig name + ks_getuntil(ks, '\n', &s, &dret); // read the whole line + while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) kputsn(s.s, s.l, &t[1]); // read the padded consensus sequence + remove_pads(&t[1], &t[2]); // construct the unpadded sequence + // compute the array for mapping padded positions to unpadded positions + p2u = realloc(p2u, t[1].m * sizeof(int)); + for (i = k = 0; i < t[1].l; ++i) { + p2u[i] = k; + if (t[1].s[i] != '*') ++k; + } + // write out the SAM header and contig sequences + fprintf(stderr, "H @SQ\tSN:%s\tLN:%ld\n", t[0].s, t[is_padded?1:2].l); // The SAM header line + cns = &t[is_padded?1:2]; + fprintf(stderr, "S >%s\n", t[0].s); + for (i = 0; i < cns->l; i += LINE_LEN) { + fputs("S ", stderr); + for (k = 0; k < LINE_LEN && i + k < cns->l; ++k) + fputc(cns->s[i + k], stderr); + fputc('\n', stderr); + } + +#define __padded2cigar(sp) do { \ + int i, l_M = 0, l_D = 0; \ + for (i = 0; i < sp.l; ++i) { \ + if (sp.s[i] == '*') { \ + if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \ + ++l_D; l_M = 0; \ + } else { \ + if (l_D) write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \ + ++l_M; l_D = 0; \ + } \ + } \ + if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \ + else write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \ + } while (0) + + if (write_cns) { // write the consensus SAM line (dummy read) + n_cigar = 0; + if (is_padded) __padded2cigar(t[1]); + else write_cigar(cigar, n_cigar, m_cigar, t[2].l<<4); + kputsn(t[0].s, t[0].l, &t[4]); kputs("\t516\t", &t[4]); kputsn(t[0].s, t[0].l, &t[4]); kputs("\t1\t60\t", 
&t[4]); + for (i = 0; i < n_cigar; ++i) { + kputw(cigar[i]>>4, &t[4]); kputc("MIDNSHP=X"[cigar[i]&0xf], &t[4]); + } + kputs("\t*\t0\t0\t", &t[4]); kputsn(t[2].s, t[2].l, &t[4]); kputs("\t*", &t[4]); + } + } else if (strcmp(s.s, "BQ") == 0) { // contig quality + if (t[0].l == 0) fatal("come to 'BQ' before reading 'CO'"); + if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); // read the entire "BQ" line + if (write_cns) t[4].s[--t[4].l] = 0; // remove the trailing "*" + for (i = 0; i < t[2].l; ++i) { // read the consensus quality + int q; + if (ks_getuntil(ks, 0, &s, &dret) < 0) fprintf(stderr, "E truncated contig quality\n"); + if (s.l) { + q = atoi(s.s) + 33; + if (q > 126) q = 126; + if (write_cns) kputc(q, &t[4]); + } else --i; + } + if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); + ks_getuntil(ks, '\n', &s, &dret); // skip the empty line + if (write_cns) puts(t[4].s); t[4].l = 0; + } else if (strcmp(s.s, "AF") == 0) { // padded read position + int reversed, neg, pos; + if (t[0].l == 0) fatal("come to 'AF' before reading 'CO'"); + if (write_cns) { + if (t[4].l) puts(t[4].s); + t[4].l = 0; + } + ks_getuntil(ks, 0, &s, &dret); // read name + ks_getuntil(ks, 0, &s, &dret); reversed = s.s[0] == 'C'? 1 : 0; // strand + ks_getuntil(ks, 0, &s, &dret); pos = atoi(s.s); neg = pos < 0? 1 : 0; pos = pos < 0? -pos : pos; // position + if (af_n == af_max) { // double the af array + af_max = af_max? 
af_max<<1 : 4; + af = realloc(af, af_max * sizeof(unsigned)); + } + af[af_n++] = pos << 2 | neg << 1 | reversed; // keep the placement information + } else if (strcmp(s.s, "RD") == 0) { // read sequence + if (af_i >= af_n) fatal("more 'RD' records than 'AF'"); + t[2].l = t[3].l = t[4].l = 0; + ks_getuntil(ks, 0, &t[4], &dret); // QNAME + if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); // read the entire RD line + while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) kputs(s.s, &t[2]); // read the read sequence + } else if (strcmp(s.s, "QA") == 0) { // clipping + if (af_i >= af_n) fatal("more 'QA' records than 'AF'"); + int beg, end, pos, op; + ks_getuntil(ks, 0, &s, &dret); ks_getuntil(ks, 0, &s, &dret); // skip quality clipping + ks_getuntil(ks, 0, &s, &dret); beg = atoi(s.s) - 1; // align clipping start + ks_getuntil(ks, 0, &s, &dret); end = atoi(s.s); // clipping end + if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); + // compute 1-based POS + pos = af[af_i]>>2; // retrieve the position information + if (af[af_i]>>1&1) pos = -pos; + pos += beg; // now pos is the true padded position + // generate CIGAR + remove_pads(&t[2], &t[3]); // backup the unpadded read sequence + n_cigar = 0; + if (beg) write_cigar(cigar, n_cigar, m_cigar, beg<<4|4); + if (is_padded) { + __padded2cigar(t[2]); + if (beg && n_cigar > 1) cigar[1] -= beg<<4; // fix the left-hand CIGAR + if (end < t[2].l && n_cigar) cigar[n_cigar-1] -= (t[2].l - end)<<4; // fix the right-hand CIGAR + } else { + // generate flattened CIGAR string + for (i = beg, k = pos - 1; i < end; ++i, ++k) + t[2].s[i] = t[2].s[i] != '*'? (t[1].s[k] != '*'? 0 : 1) : (t[1].s[k] != '*'? 
2 : 6); + // generate the proper CIGAR + for (i = beg + 1, k = 1, op = t[2].s[beg]; i < end; ++i) { + if (op != t[2].s[i]) { + write_cigar(cigar, n_cigar, m_cigar, k<<4|op); + op = t[2].s[i]; k = 1; + } else ++k; + } + write_cigar(cigar, n_cigar, m_cigar, k<<4|op); + // remove unnecessary "P" and possibly merge adjacent operations + for (i = 2; i < n_cigar; ++i) { + if ((cigar[i]&0xf) != 1 && (cigar[i-1]&0xf) == 6 && (cigar[i-2]&0xf) != 1) { + cigar[i-1] = 0; + if ((cigar[i]&0xf) == (cigar[i-2]&0xf)) // merge operations + cigar[i] += cigar[i-2], cigar[i-2] = 0; + } + } + for (i = k = 0; i < n_cigar; ++i) // squeeze out dumb operations + if (cigar[i]) cigar[k++] = cigar[i]; + n_cigar = k; + } + if (end < t[2].l) write_cigar(cigar, n_cigar, m_cigar, (t[2].l - end)<<4|4); + // write the SAM line for the read + kputc('\t', &t[4]); // QNAME has already been written + kputw((af[af_i]&1)? 16 : 0, &t[4]); kputc('\t', &t[4]); // FLAG + kputsn(t[0].s, t[0].l, &t[4]); kputc('\t', &t[4]); // RNAME + kputw(is_padded? pos : p2u[pos-1]+1, &t[4]); // POS + kputs("\t60\t", &t[4]); // MAPQ + for (i = 0; i < n_cigar; ++i) { // CIGAR + kputw(cigar[i]>>4, &t[4]); kputc("MIDNSHP=X"[cigar[i]&0xf], &t[4]); + } + kputs("\t*\t0\t0\t", &t[4]); // empty MRNM, MPOS and TLEN + kputsn(t[3].s, t[3].l, &t[4]); // unpadded SEQ + kputs("\t*", &t[4]); // QUAL + puts(t[4].s); // print to stdout + ++af_i; + } else if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); + } + ks_destroy(ks); + gzclose(fp); + free(af); free(s.s); free(cigar); free(p2u); + for (i = 0; i < N_TMPSTR; ++i) free(t[i].s); + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/misc/bamcheck.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,1521 @@ +/* + Author: petr.danecek@sanger + gcc -Wall -Winline -g -O2 -I ~/git/samtools bamcheck.c -o bamcheck -lm -lz -L ~/git/samtools -lbam -lpthread + + Assumptions, approximations and other issues: + - GC-depth graph does not split reads, the starting position determines which bin is incremented. + There are small overlaps between bins (max readlen-1). However, the bins are big (20k). + - coverage distribution ignores softclips and deletions + - some stats require sorted BAMs + - GC content graph can have an untidy, step-like pattern when BAM contains multiple read lengths. + - 'bases mapped' (stats->nbases_mapped) is calculated from read lengths given by BAM (core.l_qseq) + - With the -t option, the whole reads are used. Except for the number of mapped bases (cigar) + counts, no splicing is done, no indels or soft clips are considered, even small overlap is + good enough to include the read in the stats. 
+ +*/ + +#define BAMCHECK_VERSION "2012-09-04" + +#define _ISOC99_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <stdarg.h> +#include <string.h> +#include <math.h> +#include <ctype.h> +#include <getopt.h> +#include <errno.h> +#include <assert.h> +#include "faidx.h" +#include "khash.h" +#include "sam.h" +#include "sam_header.h" +#include "razf.h" + +#define BWA_MIN_RDLEN 35 +#define IS_PAIRED(bam) ((bam)->core.flag&BAM_FPAIRED && !((bam)->core.flag&BAM_FUNMAP) && !((bam)->core.flag&BAM_FMUNMAP)) +#define IS_UNMAPPED(bam) ((bam)->core.flag&BAM_FUNMAP) +#define IS_REVERSE(bam) ((bam)->core.flag&BAM_FREVERSE) +#define IS_MATE_REVERSE(bam) ((bam)->core.flag&BAM_FMREVERSE) +#define IS_READ1(bam) ((bam)->core.flag&BAM_FREAD1) +#define IS_READ2(bam) ((bam)->core.flag&BAM_FREAD2) +#define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP) + +typedef struct +{ + int32_t line_len, line_blen; + int64_t len; + uint64_t offset; +} +faidx1_t; +KHASH_MAP_INIT_STR(kh_faidx, faidx1_t) +KHASH_MAP_INIT_STR(kh_bam_tid, int) +KHASH_MAP_INIT_STR(kh_rg, const char *) +struct __faidx_t { + RAZF *rz; + int n, m; + char **name; + khash_t(kh_faidx) *hash; +}; + +typedef struct +{ + float gc; + uint32_t depth; +} +gc_depth_t; + +// For coverage distribution, a simple pileup +typedef struct +{ + int64_t pos; + int size, start; + int *buffer; +} +round_buffer_t; + +typedef struct { uint32_t from, to; } pos_t; +typedef struct +{ + int npos,mpos,cpos; + pos_t *pos; +} +regions_t; + +typedef struct +{ + // Parameters + int trim_qual; // bwa trim quality + + // Dimensions of the quality histogram holder (quals_1st,quals_2nd), GC content holder (gc_1st,gc_2nd), + // insert size histogram holder + int nquals; // The number of quality bins + int nbases; // The maximum sequence length the allocated array can hold + int nisize; // The maximum insert size that the allocated array can hold + int ngc; // The size of gc_1st and gc_2nd + int nindels; // The maximum indel length for indel distribution + + // 
Arrays for the histogram data + uint64_t *quals_1st, *quals_2nd; + uint64_t *gc_1st, *gc_2nd; + uint64_t *isize_inward, *isize_outward, *isize_other; + uint64_t *acgt_cycles; + uint64_t *read_lengths; + uint64_t *insertions, *deletions; + uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd; + + // The extremes encountered + int max_len; // Maximum read length + int max_qual; // Maximum quality + float isize_main_bulk; // There are always some unrealistically big insert sizes, report only the main part + int is_sorted; + + // Summary numbers + uint64_t total_len; + uint64_t total_len_dup; + uint64_t nreads_1st; + uint64_t nreads_2nd; + uint64_t nreads_filtered; + uint64_t nreads_dup; + uint64_t nreads_unmapped; + uint64_t nreads_unpaired; + uint64_t nreads_paired; + uint64_t nreads_anomalous; + uint64_t nreads_mq0; + uint64_t nbases_mapped; + uint64_t nbases_mapped_cigar; + uint64_t nbases_trimmed; // bwa trimmed bases + uint64_t nmismatches; + uint64_t nreads_QCfailed, nreads_secondary; + + // GC-depth related data + uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin + gc_depth_t *gcd; // The GC-depth bins holder + int gcd_bin_size; // The size of GC-depth bin + uint32_t gcd_ref_size; // The approximate size of the genome + int32_t tid, gcd_pos; // Position of the current bin + int32_t pos; // Position of the last read + + // Coverage distribution related data + int ncov; // The number of coverage bins + uint64_t *cov; // The coverage frequencies + int cov_min,cov_max,cov_step; // Minimum, maximum coverage and size of the coverage bins + round_buffer_t cov_rbuf; // Pileup round buffer + + // Mismatches by read cycle + uint8_t *rseq_buf; // A buffer for reference sequence to check the mismatches against + int mrseq_buf; // The size of the buffer + int32_t rseq_pos; // The coordinate of the first base in the buffer + int32_t nrseq_buf; // The used part of the buffer + uint64_t *mpc_buf; // Mismatches per 
cycle + + // Filters + int filter_readlen; + + // Target regions + int nregions, reg_from,reg_to; + regions_t *regions; + + // Auxiliary data + int flag_require, flag_filter; + double sum_qual; // For calculating average quality value + samfile_t *sam; + khash_t(kh_rg) *rg_hash; // Read groups to include, the array is null-terminated + faidx_t *fai; // Reference sequence for GC-depth graph + int argc; // Command line arguments to be printed on the output + char **argv; +} +stats_t; + +void error(const char *format, ...); +void bam_init_header_hash(bam_header_t *header); +int is_in_regions(bam1_t *bam_line, stats_t *stats); + + +// Coverage distribution methods +inline int coverage_idx(int min, int max, int n, int step, int depth) +{ + if ( depth < min ) + return 0; + + if ( depth > max ) + return n-1; + + return 1 + (depth - min) / step; +} + +inline int round_buffer_lidx2ridx(int offset, int size, int64_t refpos, int64_t pos) +{ + return (offset + (pos-refpos) % size) % size; +} + +void round_buffer_flush(stats_t *stats, int64_t pos) +{ + int ibuf,idp; + + if ( pos==stats->cov_rbuf.pos ) + return; + + int64_t new_pos = pos; + if ( pos==-1 || pos - stats->cov_rbuf.pos >= stats->cov_rbuf.size ) + { + // Flush the whole buffer, but in sequential order, + pos = stats->cov_rbuf.pos + stats->cov_rbuf.size - 1; + } + + if ( pos < stats->cov_rbuf.pos ) + error("Expected coordinates in ascending order, got %ld after %ld\n", pos,stats->cov_rbuf.pos); + + int ifrom = stats->cov_rbuf.start; + int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos-1); + if ( ifrom>ito ) + { + for (ibuf=ifrom; ibuf<stats->cov_rbuf.size; ibuf++) + { + if ( !stats->cov_rbuf.buffer[ibuf] ) + continue; + idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]); + stats->cov[idp]++; + stats->cov_rbuf.buffer[ibuf] = 0; + } + ifrom = 0; + } + for (ibuf=ifrom; ibuf<=ito; ibuf++) + { + if ( 
!stats->cov_rbuf.buffer[ibuf] ) + continue; + idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]); + stats->cov[idp]++; + stats->cov_rbuf.buffer[ibuf] = 0; + } + stats->cov_rbuf.start = (new_pos==-1) ? 0 : round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos); + stats->cov_rbuf.pos = new_pos; +} + +void round_buffer_insert_read(round_buffer_t *rbuf, int64_t from, int64_t to) +{ + if ( to-from >= rbuf->size ) + error("The read length too big (%d), please increase the buffer length (currently %d)\n", to-from+1,rbuf->size); + if ( from < rbuf->pos ) + error("The reads are not sorted (%ld comes after %ld).\n", from,rbuf->pos); + + int ifrom,ito,ibuf; + ifrom = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,from); + ito = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,to); + if ( ifrom>ito ) + { + for (ibuf=ifrom; ibuf<rbuf->size; ibuf++) + rbuf->buffer[ibuf]++; + ifrom = 0; + } + for (ibuf=ifrom; ibuf<=ito; ibuf++) + rbuf->buffer[ibuf]++; +} + +// Calculate the number of bases in the read trimmed by BWA +int bwa_trim_read(int trim_qual, uint8_t *quals, int len, int reverse) +{ + if ( len<BWA_MIN_RDLEN ) return 0; + + // Although the name implies that the read cannot be trimmed to more than BWA_MIN_RDLEN, + // the calculation can in fact trim it to (BWA_MIN_RDLEN-1). (bwa_trim_read in bwa/bwaseqio.c). + int max_trimmed = len - BWA_MIN_RDLEN + 1; + int l, sum=0, max_sum=0, max_l=0; + + for (l=0; l<max_trimmed; l++) + { + sum += trim_qual - quals[ reverse ? l : len-1-l ]; + if ( sum<0 ) break; + if ( sum>max_sum ) + { + max_sum = sum; + // This is the correct way, but bwa clips from some reason one base less + // max_l = l+1; + max_l = l; + } + } + return max_l; +} + + +void count_indels(stats_t *stats,bam1_t *bam_line) +{ + int is_fwd = IS_REVERSE(bam_line) ? 0 : 1; + int is_1st = IS_READ1(bam_line) ? 
1 : 0; + int icig; + int icycle = 0; + int read_len = bam_line->core.l_qseq; + for (icig=0; icig<bam_line->core.n_cigar; icig++) + { + // Conversion from uint32_t to MIDNSHP + // 0123456 + // MIDNSHP + int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK; + int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT; + + if ( cig==1 ) + { + int idx = is_fwd ? icycle : read_len-icycle-ncig; + if ( idx<0 ) + error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle); + if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line)); + if ( is_1st ) + stats->ins_cycles_1st[idx]++; + else + stats->ins_cycles_2nd[idx]++; + icycle += ncig; + if ( ncig<=stats->nindels ) + stats->insertions[ncig-1]++; + continue; + } + if ( cig==2 ) + { + int idx = is_fwd ? icycle-1 : read_len-icycle-1; + if ( idx<0 ) continue; // discard meaningless deletions + if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases); + if ( is_1st ) + stats->del_cycles_1st[idx]++; + else + stats->del_cycles_2nd[idx]++; + if ( ncig<=stats->nindels ) + stats->deletions[ncig-1]++; + continue; + } + if ( cig!=3 && cig!=5 ) + icycle += ncig; + } +} + +void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line) +{ + int is_fwd = IS_REVERSE(bam_line) ? 
0 : 1; + int icig,iread=0,icycle=0; + int iref = bam_line->core.pos - stats->rseq_pos; + int read_len = bam_line->core.l_qseq; + uint8_t *read = bam1_seq(bam_line); + uint8_t *quals = bam1_qual(bam_line); + uint64_t *mpc_buf = stats->mpc_buf; + for (icig=0; icig<bam_line->core.n_cigar; icig++) + { + // Conversion from uint32_t to MIDNSHP + // 0123456 + // MIDNSHP + int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK; + int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT; + if ( cig==1 ) + { + iread += ncig; + icycle += ncig; + continue; + } + if ( cig==2 ) + { + iref += ncig; + continue; + } + if ( cig==4 ) + { + icycle += ncig; + // Soft-clips are present in the sequence, but the position of the read marks a start of non-clipped sequence + // iref += ncig; + iread += ncig; + continue; + } + if ( cig==5 ) + { + icycle += ncig; + continue; + } + // Ignore H and N CIGARs. The letter are inserted e.g. by TopHat and often require very large + // chunk of refseq in memory. Not very frequent and not noticable in the stats. + if ( cig==3 || cig==5 ) continue; + if ( cig!=0 ) + error("TODO: cigar %d, %s:%d %s\n", cig,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line)); + + if ( ncig+iref > stats->nrseq_buf ) + error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->nrseq_buf, bam1_qname(bam_line),stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1); + + int im; + for (im=0; im<ncig; im++) + { + uint8_t cread = bam1_seqi(read,iread); + uint8_t cref = stats->rseq_buf[iref]; + + // ---------------15 + // =ACMGRSVTWYHKDBN + if ( cread==15 ) + { + int idx = is_fwd ? 
icycle : read_len-icycle-1; + if ( idx>stats->max_len ) + error("mpc: %d>%d\n",idx,stats->max_len); + idx = idx*stats->nquals; + if ( idx>=stats->nquals*stats->nbases ) + error("FIXME: mpc_buf overflow\n"); + mpc_buf[idx]++; + } + else if ( cref && cread && cref!=cread ) + { + uint8_t qual = quals[iread] + 1; + if ( qual>=stats->nquals ) + error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line)); + + int idx = is_fwd ? icycle : read_len-icycle-1; + if ( idx>stats->max_len ) + error("mpc: %d>%d\n",idx,stats->max_len); + + idx = idx*stats->nquals + qual; + if ( idx>=stats->nquals*stats->nbases ) + error("FIXME: mpc_buf overflow\n"); + mpc_buf[idx]++; + } + + iref++; + iread++; + icycle++; + } + } +} + +void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos) +{ + khash_t(kh_faidx) *h; + khiter_t iter; + faidx1_t val; + char *chr, c; + faidx_t *fai = stats->fai; + + h = fai->hash; + chr = stats->sam->header->target_name[tid]; + + // ID of the sequence name + iter = kh_get(kh_faidx, h, chr); + if (iter == kh_end(h)) + error("No such reference sequence [%s]?\n", chr); + val = kh_value(h, iter); + + // Check the boundaries + if (pos >= val.len) + error("Was the bam file mapped with the reference sequence supplied?" + " A read mapped beyond the end of the chromosome (%s:%d, chromosome length %d).\n", chr,pos,val.len); + int size = stats->mrseq_buf; + // The buffer extends beyond the chromosome end. Later the rest will be filled with N's. 
+ if (size+pos > val.len) size = val.len-pos; + + // Position the razf reader + razf_seek(fai->rz, val.offset + pos / val.line_blen * val.line_len + pos % val.line_blen, SEEK_SET); + + uint8_t *ptr = stats->rseq_buf; + int nread = 0; + while ( nread<size && razf_read(fai->rz,&c,1) && !fai->rz->z_err ) + { + if ( !isgraph(c) ) + continue; + + // Conversion between uint8_t coding and ACGT + // -12-4---8------- + // =ACMGRSVTWYHKDBN + if ( c=='A' || c=='a' ) + *ptr = 1; + else if ( c=='C' || c=='c' ) + *ptr = 2; + else if ( c=='G' || c=='g' ) + *ptr = 4; + else if ( c=='T' || c=='t' ) + *ptr = 8; + else + *ptr = 0; + ptr++; + nread++; + } + if ( nread < stats->mrseq_buf ) + { + memset(ptr,0, stats->mrseq_buf - nread); + nread = stats->mrseq_buf; + } + stats->nrseq_buf = nread; + stats->rseq_pos = pos; + stats->tid = tid; +} + +float fai_gc_content(stats_t *stats, int pos, int len) +{ + uint32_t gc,count,c; + int i = pos - stats->rseq_pos, ito = i + len; + assert( i>=0 && ito<=stats->nrseq_buf ); + + // Count GC content + gc = count = 0; + for (; i<ito; i++) + { + c = stats->rseq_buf[i]; + if ( c==2 || c==4 ) + { + gc++; + count++; + } + else if ( c==1 || c==8 ) + count++; + } + return count ? 
(float)gc/count : 0; +} + +void realloc_rseq_buffer(stats_t *stats) +{ + int n = stats->nbases*10; + if ( stats->gcd_bin_size > n ) n = stats->gcd_bin_size; + if ( stats->mrseq_buf<n ) + { + stats->rseq_buf = realloc(stats->rseq_buf,sizeof(uint8_t)*n); + stats->mrseq_buf = n; + } +} + +void realloc_gcd_buffer(stats_t *stats, int seq_len) +{ + if ( seq_len >= stats->gcd_bin_size ) + error("The --GC-depth bin size (%d) is set too low for the read length %d\n", stats->gcd_bin_size, seq_len); + + int n = 1 + stats->gcd_ref_size / (stats->gcd_bin_size - seq_len); + if ( n <= stats->igcd ) + error("The --GC-depth bin size is too small or reference genome too big; please decrease the bin size or increase the reference length\n"); + + if ( n > stats->ngcd ) + { + stats->gcd = realloc(stats->gcd, n*sizeof(gc_depth_t)); + if ( !stats->gcd ) + error("Could not realloc GCD buffer, too many chromosomes or the genome too long?? [%u %u]\n", stats->ngcd,n); + memset(&(stats->gcd[stats->ngcd]),0,(n-stats->ngcd)*sizeof(gc_depth_t)); + stats->ngcd = n; + } + + realloc_rseq_buffer(stats); +} + +void realloc_buffers(stats_t *stats, int seq_len) +{ + int n = 2*(1 + seq_len - stats->nbases) + stats->nbases; + + stats->quals_1st = realloc(stats->quals_1st, n*stats->nquals*sizeof(uint64_t)); + if ( !stats->quals_1st ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t)); + memset(stats->quals_1st + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t)); + + stats->quals_2nd = realloc(stats->quals_2nd, n*stats->nquals*sizeof(uint64_t)); + if ( !stats->quals_2nd ) + error("Could not realloc buffers, the sequence too long: %d (2x%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t)); + memset(stats->quals_2nd + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t)); + + if ( stats->mpc_buf ) + { + stats->mpc_buf = realloc(stats->mpc_buf, n*stats->nquals*sizeof(uint64_t)); + if ( 
!stats->mpc_buf ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t)); + memset(stats->mpc_buf + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t)); + } + + stats->acgt_cycles = realloc(stats->acgt_cycles, n*4*sizeof(uint64_t)); + if ( !stats->acgt_cycles ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*4*sizeof(uint64_t)); + memset(stats->acgt_cycles + stats->nbases*4, 0, (n-stats->nbases)*4*sizeof(uint64_t)); + + stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t)); + if ( !stats->read_lengths ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t)); + memset(stats->read_lengths + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t)); + + stats->insertions = realloc(stats->insertions, n*sizeof(uint64_t)); + if ( !stats->insertions ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t)); + memset(stats->insertions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t)); + + stats->deletions = realloc(stats->deletions, n*sizeof(uint64_t)); + if ( !stats->deletions ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t)); + memset(stats->deletions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t)); + + stats->ins_cycles_1st = realloc(stats->ins_cycles_1st, (n+1)*sizeof(uint64_t)); + if ( !stats->ins_cycles_1st ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t)); + memset(stats->ins_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t)); + + stats->ins_cycles_2nd = realloc(stats->ins_cycles_2nd, (n+1)*sizeof(uint64_t)); + if ( !stats->ins_cycles_2nd ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t)); + memset(stats->ins_cycles_2nd + stats->nbases + 1, 0, 
(n-stats->nbases)*sizeof(uint64_t)); + + stats->del_cycles_1st = realloc(stats->del_cycles_1st, (n+1)*sizeof(uint64_t)); + if ( !stats->del_cycles_1st ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t)); + memset(stats->del_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t)); + + stats->del_cycles_2nd = realloc(stats->del_cycles_2nd, (n+1)*sizeof(uint64_t)); + if ( !stats->del_cycles_2nd ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t)); + memset(stats->del_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t)); + + stats->nbases = n; + + // Realloc the coverage distribution buffer + int *rbuffer = calloc(sizeof(int),seq_len*5); + n = stats->cov_rbuf.size-stats->cov_rbuf.start; + memcpy(rbuffer,stats->cov_rbuf.buffer+stats->cov_rbuf.start,n); + if ( stats->cov_rbuf.start>1 ) + memcpy(rbuffer+n,stats->cov_rbuf.buffer,stats->cov_rbuf.start); + stats->cov_rbuf.start = 0; + free(stats->cov_rbuf.buffer); + stats->cov_rbuf.buffer = rbuffer; + stats->cov_rbuf.size = seq_len*5; + + realloc_rseq_buffer(stats); +} + +void collect_stats(bam1_t *bam_line, stats_t *stats) +{ + if ( stats->rg_hash ) + { + const uint8_t *rg = bam_aux_get(bam_line, "RG"); + if ( !rg ) return; + khiter_t k = kh_get(kh_rg, stats->rg_hash, (const char*)(rg + 1)); + if ( k == kh_end(stats->rg_hash) ) return; + } + if ( stats->flag_require && (bam_line->core.flag & stats->flag_require)!=stats->flag_require ) + { + stats->nreads_filtered++; + return; + } + if ( stats->flag_filter && (bam_line->core.flag & stats->flag_filter) ) + { + stats->nreads_filtered++; + return; + } + if ( !is_in_regions(bam_line,stats) ) + return; + if ( stats->filter_readlen!=-1 && bam_line->core.l_qseq!=stats->filter_readlen ) + return; + + if ( bam_line->core.flag & BAM_FQCFAIL ) stats->nreads_QCfailed++; + if ( bam_line->core.flag & BAM_FSECONDARY ) stats->nreads_secondary++; + + int 
seq_len = bam_line->core.l_qseq; + if ( !seq_len ) return; + + if ( seq_len >= stats->nbases ) + realloc_buffers(stats,seq_len); + if ( stats->max_len<seq_len ) + stats->max_len = seq_len; + + stats->read_lengths[seq_len]++; + + // Count GC and ACGT per cycle + uint8_t base, *seq = bam1_seq(bam_line); + int gc_count = 0; + int i; + int reverse = IS_REVERSE(bam_line); + for (i=0; i<seq_len; i++) + { + // Conversion from uint8_t coding to ACGT + // -12-4---8------- + // =ACMGRSVTWYHKDBN + // 01 2 3 + base = bam1_seqi(seq,i); + base /= 2; + if ( base==1 || base==2 ) gc_count++; + else if ( base>2 ) base=3; + if ( 4*(reverse ? seq_len-i-1 : i) + base >= stats->nbases*4 ) + error("FIXME: acgt_cycles\n"); + stats->acgt_cycles[ 4*(reverse ? seq_len-i-1 : i) + base ]++; + } + int gc_idx_min = gc_count*(stats->ngc-1)/seq_len; + int gc_idx_max = (gc_count+1)*(stats->ngc-1)/seq_len; + if ( gc_idx_max >= stats->ngc ) gc_idx_max = stats->ngc - 1; + + // Determine which array (1st or 2nd read) will these stats go to, + // trim low quality bases from end the same way BWA does, + // fill GC histogram + uint64_t *quals; + uint8_t *bam_quals = bam1_qual(bam_line); + if ( bam_line->core.flag&BAM_FREAD2 ) + { + quals = stats->quals_2nd; + stats->nreads_2nd++; + for (i=gc_idx_min; i<gc_idx_max; i++) + stats->gc_2nd[i]++; + } + else + { + quals = stats->quals_1st; + stats->nreads_1st++; + for (i=gc_idx_min; i<gc_idx_max; i++) + stats->gc_1st[i]++; + } + if ( stats->trim_qual>0 ) + stats->nbases_trimmed += bwa_trim_read(stats->trim_qual, bam_quals, seq_len, reverse); + + // Quality histogram and average quality + for (i=0; i<seq_len; i++) + { + uint8_t qual = bam_quals[ reverse ? 
seq_len-i-1 : i]; + if ( qual>=stats->nquals ) + error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line)); + if ( qual>stats->max_qual ) + stats->max_qual = qual; + + quals[ i*stats->nquals+qual ]++; + stats->sum_qual += qual; + } + + // Look at the flags and increment appropriate counters (mapped, paired, etc) + if ( IS_UNMAPPED(bam_line) ) + stats->nreads_unmapped++; + else + { + if ( !bam_line->core.qual ) + stats->nreads_mq0++; + + count_indels(stats,bam_line); + + if ( !IS_PAIRED(bam_line) ) + stats->nreads_unpaired++; + else + { + stats->nreads_paired++; + + if ( bam_line->core.tid!=bam_line->core.mtid ) + stats->nreads_anomalous++; + + // The insert size is tricky, because for long inserts the libraries are + // prepared differently and the pairs point in other direction. BWA does + // not set the paired flag for them. Similar thing is true also for 454 + // reads. Mates mapped to different chromosomes have isize==0. + int32_t isize = bam_line->core.isize; + if ( isize<0 ) isize = -isize; + if ( isize >= stats->nisize ) + isize = stats->nisize-1; + if ( isize>0 || bam_line->core.tid==bam_line->core.mtid ) + { + int pos_fst = bam_line->core.mpos - bam_line->core.pos; + int is_fst = IS_READ1(bam_line) ? 1 : -1; + int is_fwd = IS_REVERSE(bam_line) ? -1 : 1; + int is_mfwd = IS_MATE_REVERSE(bam_line) ? 
-1 : 1; + + if ( is_fwd*is_mfwd>0 ) + stats->isize_other[isize]++; + else if ( is_fst*pos_fst>0 ) + { + if ( is_fst*is_fwd>0 ) + stats->isize_inward[isize]++; + else + stats->isize_outward[isize]++; + } + else if ( is_fst*pos_fst<0 ) + { + if ( is_fst*is_fwd>0 ) + stats->isize_outward[isize]++; + else + stats->isize_inward[isize]++; + } + } + } + + // Number of mismatches + uint8_t *nm = bam_aux_get(bam_line,"NM"); + if (nm) + stats->nmismatches += bam_aux2i(nm); + + // Number of mapped bases from cigar + // Conversion from uint32_t to MIDNSHP + // 012-4-- + // MIDNSHP + if ( bam_line->core.n_cigar == 0) + error("FIXME: mapped read with no cigar?\n"); + int readlen=seq_len; + if ( stats->regions ) + { + // Count only on-target bases + int iref = bam_line->core.pos + 1; + for (i=0; i<bam_line->core.n_cigar; i++) + { + int cig = bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK; + int ncig = bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT; + if ( cig==2 ) readlen += ncig; + else if ( cig==0 ) + { + if ( iref < stats->reg_from ) ncig -= stats->reg_from-iref; + else if ( iref+ncig-1 > stats->reg_to ) ncig -= iref+ncig-1 - stats->reg_to; + if ( ncig<0 ) ncig = 0; + stats->nbases_mapped_cigar += ncig; + iref += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT; + } + else if ( cig==1 ) + { + iref += ncig; + if ( iref>=stats->reg_from && iref<=stats->reg_to ) + stats->nbases_mapped_cigar += ncig; + } + } + } + else + { + // Count the whole read + for (i=0; i<bam_line->core.n_cigar; i++) + { + if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==0 || (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==1 ) + stats->nbases_mapped_cigar += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT; + if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==2 ) + readlen += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT; + } + } + stats->nbases_mapped += seq_len; + + if ( stats->tid==bam_line->core.tid && bam_line->core.pos<stats->pos ) + stats->is_sorted = 0; + stats->pos = bam_line->core.pos; + + if ( stats->is_sorted ) + { + if ( stats->tid==-1 || 
stats->tid!=bam_line->core.tid ) + round_buffer_flush(stats,-1); + + // Mismatches per cycle and GC-depth graph. For simplicity, reads overlapping GCD bins + // are not splitted which results in up to seq_len-1 overlaps. The default bin size is + // 20kbp, so the effect is negligible. + if ( stats->fai ) + { + int inc_ref = 0, inc_gcd = 0; + // First pass or new chromosome + if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid ) { inc_ref=1; inc_gcd=1; } + // Read goes beyond the end of the rseq buffer + else if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen ) { inc_ref=1; inc_gcd=1; } + // Read overlaps the next gcd bin + else if ( stats->gcd_pos+stats->gcd_bin_size < bam_line->core.pos+readlen ) + { + inc_gcd = 1; + if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->gcd_bin_size ) inc_ref = 1; + } + if ( inc_gcd ) + { + stats->igcd++; + if ( stats->igcd >= stats->ngcd ) + realloc_gcd_buffer(stats, readlen); + if ( inc_ref ) + read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos); + stats->gcd_pos = bam_line->core.pos; + stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->gcd_bin_size); + } + + count_mismatches_per_cycle(stats,bam_line); + } + // No reference and first pass, new chromosome or sequence going beyond the end of the gcd bin + else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size ) + { + // First pass or a new chromosome + stats->tid = bam_line->core.tid; + stats->gcd_pos = bam_line->core.pos; + stats->igcd++; + if ( stats->igcd >= stats->ngcd ) + realloc_gcd_buffer(stats, readlen); + } + stats->gcd[ stats->igcd ].depth++; + // When no reference sequence is given, approximate the GC from the read (much shorter window, but otherwise OK) + if ( !stats->fai ) + stats->gcd[ stats->igcd ].gc += (float) gc_count / seq_len; + + // Coverage distribution graph + round_buffer_flush(stats,bam_line->core.pos); + 
round_buffer_insert_read(&(stats->cov_rbuf),bam_line->core.pos,bam_line->core.pos+seq_len-1); + } + } + + stats->total_len += seq_len; + if ( IS_DUP(bam_line) ) + { + stats->total_len_dup += seq_len; + stats->nreads_dup++; + } +} + +// Sort by GC and depth +#define GCD_t(x) ((gc_depth_t *)x) +static int gcd_cmp(const void *a, const void *b) +{ + if ( GCD_t(a)->gc < GCD_t(b)->gc ) return -1; + if ( GCD_t(a)->gc > GCD_t(b)->gc ) return 1; + if ( GCD_t(a)->depth < GCD_t(b)->depth ) return -1; + if ( GCD_t(a)->depth > GCD_t(b)->depth ) return 1; + return 0; +} +#undef GCD_t + +float gcd_percentile(gc_depth_t *gcd, int N, int p) +{ + float n,d; + int k; + + n = p*(N+1)/100; + k = n; + if ( k<=0 ) + return gcd[0].depth; + if ( k>=N ) + return gcd[N-1].depth; + + d = n - k; + return gcd[k-1].depth + d*(gcd[k].depth - gcd[k-1].depth); +} + +void output_stats(stats_t *stats) +{ + // Calculate average insert size and standard deviation (from the main bulk data only) + int isize, ibulk=0; + uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0; + for (isize=0; isize<stats->nisize; isize++) + { + // Each pair was counted twice + stats->isize_inward[isize] *= 0.5; + stats->isize_outward[isize] *= 0.5; + stats->isize_other[isize] *= 0.5; + + nisize_inward += stats->isize_inward[isize]; + nisize_outward += stats->isize_outward[isize]; + nisize_other += stats->isize_other[isize]; + nisize += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]; + } + + double bulk=0, avg_isize=0, sd_isize=0; + for (isize=0; isize<stats->nisize; isize++) + { + bulk += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]; + avg_isize += isize * (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]); + + if ( bulk/nisize > stats->isize_main_bulk ) + { + ibulk = isize+1; + nisize = bulk; + break; + } + } + avg_isize /= nisize ? 
nisize : 1; + for (isize=1; isize<ibulk; isize++) + sd_isize += (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]) * (isize-avg_isize)*(isize-avg_isize) / nisize; + sd_isize = sqrt(sd_isize); + + + printf("# This file was produced by bamcheck (%s)\n",BAMCHECK_VERSION); + printf("# The command line was: %s",stats->argv[0]); + int i; + for (i=1; i<stats->argc; i++) + printf(" %s",stats->argv[i]); + printf("\n"); + printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n"); + printf("SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd)); + printf("SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered); + printf("SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd)); + printf("SN\tis paired:\t%d\n", stats->nreads_1st&&stats->nreads_2nd ? 1 : 0); + printf("SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0); + printf("SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st); + printf("SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd); + printf("SN\treads mapped:\t%ld\n", (long)(stats->nreads_paired+stats->nreads_unpaired)); + printf("SN\treads unmapped:\t%ld\n", (long)stats->nreads_unmapped); + printf("SN\treads unpaired:\t%ld\n", (long)stats->nreads_unpaired); + printf("SN\treads paired:\t%ld\n", (long)stats->nreads_paired); + printf("SN\treads duplicated:\t%ld\n", (long)stats->nreads_dup); + printf("SN\treads MQ0:\t%ld\n", (long)stats->nreads_mq0); + printf("SN\treads QC failed:\t%ld\n", (long)stats->nreads_QCfailed); + printf("SN\tnon-primary alignments:\t%ld\n", (long)stats->nreads_secondary); + printf("SN\ttotal length:\t%ld\n", (long)stats->total_len); + printf("SN\tbases mapped:\t%ld\n", (long)stats->nbases_mapped); + printf("SN\tbases mapped (cigar):\t%ld\n", (long)stats->nbases_mapped_cigar); + printf("SN\tbases trimmed:\t%ld\n", (long)stats->nbases_trimmed); + printf("SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup); + 
printf("SN\tmismatches:\t%ld\n", (long)stats->nmismatches); + printf("SN\terror rate:\t%e\n", (float)stats->nmismatches/stats->nbases_mapped_cigar); + float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0; + printf("SN\taverage length:\t%.0f\n", avg_read_length); + printf("SN\tmaximum length:\t%d\n", stats->max_len); + printf("SN\taverage quality:\t%.1f\n", stats->total_len?stats->sum_qual/stats->total_len:0); + printf("SN\tinsert size average:\t%.1f\n", avg_isize); + printf("SN\tinsert size standard deviation:\t%.1f\n", sd_isize); + printf("SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward); + printf("SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward); + printf("SN\tpairs with other orientation:\t%ld\n", (long)nisize_other); + printf("SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2); + + int ibase,iqual; + if ( stats->max_len<stats->nbases ) stats->max_len++; + if ( stats->max_qual+1<stats->nquals ) stats->max_qual++; + printf("# First Fragment Qualitites. Use `grep ^FFQ | cut -f 2-` to extract this part.\n"); + printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n"); + for (ibase=0; ibase<stats->max_len; ibase++) + { + printf("FFQ\t%d",ibase+1); + for (iqual=0; iqual<=stats->max_qual; iqual++) + { + printf("\t%ld", (long)stats->quals_1st[ibase*stats->nquals+iqual]); + } + printf("\n"); + } + printf("# Last Fragment Qualitites. Use `grep ^LFQ | cut -f 2-` to extract this part.\n"); + printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n"); + for (ibase=0; ibase<stats->max_len; ibase++) + { + printf("LFQ\t%d",ibase+1); + for (iqual=0; iqual<=stats->max_qual; iqual++) + { + printf("\t%ld", (long)stats->quals_2nd[ibase*stats->nquals+iqual]); + } + printf("\n"); + } + if ( stats->mpc_buf ) + { + printf("# Mismatches per cycle and quality. 
Use `grep ^MPC | cut -f 2-` to extract this part.\n"); + printf("# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n"); + printf("# is the number of N's and the rest is the number of mismatches\n"); + for (ibase=0; ibase<stats->max_len; ibase++) + { + printf("MPC\t%d",ibase+1); + for (iqual=0; iqual<=stats->max_qual; iqual++) + { + printf("\t%ld", (long)stats->mpc_buf[ibase*stats->nquals+iqual]); + } + printf("\n"); + } + } + printf("# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n"); + int ibase_prev = 0; + for (ibase=0; ibase<stats->ngc; ibase++) + { + if ( stats->gc_1st[ibase]==stats->gc_1st[ibase_prev] ) continue; + printf("GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_1st[ibase_prev]); + ibase_prev = ibase; + } + printf("# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n"); + ibase_prev = 0; + for (ibase=0; ibase<stats->ngc; ibase++) + { + if ( stats->gc_2nd[ibase]==stats->gc_2nd[ibase_prev] ) continue; + printf("GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_2nd[ibase_prev]); + ibase_prev = ibase; + } + printf("# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle, and A,C,G,T counts [%%]\n"); + for (ibase=0; ibase<stats->max_len; ibase++) + { + uint64_t *ptr = &(stats->acgt_cycles[ibase*4]); + uint64_t sum = ptr[0]+ptr[1]+ptr[2]+ptr[3]; + if ( ! sum ) continue; + printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum); + } + printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. 
The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n"); + for (isize=0; isize<ibulk; isize++) + printf("IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, (long)(stats->isize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]), + (long)stats->isize_inward[isize], (long)stats->isize_outward[isize], (long)stats->isize_other[isize]); + + printf("# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n"); + int ilen; + for (ilen=0; ilen<stats->max_len; ilen++) + { + if ( stats->read_lengths[ilen]>0 ) + printf("RL\t%d\t%ld\n", ilen, (long)stats->read_lengths[ilen]); + } + + printf("# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n"); + for (ilen=0; ilen<stats->nindels; ilen++) + { + if ( stats->insertions[ilen]>0 || stats->deletions[ilen]>0 ) + printf("ID\t%d\t%ld\t%ld\n", ilen+1, (long)stats->insertions[ilen], (long)stats->deletions[ilen]); + } + + printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n"); + for (ilen=0; ilen<=stats->nbases; ilen++) + { + // For deletions we print the index of the cycle before the deleted base (1-based) and for insertions + // the index of the cycle of the first inserted base (also 1-based) + if ( stats->ins_cycles_1st[ilen]>0 || stats->ins_cycles_2nd[ilen]>0 || stats->del_cycles_1st[ilen]>0 || stats->del_cycles_2nd[ilen]>0 ) + printf("IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]); + } + + printf("# Coverage distribution. 
Use `grep ^COV | cut -f 2-` to extract this part.\n"); + if ( stats->cov[0] ) + printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]); + int icov; + for (icov=1; icov<stats->ncov-1; icov++) + if ( stats->cov[icov] ) + printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]); + if ( stats->cov[stats->ncov-1] ) + printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]); + + // Calculate average GC content, then sort by GC and depth + printf("# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n"); + uint32_t igcd; + for (igcd=0; igcd<stats->igcd; igcd++) + { + if ( stats->fai ) + stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc); + else + if ( stats->gcd[igcd].depth ) + stats->gcd[igcd].gc = round(100. 
* stats->gcd[igcd].gc / stats->gcd[igcd].depth); + } + qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp); + igcd = 0; + while ( igcd < stats->igcd ) + { + // Calculate percentiles (10,25,50,75,90th) for the current GC content and print + uint32_t nbins=0, itmp=igcd; + float gc = stats->gcd[igcd].gc; + while ( itmp<stats->igcd && fabs(stats->gcd[itmp].gc-gc)<0.1 ) + { + nbins++; + itmp++; + } + printf("GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1), + gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->gcd_bin_size, + gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->gcd_bin_size, + gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->gcd_bin_size, + gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->gcd_bin_size, + gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->gcd_bin_size + ); + igcd += nbins; + } +} + +size_t mygetline(char **line, size_t *n, FILE *fp) +{ + if (line == NULL || n == NULL || fp == NULL) + { + errno = EINVAL; + return -1; + } + if (*n==0 || !*line) + { + *line = NULL; + *n = 0; + } + + size_t nread=0; + int c; + while ((c=getc(fp))!= EOF && c!='\n') + { + if ( ++nread>=*n ) + { + *n += 255; + *line = realloc(*line, sizeof(char)*(*n)); + } + (*line)[nread-1] = c; + } + if ( nread>=*n ) + { + *n += 255; + *line = realloc(*line, sizeof(char)*(*n)); + } + (*line)[nread] = 0; + return nread>0 ? 
nread : -1; + +} + +void init_regions(stats_t *stats, char *file) +{ + khiter_t iter; + khash_t(kh_bam_tid) *header_hash; + + bam_init_header_hash(stats->sam->header); + header_hash = (khash_t(kh_bam_tid)*)stats->sam->header->hash; + + FILE *fp = fopen(file,"r"); + if ( !fp ) error("%s: %s\n",file,strerror(errno)); + + char *line = NULL; + size_t len = 0; + ssize_t nread; + int warned = 0; + int prev_tid=-1, prev_pos=-1; + while ((nread = mygetline(&line, &len, fp)) != -1) + { + if ( line[0] == '#' ) continue; + + int i = 0; + while ( i<nread && !isspace(line[i]) ) i++; + if ( i>=nread ) error("Could not parse the file: %s [%s]\n", file,line); + line[i] = 0; + + iter = kh_get(kh_bam_tid, header_hash, line); + int tid = kh_val(header_hash, iter); + if ( iter == kh_end(header_hash) ) + { + if ( !warned ) + fprintf(stderr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". This message is printed only once.\n", line); + warned = 1; + continue; + } + + if ( tid >= stats->nregions ) + { + stats->regions = realloc(stats->regions,sizeof(regions_t)*(stats->nregions+100)); + int j; + for (j=stats->nregions; j<stats->nregions+100; j++) + { + stats->regions[j].npos = stats->regions[j].mpos = stats->regions[j].cpos = 0; + stats->regions[j].pos = NULL; + } + stats->nregions += 100; + } + int npos = stats->regions[tid].npos; + if ( npos >= stats->regions[tid].mpos ) + { + stats->regions[tid].mpos += 1000; + stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos); + } + + if ( (sscanf(line+i+1,"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n"); + if ( prev_tid==-1 || prev_tid!=tid ) + { + prev_tid = tid; + prev_pos = stats->regions[tid].pos[npos].from; + } + if ( prev_pos>stats->regions[tid].pos[npos].from ) + error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line,stats->regions[tid].pos[npos].from,prev_pos); + 
stats->regions[tid].npos++; + } + if (line) free(line); + if ( !stats->regions ) error("Unable to map the -t sequences to the BAM sequences.\n"); + fclose(fp); +} + +void destroy_regions(stats_t *stats) +{ + int i; + for (i=0; i<stats->nregions; i++) + { + if ( !stats->regions[i].mpos ) continue; + free(stats->regions[i].pos); + } + if ( stats->regions ) free(stats->regions); +} + +static int fetch_read(const bam1_t *bam_line, void *data) +{ + collect_stats((bam1_t*)bam_line,(stats_t*)data); + return 1; +} + +void reset_regions(stats_t *stats) +{ + int i; + for (i=0; i<stats->nregions; i++) + stats->regions[i].cpos = 0; +} + +int is_in_regions(bam1_t *bam_line, stats_t *stats) +{ + if ( !stats->regions ) return 1; + + if ( bam_line->core.tid >= stats->nregions || bam_line->core.tid<0 ) return 0; + if ( !stats->is_sorted ) error("The BAM must be sorted in order for -t to work.\n"); + + regions_t *reg = &stats->regions[bam_line->core.tid]; + if ( reg->cpos==reg->npos ) return 0; // done for this chr + + // Find a matching interval or skip this read. No splicing of reads is done, no indels or soft clips considered, + // even small overlap is enough to include the read in the stats. 
+ int i = reg->cpos; + while ( i<reg->npos && reg->pos[i].to<=bam_line->core.pos ) i++; + if ( i>=reg->npos ) { reg->cpos = reg->npos; return 0; } + if ( bam_line->core.pos + bam_line->core.l_qseq + 1 < reg->pos[i].from ) return 0; + reg->cpos = i; + stats->reg_from = reg->pos[i].from; + stats->reg_to = reg->pos[i].to; + + return 1; +} + +void init_group_id(stats_t *stats, char *id) +{ + if ( !stats->sam->header->dict ) + stats->sam->header->dict = sam_header_parse2(stats->sam->header->text); + void *iter = stats->sam->header->dict; + const char *key, *val; + int n = 0; + stats->rg_hash = kh_init(kh_rg); + while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) ) + { + if ( !strcmp(id,key) || (val && !strcmp(id,val)) ) + { + khiter_t k = kh_get(kh_rg, stats->rg_hash, key); + if ( k != kh_end(stats->rg_hash) ) + fprintf(stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key); + int ret; + k = kh_put(kh_rg, stats->rg_hash, key, &ret); + kh_value(stats->rg_hash, k) = val; + n++; + } + } + if ( !n ) + error("The sample or read group \"%s\" not present.\n", id); +} + + +void error(const char *format, ...) +{ + if ( !format ) + { + printf("Version: %s\n", BAMCHECK_VERSION); + printf("About: The program collects statistics from BAM files. 
The output can be visualized using plot-bamcheck.\n"); + printf("Usage: bamcheck [OPTIONS] file.bam\n"); + printf(" bamcheck [OPTIONS] file.bam chr:from-to\n"); + printf("Options:\n"); + printf(" -c, --coverage <int>,<int>,<int> Coverage distribution min,max,step [1,1000,1]\n"); + printf(" -d, --remove-dups Exlude from statistics reads marked as duplicates\n"); + printf(" -f, --required-flag <int> Required flag, 0 for unset [0]\n"); + printf(" -F, --filtering-flag <int> Filtering flag, 0 for unset [0]\n"); + printf(" --GC-depth <float,float> Bin size for GC-depth graph and the maximum reference length [2e4,4.2e9]\n"); + printf(" -h, --help This help message\n"); + printf(" -i, --insert-size <int> Maximum insert size [8000]\n"); + printf(" -I, --id <string> Include only listed read group or sample name\n"); + printf(" -l, --read-length <int> Include in the statistics only reads with the given read length []\n"); + printf(" -m, --most-inserts <float> Report only the main part of inserts [0.99]\n"); + printf(" -q, --trim-quality <int> The BWA trimming parameter [0]\n"); + printf(" -r, --ref-seq <file> Reference sequence (required for GC-depth calculation).\n"); + printf(" -t, --target-regions <file> Do stats in these regions only. 
Tab-delimited file chr,from,to, 1-based, inclusive.\n"); + printf(" -s, --sam Input is SAM\n"); + printf("\n"); + } + else + { + va_list ap; + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); + } + exit(-1); +} + +int main(int argc, char *argv[]) +{ + char *targets = NULL; + char *bam_fname = NULL; + char *group_id = NULL; + samfile_t *sam = NULL; + char in_mode[5]; + + stats_t *stats = calloc(1,sizeof(stats_t)); + stats->ngc = 200; + stats->nquals = 256; + stats->nbases = 300; + stats->nisize = 8000; + stats->max_len = 30; + stats->max_qual = 40; + stats->isize_main_bulk = 0.99; // There are always outliers at the far end + stats->gcd_bin_size = 20e3; + stats->gcd_ref_size = 4.2e9; + stats->rseq_pos = -1; + stats->tid = stats->gcd_pos = -1; + stats->igcd = 0; + stats->is_sorted = 1; + stats->cov_min = 1; + stats->cov_max = 1000; + stats->cov_step = 1; + stats->argc = argc; + stats->argv = argv; + stats->filter_readlen = -1; + stats->nindels = stats->nbases; + + strcpy(in_mode, "rb"); + + static struct option loptions[] = + { + {"help",0,0,'h'}, + {"remove-dups",0,0,'d'}, + {"sam",0,0,'s'}, + {"ref-seq",1,0,'r'}, + {"coverage",1,0,'c'}, + {"read-length",1,0,'l'}, + {"insert-size",1,0,'i'}, + {"most-inserts",1,0,'m'}, + {"trim-quality",1,0,'q'}, + {"target-regions",0,0,'t'}, + {"required-flag",1,0,'f'}, + {"filtering-flag",0,0,'F'}, + {"id",1,0,'I'}, + {"GC-depth",1,0,1}, + {0,0,0,0} + }; + int opt; + while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:f:F:I:1:",loptions,NULL))>0 ) + { + switch (opt) + { + case 'f': stats->flag_require=strtol(optarg,0,0); break; + case 'F': stats->flag_filter=strtol(optarg,0,0); break; + case 'd': stats->flag_filter|=BAM_FDUP; break; + case 's': strcpy(in_mode, "r"); break; + case 'r': stats->fai = fai_load(optarg); + if (stats->fai==0) + error("Could not load faidx: %s\n", optarg); + break; + case 1 : { + float flen,fbin; + if ( sscanf(optarg,"%f,%f",&fbin,&flen)!= 2 ) + error("Unable to parse --GC-depth %s\n", 
optarg); + stats->gcd_bin_size = fbin; + stats->gcd_ref_size = flen; + } + break; + case 'c': if ( sscanf(optarg,"%d,%d,%d",&stats->cov_min,&stats->cov_max,&stats->cov_step)!= 3 ) + error("Unable to parse -c %s\n", optarg); + break; + case 'l': stats->filter_readlen = atoi(optarg); break; + case 'i': stats->nisize = atoi(optarg); break; + case 'm': stats->isize_main_bulk = atof(optarg); break; + case 'q': stats->trim_qual = atoi(optarg); break; + case 't': targets = optarg; break; + case 'I': group_id = optarg; break; + case '?': + case 'h': error(NULL); + default: error("Unknown argument: %s\n", optarg); + } + } + if ( optind<argc ) + bam_fname = argv[optind++]; + + if ( !bam_fname ) + { + if ( isatty(fileno((FILE *)stdin)) ) + error(NULL); + bam_fname = "-"; + } + + // Init structures + // .. coverage bins and round buffer + if ( stats->cov_step > stats->cov_max - stats->cov_min + 1 ) + { + stats->cov_step = stats->cov_max - stats->cov_min; + if ( stats->cov_step <= 0 ) + stats->cov_step = 1; + } + stats->ncov = 3 + (stats->cov_max-stats->cov_min) / stats->cov_step; + stats->cov_max = stats->cov_min + ((stats->cov_max-stats->cov_min)/stats->cov_step +1)*stats->cov_step - 1; + stats->cov = calloc(sizeof(uint64_t),stats->ncov); + stats->cov_rbuf.size = stats->nbases*5; + stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size); + // .. bam + if ((sam = samopen(bam_fname, in_mode, NULL)) == 0) + error("Failed to open: %s\n", bam_fname); + stats->sam = sam; + if ( group_id ) init_group_id(stats, group_id); + bam1_t *bam_line = bam_init1(); + // .. 
arrays + stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); + stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); + stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t)); + stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t)); + stats->isize_inward = calloc(stats->nisize,sizeof(uint64_t)); + stats->isize_outward = calloc(stats->nisize,sizeof(uint64_t)); + stats->isize_other = calloc(stats->nisize,sizeof(uint64_t)); + stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t)); + stats->mpc_buf = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL; + stats->acgt_cycles = calloc(4*stats->nbases,sizeof(uint64_t)); + stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t)); + stats->insertions = calloc(stats->nbases,sizeof(uint64_t)); + stats->deletions = calloc(stats->nbases,sizeof(uint64_t)); + stats->ins_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t)); + stats->ins_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t)); + stats->del_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t)); + stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t)); + realloc_rseq_buffer(stats); + if ( targets ) + init_regions(stats, targets); + + // Collect statistics + if ( optind<argc ) + { + // Collect stats in selected regions only + bam_index_t *bam_idx = bam_index_load(bam_fname); + if (bam_idx == 0) + error("Random alignment retrieval only works for indexed BAM files.\n"); + + int i; + for (i=optind; i<argc; i++) + { + int tid, beg, end; + bam_parse_region(stats->sam->header, argv[i], &tid, &beg, &end); + if ( tid < 0 ) continue; + reset_regions(stats); + bam_fetch(stats->sam->x.bam, bam_idx, tid, beg, end, stats, fetch_read); + } + bam_index_destroy(bam_idx); + } + else + { + // Stream through the entire BAM ignoring off-target regions if -t is given + while (samread(sam,bam_line) >= 0) + collect_stats(bam_line,stats); + } + round_buffer_flush(stats,-1); + + output_stats(stats); + + bam_destroy1(bam_line); + 
samclose(stats->sam); + if (stats->fai) fai_destroy(stats->fai); + free(stats->cov_rbuf.buffer); free(stats->cov); + free(stats->quals_1st); free(stats->quals_2nd); + free(stats->gc_1st); free(stats->gc_2nd); + free(stats->isize_inward); free(stats->isize_outward); free(stats->isize_other); + free(stats->gcd); + free(stats->rseq_buf); + free(stats->mpc_buf); + free(stats->acgt_cycles); + free(stats->read_lengths); + free(stats->insertions); + free(stats->deletions); + free(stats->ins_cycles_1st); + free(stats->ins_cycles_2nd); + free(stats->del_cycles_1st); + free(stats->del_cycles_2nd); + destroy_regions(stats); + free(stats); + if ( stats->rg_hash ) kh_destroy(kh_rg, stats->rg_hash); + + return 0; +} + + +
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/misc/blast2sam.pl Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,92 @@
#!/usr/bin/perl -w

use strict;
use warnings;
use Getopt::Std;

&blast2sam;

# Read BLASTN text output from the files named on the command line (or from
# standard input) and print one SAM record per alignment block.
# Option -s also copies the aligned query sequence into the SEQ column.
sub blast2sam {
  my %opts = ();
  getopts('s', \%opts);
  die("Usage: blast2sam.pl <in.blastn>\n") if (-t STDIN && @ARGV == 0);
  my ($qlen, $slen, $q, $s, $qbeg, $qend, @sam, @cigar, @cmaux, $show_seq);
  # True once at least one Query/Sbjct pair of the current alignment block has
  # been folded into @cmaux.  @cigar alone cannot serve as this marker: it
  # stays empty for an ungapped alignment that starts at query position 1.
  my $pending = 0;
  $show_seq = defined($opts{s});
  @sam = (); @sam[0,4,6..8,10] = ('', 255, '*', 0, 0, '*');

  # Print the alignment collected so far, if there is one.
  my $flush = sub {
    return unless ($pending);
    &blast_print_sam(\@sam, \@cigar, \@cmaux, $qlen - $qend);
    @cigar = ();
    $pending = 0;
  };

  while (<>) {
    # A new query, a new subject or a new alignment block ends the current
    # alignment.  Flushing on the '>' line as well keeps the record's RNAME
    # from being overwritten by the next subject before it is printed.
    $flush->() if (/^Query=/ || /^>/ || /Score =.*bits.*Expect/);
    if (/^Query= (\S+)/) {
      $sam[0] = $1;
    } elsif (/\((\S+)\s+letters\)/) {
      $qlen = $1; $qlen =~ s/,//g;
    } elsif (/^>(\S+)/) {
      $sam[2] = $1;
    } elsif (/Length = (\d+)/) {
      $slen = $1;
    } elsif (/Score =\s+(\S+) bits.+Expect(\(\d+\))? = (\S+)/) { # the start of an alignment block
      my ($as, $ev) = (int($1 + .499), $3);
      $ev = "1$ev" if ($ev =~ /^e/);
      @sam[1,3,9,11,12] = (0, 0, '', "AS:i:$as", "EV:Z:$ev");
      @cigar = (); $qbeg = 0;
      @cmaux = (0, 0, 0, '');
    } elsif (/Strand = (\S+) \/ (\S+)/) {
      $sam[1] |= 0x10 if ($2 eq 'Minus');
    } elsif (/Query\:\s(\d+)\s*(\S+)\s(\d+)/) {
      $q = $2;
      unless ($qbeg) {
        $qbeg = $1;
        # query bases before the alignment become a leading hard clip
        push(@cigar, ($1-1) . "H") if ($1 > 1);
      }
      $qend = $3;
      if ($show_seq) {
        my $x = $q;
        $x =~ s/-//g; $sam[9] .= $x;
      }
    } elsif (/Sbjct\:\s(\d+)\s*(\S+)\s(\d+)/) {
      $s = $2;
      if ($sam[1] & 0x10) {
        $sam[3] = $3;
      } else {
        $sam[3] = $1 unless ($sam[3]);
      }
      &aln2cm(\@cigar, \$q, \$s, \@cmaux);
      $pending = 1;
    }
  }
  $flush->();
}

# Finish the CIGAR of one alignment and print its SAM line.
#   $sam    - array ref holding the SAM columns collected so far
#   $cigar  - array ref of CIGAR pieces already closed
#   $cmaux  - array ref: [0] current operation index into "MDI", [1] its length
#   $qrest  - number of query bases after the alignment (trailing hard clip)
sub blast_print_sam {
  my ($sam, $cigar, $cmaux, $qrest) = @_;
  push(@$cigar, $cmaux->[1] . substr("MDI", $cmaux->[0], 1));
  push(@$cigar, $qrest . 'H') if ($qrest);
  if ($sam->[1] & 0x10) {
    # minus strand: report CIGAR and sequence on the forward strand
    @$cigar = reverse(@$cigar);
    $sam->[9] = reverse($sam->[9]);
    $sam->[9] =~ tr/atgcrymkswATGCRYMKSW/tacgyrkmswTACGYRKMSW/;
  }
  $sam->[9] = '*' if (!$sam->[9]);
  $sam->[5] = join('', @$cigar);
  print join("\t", @$sam), "\n";
}

# Fold one pair of aligned Query/Sbjct strings into the running CIGAR state.
# A '-' in the query is counted as operation 2 ('I' in "MDI"), a '-' in the
# subject as operation 1 ('D'), anything else as operation 0 ('M').
sub aln2cm {
  my ($cigar, $q, $s, $cmaux) = @_;
  my $l = length($$q);
  for (my $i = 0; $i < $l; ++$i) {
    my $op;
    # set $op
    if (substr($$q, $i, 1) eq '-') { $op = 2; }
    elsif (substr($$s, $i, 1) eq '-') { $op = 1; }
    else { $op = 0; }
    # for CIGAR
    if ($cmaux->[0] == $op) {
      ++$cmaux->[1];
    } else {
      push(@$cigar, $cmaux->[1] . substr("MDI", $cmaux->[0], 1));
      $cmaux->[0] = $op; $cmaux->[1] = 1;
    }
  }
}
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/misc/bowtie2sam.pl Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,92 @@
#!/usr/bin/perl -w

# Contact: lh3
# Version: 0.1.1

use strict;
use warnings;
use Getopt::Std;

&bowtie2sam;
exit;

# Read bowtie alignments from the files named on the command line (or from
# standard input) and print one SAM record per read: consecutive lines with
# the same read name are treated as multiple hits and only the best is kept.
sub bowtie2sam {
  die("Usage: bowtie2sam.pl <aln.bowtie>\n") if (@ARGV == 0 && -t STDIN);
  # core loop
  my (@s, $last, @staging, $k, $best_s, $subbest_s, $best_k);
  my $have_group = 0; # true once at least one input line has been staged
  $last = '';

  # Print the best hit of the group staged so far, lowering its mapping
  # quality when the runner-up is as good (0) or one mismatch worse (<= 15).
  my $emit_best = sub {
    if ($best_s == $subbest_s) {
      $staging[$best_k][4] = 0;
    } elsif ($subbest_s - $best_s == 1) {
      $staging[$best_k][4] = 15 if ($staging[$best_k][4] > 15);
    }
    print join("\t", @{$staging[$best_k]}), "\n";
  };

  while (<>) {
    my ($name, $nm) = &bowtie2sam_aux($_, \@s); # read_name, number of mismatches
    if ($have_group && $name eq $last) {
      # I do not know whether the multiple hits are ordered on the
      # number of mismatches. I assume they are not and so I have to
      # keep all these multiple hits in memory.
      @{$staging[$k]} = @s;
      if ($best_s > $nm) {
        $subbest_s = $best_s;
        $best_s = $nm;
        $best_k = $k;
      } elsif ($subbest_s > $nm) {
        $subbest_s = $nm;
      }
      ++$k;
    } else {
      $emit_best->() if ($have_group);
      $k = 1; $best_s = $nm; $subbest_s = 1000; $best_k = 0;
      @{$staging[0]} = @s;
      $last = $name;
      $have_group = 1;
    }
  }
  # The last group goes through the same mapping-quality adjustment as every
  # other group; nothing is printed when the input was empty.
  $emit_best->() if ($have_group);
}

# Convert one bowtie output line into SAM columns.
#   $line - the raw input line
#   $s    - array ref that receives the SAM columns (emptied first)
# Returns (read name as it appears in the input, value used as mismatch count).
sub bowtie2sam_aux {
  my ($line, $s) = @_;
  chomp($line);
  my @t = split("\t", $line);
  my $ret;
  @$s = ();
  # read name
  $s->[0] = $ret = $t[0];
  $s->[0] =~ s/\/[12]$//g;
  # initial flag (will be updated later)
  $s->[1] = 0;
  # read & quality
  $s->[9] = $t[4]; $s->[10] = $t[5];
  # cigar
  $s->[5] = length($s->[9]) . "M";
  # coor
  $s->[2] = $t[2]; $s->[3] = $t[3] + 1;
  $s->[1] |= 0x10 if ($t[1] eq '-');
  # mapQ
  $s->[4] = $t[6] == 0? 25 : 0;
  # mate coordinate
  $s->[6] = '*'; $s->[7] = $s->[8] = 0;
  # aux
  # NOTE(review): this is the number of fields beyond the seventh, i.e. 0 or 1,
  # not the number of mismatch descriptors inside field 8 - confirm that this
  # is the intended value for NM.
  my $nm = @t - 7;
  push(@$s, "NM:i:" . (@t-7));
  push(@$s, "X$nm:i:" . ($t[6]+1));
  my $md = '';
  if ($t[7]) {
    $_ = $t[7];
    my $a = 0;
    while (/(\d+):[ACGTN]>([ACGTN])/gi) {
      my ($y, $z) = ($1, $2);
      $md .= (int($y)-$a) . $z;
      $a += $y - $a + 1;
    }
    $md .= length($s->[9]) - $a;
  } else {
    $md = length($s->[9]);
  }
  push(@$s, "MD:Z:$md");
  return ($ret, $nm);
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/misc/export2sam.pl Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,545 @@ +#!/usr/bin/env perl +# +# +# export2sam.pl converts GERALD export files to SAM format. +# +# +# +########## License: +# +# The MIT License +# +# Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd. +# Modified SAMtools work copyright (c) 2010 Illumina, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# +# +# +########## ChangeLog: +# +# Version: 2.3.1 (18MAR2011) +# +# - Restore file '-' as stdin input. +# +# Version: 2.3.0 (24JAN2011) +# +# - Add support for export reserved chromosome name "CONTROL", +# which is translated to optional field "XC:Z:CONTROL". +# - Check for ".gz" file extension on export files and open +# these as gzip pipes when the extension is found. 
+# +# Version: 2.2.0 (16NOV2010) +# +# - Remove any leading zeros in export fields: RUNNO,LANE,TILE,X,Y +# - For export records with reserved chromosome name identifiers +# "QC" and "RM", add the optional field "XC:Z:QC" or "XC:Z:RM" +# to the SAM record, so that these cases can be distinguished +# from other unmatched reads. +# +# Version: 2.1.0 (21SEP2010) +# +# - Additional export record error checking. +# - Convert export records with chromosome value of "RM" to unmapped +# SAM records. +# +# Version: 2.0.0 (15FEB2010) +# +# Script updated by Illumina in conjunction with CASAVA 1.7.0 +# release. +# +# Major changes are as follows: +# - The CIGAR string has been updated to include all gaps from +# ELANDv2 alignments. +# - The ELAND single read alignment score is always stored in the +# optional "SM" field and the ELAND paired read alignment score +# is stored in the optional "AS" field when it exists. +# - The MAPQ value is set to the higher of the two alignment scores, +# but no greater than 254, i.e. min(254,max(SM,AS)) +# - The SAM "proper pair" bit (0x0002) is now set for read pairs +# meeting ELAND's expected orientation and insert size criteria. +# - The default quality score translation is set for export files +# which contain Phred+64 quality values. An option, +# "--qlogodds", has been added to translate quality values from +# the Solexa+64 format used in export files prior to Pipeline +# 1.3 +# - The export match descriptor is now reverse-complemented when +# necessary such that it always corresponds to the forward +# strand of the reference, to be consistent with other +# information in the SAM record. It is now written to the +# optional 'XD' field (rather than 'MD') to acknowledge its +# minor differences from the samtools match descriptor (see +# additional detail below). +# - An option, "--nofilter", has been added to include reads which +# have failed primary analysis quality filtration. 
Such reads +# will have the corresponding SAM flag bit (0x0200) set. +# - Labels in the export 'contig' field are preserved by setting +# RNAME to "$export_chromosome/$export_contig" when the contig +# label exists. +# +# +# Contact: lh3 +# Version: 0.1.2 (03JAN2009) +# +# +# +########## Known Conversion Limitations: +# +# - Export records for reads that map to a position < 1 (allowed +# in export format), are converted to unmapped reads in the SAM +# record. +# - Export records contain the reserved chromosome names: "NM", +# "QC","RM" and "CONTROL". "NM" indicates that the aligner could +# not map the read to the reference sequence set. "QC" means that +# the aligner did not attempt to map the read due to some +# technical limitation. "RM" means that the read mapped to a set +# of 'contaminant' sequences specified in GERALD's RNA-seq +# workflow. "CONTROL" means that the read is a control. All of +# these alignment types are collapsed to the single unmapped +# alignment state in the SAM record, but the optional SAM "XC" +# field is used to record the original reserved chromosome name of +# the read for all but the "NM" case. +# - The export match descriptor is slightly different than the +# samtools match descriptor. For this reason it is stored in the +# optional SAM field 'XD' (and not 'MD'). Note that the export +# match descriptor differs from the samtools version in two +# respects: (1) indels are explicitly closed with the '$' +# character and (2) insertions must be enumerated in the match +# descriptor. 
# For example a 35-base read with a two-base insertion
# is described as: 20^2$14
#
#
#

my $version = "2.3.1";

use strict;
use warnings;

use Getopt::Long;
use File::Spec;
use List::Util qw(min max);


# Column indices of a GERALD export record.
use constant {
  EXPORT_MACHINE => 0,
  EXPORT_RUNNO => 1,
  EXPORT_LANE => 2,
  EXPORT_TILE => 3,
  EXPORT_X => 4,
  EXPORT_Y => 5,
  EXPORT_INDEX => 6,
  EXPORT_READNO => 7,
  EXPORT_READ => 8,
  EXPORT_QUAL => 9,
  EXPORT_CHROM => 10,
  EXPORT_CONTIG => 11,
  EXPORT_POS => 12,
  EXPORT_STRAND => 13,
  EXPORT_MD => 14,
  EXPORT_SEMAP => 15,
  EXPORT_PEMAP => 16,
  EXPORT_PASSFILT => 21,
  EXPORT_SIZE => 22,
};


# Column indices of the mandatory SAM fields.
use constant {
  SAM_QNAME => 0,
  SAM_FLAG => 1,
  SAM_RNAME => 2,
  SAM_POS => 3,
  SAM_MAPQ => 4,
  SAM_CIGAR => 5,
  SAM_MRNM => 6,
  SAM_MPOS => 7,
  SAM_ISIZE => 8,
  SAM_SEQ => 9,
  SAM_QUAL => 10,
};


# function prototypes for Richard's code
sub match_desc_to_cigar($);
sub match_desc_frag_length($);
sub reverse_compl_match_descriptor($);
sub write_header($;$;$);


&export2sam;
exit;




# Parse the command line, open the read1 (and optionally read2) export files
# and print the SAM header followed by one SAM record per accepted read.
sub export2sam {

  my $cmdline = $0 . " " . join(" ",@ARGV);
  my $arg_count = scalar @ARGV;
  my $progname = (File::Spec->splitpath($0))[2];

  my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. "solexa") quality values
  my $is_nofilter = 0;
  my $read1file;
  my $read2file;
  my $print_version = 0;
  my $help = 0;

  my $result = GetOptions( "qlogodds" => \$is_logodds_qvals,
                           "nofilter" => \$is_nofilter,
                           "read1=s" => \$read1file,
                           "read2=s" => \$read2file,
                           "version" => \$print_version,
                           "help" => \$help );

  my $usage = <<END;

$progname converts GERALD export files to SAM format.

Usage: $progname --read1=FILENAME [ options ] | --version | --help

 --read1=FILENAME read1 export file or '-' for stdin (mandatory)
 (file may be gzipped with ".gz" extension)
 --read2=FILENAME read2 export file or '-' for stdin
 (file may be gzipped with ".gz" extension)
 --nofilter include reads that failed the basecaller
 purity filter
 --qlogodds assume export file(s) use logodds quality values
 as reported by OLB (Pipeline) prior to v1.3
 (default: phred quality values)

END

  my $version_msg = <<END;

$progname version: $version

END

  if((not $result) or $help or ($arg_count==0)) {
    die($usage);
  }

  if(@ARGV) {
    print STDERR "\nERROR: Unrecognized arguments: " . join(" ",@ARGV) . "\n\n";
    die($usage);
  }

  if($print_version) {
    die($version_msg);
  }

  if(not defined($read1file)) {
    print STDERR "\nERROR: read1 export file must be specified\n\n";
    die($usage);
  }

  unless((-f $read1file) or ($read1file eq '-')) {
    die("\nERROR: Can't find read1 export file: '$read1file'\n\n");
  }

  if (defined $read2file) {
    unless((-f $read2file) or ($read2file eq '-')) {
      die("\nERROR: Can't find read2 export file: '$read2file'\n\n");
    }
    if($read1file eq $read2file) {
      die("\nERROR: read1 and read2 export filenames are the same: '$read1file'\n\n");
    }
  }

  my $fh1 = open_export_file($read1file, 'read1');
  my $is_paired = defined $read2file;
  my $fh2;
  $fh2 = open_export_file($read2file, 'read2') if ($is_paired);

  # quality value conversion table
  my @conv_table;
  if($is_logodds_qvals){ # convert from solexa+64 quality values (pipeline pre-v1.3):
    for (-64..64) {
      $conv_table[$_+64] = int(33 + 10*log(1+10**($_/10.0))/log(10)+.499);
    }
  } else { # convert from phred+64 quality values (pipeline v1.3+):
    for (-64..-1) {
      $conv_table[$_+64] = undef;
    }
    for (0..64) {
      $conv_table[$_+64] = int(33 + $_);
    }
  }
  # write the header
  print write_header( $progname, $version, $cmdline );
  # core loop
  my $export_line_count = 0;
  while (<$fh1>) {
    $export_line_count++;
    my (@s1, @s2);
    &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter);
    if ($is_paired) {
      my $read2line = <$fh2>;
      if(not $read2line){
        die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read1 file at line no: $export_line_count.\n\n");
      }
      &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter);

      if (@s1 && @s2) { # then set mate coordinate
        if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){
          die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n Read1: $_ Read2: $read2line\n");
        }

        my $isize = 0;
        if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize
          my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS];
          my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS];
          $isize = $x2 - $x1;
        }

        foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){
          my ($sa,$sb,$is) = @{$_};
          if ($sb->[SAM_RNAME] ne '*') {
            $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? "=" : $sb->[SAM_RNAME];
            $sa->[SAM_MPOS] = $sb->[SAM_POS];
            $sa->[SAM_ISIZE] = $is;
            $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10);
          } else {
            $sa->[SAM_FLAG] |= 0x8;
          }
        }
      }
    }
    print join("\t", @s1), "\n" if (@s1);
    print join("\t", @s2), "\n" if (@s2 && $is_paired);
  }
  close($fh1);
  if($is_paired) {
    while(my $read2line = <$fh2>){
      $export_line_count++;
      die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read2 file at line no: $export_line_count.\n\n");
    }
    close($fh2);
  }
}

# Open one export file for reading and return the file handle.
#   $file  - path of the export file, or '-' for standard input; a path ending
#            in ".gz" is read through "gzip -dc"
#   $label - "read1" or "read2", used in the error message
# The file name is always passed as data (three-argument open, list-form pipe
# open), never as part of an open mode or a shell command line, so characters
# such as '>', '|', ';' or spaces in a name cannot change what is opened or run.
sub open_export_file {
  my ($file, $label) = @_;
  my ($fh, $ok);
  my $desc = $file; # what the error message shows
  if ($file eq '-') {
    $ok = open($fh, '<&', \*STDIN);
  } elsif ($file =~ /\.gz$/) {
    $desc = "gzip -dc $file |";
    $ok = open($fh, '-|', 'gzip', '-dc', '--', $file);
  } else {
    $ok = open($fh, '<', $file);
  }
  $ok or die("\nERROR: Can't open $label process: '$desc'\n\n");
  return $fh;
}

# Convert one export record into SAM columns.
#   $line        - raw export line
#   $line_no     - its line number (for error messages)
#   $s           - array ref that receives the SAM columns; left empty when the
#                  read failed the filter and --nofilter was not given
#   $ct          - array ref: quality conversion table indexed by byte value
#   $is_paired   - true when a read2 file is being processed as well
#   $read_no     - 1 or 2: which export file the line came from
#   $is_nofilter - true to keep reads that failed the purity filter
sub export2sam_aux {
  my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_;
  chomp($line);
  my @t = split("\t", $line);
  if(scalar(@t) < EXPORT_SIZE) {
    my $msg="\nERROR: Unexpected number of fields in export record on line $line_no of read$read_no export file. Found " . scalar(@t) . " fields but expected " . EXPORT_SIZE . ".\n";
    $msg.="\t...erroneous export record:\n" . $line . "\n\n";
    die($msg);
  }
  @$s = ();
  my $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y');
  return if(not ($isPassFilt or $is_nofilter));
  # read name
  my $samQnamePrefix = $t[EXPORT_MACHINE] . (($t[EXPORT_RUNNO] ne "") ? "_" . int($t[EXPORT_RUNNO]) : "");
  $s->[SAM_QNAME] = join(':', $samQnamePrefix, int($t[EXPORT_LANE]), int($t[EXPORT_TILE]),
                         int($t[EXPORT_X]), int($t[EXPORT_Y]));
  # initial flag (will be updated later)
  $s->[SAM_FLAG] = 0;
  if($is_paired) {
    if($t[EXPORT_READNO] != $read_no){
      die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . " on line: $line_no\n\n");
    }
    # paired bit plus 0x40 for read 1 or 0x80 for read 2
    $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no);
  }
  $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt);

  # read & quality
  my $is_export_rev = ($t[EXPORT_STRAND] eq 'R');
  if ($is_export_rev) { # then reverse the sequence and quality
    $s->[SAM_SEQ] = reverse($t[EXPORT_READ]);
    $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/;
    $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]);
  } else {
    $s->[SAM_SEQ] = $t[EXPORT_READ];
    $s->[SAM_QUAL] = $t[EXPORT_QUAL];
  }
  my @convqual = ();
  foreach (unpack('C*', $s->[SAM_QUAL])){
    my $val=$ct->[$_];
    if(not defined $val){
      my $msg="\nERROR: can't interpret export quality value: " . $_ . " in read$read_no export file, line: $line_no\n";
      if( $_ < 64 ) { $msg .= " Use --qlogodds flag to translate logodds (solexa) quality values.\n"; }
      die($msg . "\n");
    }
    push @convqual,$val;
  }

  $s->[SAM_QUAL] = pack('C*',@convqual); # change coding


  # coor
  my $has_coor = 0;
  $s->[SAM_RNAME] = "*";
  if (($t[EXPORT_CHROM] eq 'NM') or
      ($t[EXPORT_CHROM] eq 'QC') or
      ($t[EXPORT_CHROM] eq 'RM') or
      ($t[EXPORT_CHROM] eq 'CONTROL')) {
    $s->[SAM_FLAG] |= 0x4; # unmapped
    push(@$s,"XC:Z:".$t[EXPORT_CHROM]) if($t[EXPORT_CHROM] ne 'NM');
  } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) {
    $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case?
    push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3")
  } elsif ($t[EXPORT_POS] < 1) {
    $s->[SAM_FLAG] |= 0x4; # unmapped
  } else {
    $s->[SAM_RNAME] = $t[EXPORT_CHROM];
    $s->[SAM_RNAME] .= "/" . $t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne '');
    $has_coor = 1;
  }
  $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0;

#  print STDERR "t[14] = " . $t[14] . "\n";
  my $matchDesc = '';
  $s->[SAM_CIGAR] = "*";
  if($has_coor){
    $matchDesc = ($is_export_rev) ? reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD];

    if($matchDesc =~ /\^/){
      # construct CIGAR string using Richard's function
      $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing
    } else {
      $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M";
    }
  }

#  print STDERR "cigar_string = $cigar_string\n";

  $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev);
  if($has_coor){
    my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0;
    my $pemap = 0;
    if($is_paired) {
      $pemap = ($t[EXPORT_PEMAP] ne '') ? $t[EXPORT_PEMAP] : 0;

      # set `proper pair' bit if non-blank, non-zero PE alignment score:
      $s->[SAM_FLAG] |= 0x02 if ($pemap > 0);
    }
    $s->[SAM_MAPQ] = min(254,max($semap,$pemap));
  } else {
    $s->[SAM_MAPQ] = 0;
  }
  # mate coordinate
  $s->[SAM_MRNM] = '*';
  $s->[SAM_MPOS] = 0;
  $s->[SAM_ISIZE] = 0;
  # aux
  push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]);
  if($has_coor){
    # The export match descriptor differs slightly from the samtools match descriptor.
    # In order for the converted SAM files to be as compliant as possible,
    # we put the export match descriptor in optional field 'XD' rather than 'MD':
    push(@$s, "XD:Z:$matchDesc");
    push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne '');
    push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne ''));
  }
}



#
# the following code is taken from Richard Shaw's sorted2sam.pl file
#

# Return the match descriptor rewritten for the opposite strand: the string is
# reversed, bases are complemented, '^' and '$' are swapped, and the digits of
# each number are put back in their original order.
sub reverse_compl_match_descriptor($)
{
#  print "\nREVERSING THE MATCH DESCRIPTOR!\n";
  my ($match_desc) = @_;
  my $rev_compl_match_desc = reverse($match_desc);
  $rev_compl_match_desc =~ tr/ACGT\^\$/TGCA\$\^/;

  # Unreverse the digits of numbers.
  $rev_compl_match_desc = join('',
                               map {($_ =~ /\d+/)
                                      ? join('', reverse(split('', $_)))
                                      : $_} split(/(\d+)/,
                                                  $rev_compl_match_desc));

  return $rev_compl_match_desc;
}



# Build a CIGAR string from an export match descriptor containing indels:
# "^BASES$" becomes a deletion of that many bases, "^N$" an insertion of N
# bases, and everything in between a match of the corresponding length.
sub match_desc_to_cigar($)
{
  my ($match_desc) = @_;

  my @match_desc_parts = split(/(\^.*?\$)/, $match_desc);
  my $cigar_str = '';
  my $cigar_del_ch = 'D';
  my $cigar_ins_ch = 'I';
  my $cigar_match_ch = 'M';

  foreach my $match_desc_part (@match_desc_parts) {
    next if (!$match_desc_part);

    if ($match_desc_part =~ /^\^([ACGTN]+)\$$/) {
      # Deletion
      $cigar_str .= (length($1) . $cigar_del_ch);
    } elsif ($match_desc_part =~ /^\^(\d+)\$$/) {
      # Insertion
      $cigar_str .= ($1 . $cigar_ins_ch);
    } else {
      $cigar_str .= (match_desc_frag_length($match_desc_part)
                     . $cigar_match_ch);
    }
  }

  return $cigar_str;
}


#------------------------------------------------------------------------------

# Return the number of read bases covered by an indel-free match descriptor
# fragment: numbers count as that many bases, runs of letters as their length.
sub match_desc_frag_length($)
{
  my ($match_desc_str) = @_;
  my $len = 0;

  my @match_desc_fields = split(/([ACGTN]+)/, $match_desc_str);

  foreach my $match_desc_field (@match_desc_fields) {
    next if ($match_desc_field eq '');

    $len += (($match_desc_field =~ /(\d+)/)
             ? $1 : length($match_desc_field));
  }

  return $len;
}


# argument holds the command line
# Return the SAM @PG header line for this program, version and command line.
sub write_header($;$;$)
{
  my ($progname,$version,$cl) = @_;
  my $complete_header = "";
  $complete_header .= "\@PG\tID:$progname\tVN:$version\tCL:$cl\n";

  return $complete_header;
}
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/misc/interpolate_sam.pl Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,125 @@
#!/usr/bin/perl
use strict;

###Builds interpolated pileup from SAM file
##@description counts bases between paired ends and piles up single end reads.
##@output, uses a #header for the RNAME and then the number of reads per base
##@author sm8@sanger.ac.uk, Stephen B. Montgomery

##@caveats
##Requires RNAME to have format as per example
##      chromosome:NCBI36:18:1:76117153:1
##      supercontig::NT_113883:1:137703:1
##      clone::AC138827.3:1:149397:1
##Expects simple CIGAR characters, M, I and D
##Expects SAM file to be sorted.
##Expects 0x0010 to mark second read in PE file (as has been the observed case from MAQ output) (important for line 77)

##Verify and read in SAM file
my $sam_file = $ARGV[0];
if(!defined($sam_file)) { die("No sam file defined on arg 1"); }
unless(-f $sam_file) { die("Sam file does not exist: $sam_file"); }
##Three-argument open: the file name is never interpreted as an open mode
open(my $sam_fh, '<', $sam_file) || die("Cannot open sam file");

##Globals
my $current_location = ""; ##Current RNAME being processed
my $current_size = 0; ##Size of sequence region being processed
my $current_position = 1; ##Current base being processed
my $open = 0; ##Number of open reads (PE reads that have not been closed)
my %close = (); ##Hash of closing positions, when the current_position gets to this position it subtracts the
                ##contained value from those open and deletes the indexed position from the hash

while (my $line = <$sam_fh>) {
    ##SAM header lines (@HD, @SQ, ...) are not alignments; without this check
    ##their third column would be taken for an RNAME and open a bogus region
    next if ($line =~ /^@/);

    my @tokens = split /\t/, $line;

    if ($current_location ne $tokens[2]) { ##Start a new sequence region
        for (my $i = $current_position; $i <= $current_size; $i++) { ##Close the previous sequence region
            if (defined($close{$i})) {
                $open = $open - $close{$i};
                delete $close{$i};
            }
            print $open . "\n";
        }
        if ($current_location ne "") {
            print "\n";
        }

        ##Initiate a new sequence region
        my @location_tokens = split /:/, $tokens[2];
        $current_position = 1;
        $current_location = $tokens[2];
        $current_size = $location_tokens[4];
        $open = 0;
        %close = ();
        print "#" . $tokens[2] . "\n";

        ##Print pileup to just before the first read (will be 0)
        for (my $current_position = 1; $current_position < $tokens[3]; $current_position++) {
            print $open . "\n";
        }
        $current_position = $tokens[3];

    } else { ##Sequence region already open
        if ($tokens[3] > $current_position) { ##If the new read's position is greater than the current position
                                              ##cycle through to catch up to the current position
            for (my $i = $current_position; $i < $tokens[3]; $i++) {
                if (defined($close{$i})) {
                    $open = $open - $close{$i};
                    delete $close{$i};
                }
                print $open . "\n";
            }
            $current_position = $tokens[3];
        }
    }
    $open++; ##Increment the number of open reads

    if (($tokens[1] & 0x0080 || $tokens[1] & 0x0040) && $tokens[1] & 0x0010 && $tokens[1] & 0x0002) { ##if second read of mate pair, add close condition
        $open--;
        my $parsed_cig = &parseCigar($tokens[5]);
        my $seq_region_end = $tokens[3] + $parsed_cig->{'M'} + $parsed_cig->{'D'} - 1;
        if (!defined($close{$seq_region_end + 1})) { $close{$seq_region_end + 1} = 0; }
        $close{$seq_region_end + 1} = $close{$seq_region_end + 1} + 1;
    } elsif (!($tokens[1] & 0x0001) || !($tokens[1] & 0x0002)) { ##if unpaired, add close condition
        my $parsed_cig = &parseCigar($tokens[5]);
        my $seq_region_end = $tokens[3] + $parsed_cig->{'M'} + $parsed_cig->{'D'} - 1;
        if (!defined($close{$seq_region_end + 1})) { $close{$seq_region_end + 1} = 0; }
        $close{$seq_region_end + 1} = $close{$seq_region_end + 1} + 1;
    } else {
        #do nothing
    }
}
for (my $i = $current_position; $i <= $current_size; $i++) { ##Finish up the last sequence region
    if (defined($close{$i})) {
        $open = $open - $close{$i};
        delete $close{$i};
    }
    print $open . "\n";
}
print "\n";
close($sam_fh);
exit(0);

##reads and tokenizes simple cigarline
##Takes the CIGAR string and returns a hash ref with the summed lengths under
##'M', 'I' and 'D' (any other operation letter is counted as 'M') and the list
##of individual operations under 'events'.
sub parseCigar {
    my $cigar_line = shift;
    $cigar_line =~ s/([0-9]*[A-Z]{1})/$1\t/g;
    my @cigar_tokens = split /\t/, $cigar_line;
    my %parsed = ('M' => 0,
                  'I' => 0,
                  'D' => 0);
    my @events = ();
    for(my $i = 0; $i < scalar(@cigar_tokens); $i++) {
        if ($cigar_tokens[$i] =~ /([0-9]+)([A-Z]{1})/g) {
            if (!defined($parsed{$2})) { $parsed{$2} = 0; }
            my $nt = $2;
            if ($nt ne "M" && $nt ne "D" && $nt ne "I") { $nt = "M"; }
            $parsed{$nt} += $1;
            my %event_el = ("t" => $nt,
                            "n" => $1);
            push @events, \%event_el;
        }
    }
    $parsed{'events'} = \@events;
    return \%parsed;
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/misc/maq2sam.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,173 @@ +#include <string.h> +#include <zlib.h> +#include <stdio.h> +#include <inttypes.h> +#include <stdlib.h> +#include <assert.h> + +#define PACKAGE_VERSION "r439" + +//#define MAQ_LONGREADS + +#ifdef MAQ_LONGREADS +# define MAX_READLEN 128 +#else +# define MAX_READLEN 64 +#endif + +#define MAX_NAMELEN 36 +#define MAQMAP_FORMAT_OLD 0 +#define MAQMAP_FORMAT_NEW -1 + +#define PAIRFLAG_FF 0x01 +#define PAIRFLAG_FR 0x02 +#define PAIRFLAG_RF 0x04 +#define PAIRFLAG_RR 0x08 +#define PAIRFLAG_PAIRED 0x10 +#define PAIRFLAG_DIFFCHR 0x20 +#define PAIRFLAG_NOMATCH 0x40 +#define PAIRFLAG_SW 0x80 + +typedef struct +{ + uint8_t seq[MAX_READLEN]; /* the last base is the single-end mapping quality. */ + uint8_t size, map_qual, info1, info2, c[2], flag, alt_qual; + uint32_t seqid, pos; + int dist; + char name[MAX_NAMELEN]; +} maqmap1_t; + +typedef struct +{ + int format, n_ref; + char **ref_name; + uint64_t n_mapped_reads; + maqmap1_t *mapped_reads; +} maqmap_t; + +maqmap_t *maq_new_maqmap() +{ + maqmap_t *mm = (maqmap_t*)calloc(1, sizeof(maqmap_t)); + mm->format = MAQMAP_FORMAT_NEW; + return mm; +} +void maq_delete_maqmap(maqmap_t *mm) +{ + int i; + if (mm == 0) return; + for (i = 0; i < mm->n_ref; ++i) + free(mm->ref_name[i]); + free(mm->ref_name); + free(mm->mapped_reads); + free(mm); +} +maqmap_t *maqmap_read_header(gzFile fp) +{ + maqmap_t *mm; + int k, len; + mm = maq_new_maqmap(); + gzread(fp, &mm->format, sizeof(int)); + if (mm->format != MAQMAP_FORMAT_NEW) { + if (mm->format > 0) { + fprintf(stderr, "** Obsolete map format is detected. 
Please use 'mapass2maq' command to convert the format.\n"); + exit(3); + } + assert(mm->format == MAQMAP_FORMAT_NEW); + } + gzread(fp, &mm->n_ref, sizeof(int)); + mm->ref_name = (char**)calloc(mm->n_ref, sizeof(char*)); + for (k = 0; k != mm->n_ref; ++k) { + gzread(fp, &len, sizeof(int)); + mm->ref_name[k] = (char*)malloc(len * sizeof(char)); + gzread(fp, mm->ref_name[k], len); + } + /* read number of mapped reads */ + gzread(fp, &mm->n_mapped_reads, sizeof(uint64_t)); + return mm; +} + +void maq2tam_core(gzFile fp, const char *rg) +{ + maqmap_t *mm; + maqmap1_t mm1, *m1; + int ret; + m1 = &mm1; + mm = maqmap_read_header(fp); + while ((ret = gzread(fp, m1, sizeof(maqmap1_t))) == sizeof(maqmap1_t)) { + int j, flag = 0, se_mapq = m1->seq[MAX_READLEN-1]; + if (m1->flag) flag |= 1; + if ((m1->flag&PAIRFLAG_PAIRED) || ((m1->flag&PAIRFLAG_SW) && m1->flag != 192)) flag |= 2; + if (m1->flag == 192) flag |= 4; + if (m1->flag == 64) flag |= 8; + if (m1->pos&1) flag |= 0x10; + if ((flag&1) && m1->dist != 0) { + int c; + if (m1->dist > 0) { + if (m1->flag&(PAIRFLAG_FF|PAIRFLAG_RF)) c = 0; + else if (m1->flag&(PAIRFLAG_FR|PAIRFLAG_RR)) c = 1; + else c = m1->pos&1; + } else { + if (m1->flag&(PAIRFLAG_FF|PAIRFLAG_FR)) c = 0; + else if (m1->flag&(PAIRFLAG_RF|PAIRFLAG_RR)) c = 1; + else c = m1->pos&1; + } + if (c) flag |= 0x20; + } + if (m1->flag) { + int l = strlen(m1->name); + if (m1->name[l-2] == '/') { + flag |= (m1->name[l-1] == '1')? 
0x40 : 0x80; + m1->name[l-2] = '\0'; + } + } + printf("%s\t%d\t", m1->name, flag); + printf("%s\t%d\t", mm->ref_name[m1->seqid], (m1->pos>>1)+1); + if (m1->flag == 130) { + int c = (int8_t)m1->seq[MAX_READLEN-1]; + printf("%d\t", m1->alt_qual); + if (c == 0) printf("%dM\t", m1->size); + else { + if (c > 0) printf("%dM%dI%dM\t", m1->map_qual, c, m1->size - m1->map_qual - c); + else printf("%dM%dD%dM\t", m1->map_qual, -c, m1->size - m1->map_qual); + } + se_mapq = 0; // zero SE mapQ for reads aligned by SW + } else { + if (flag&4) printf("0\t*\t"); + else printf("%d\t%dM\t", m1->map_qual, m1->size); + } + printf("*\t0\t%d\t", m1->dist); + for (j = 0; j != m1->size; ++j) { + if (m1->seq[j] == 0) putchar('N'); + else putchar("ACGT"[m1->seq[j]>>6&3]); + } + putchar('\t'); + for (j = 0; j != m1->size; ++j) + putchar((m1->seq[j]&0x3f) + 33); + putchar('\t'); + if (rg) printf("RG:Z:%s\t", rg); + if (flag&4) { // unmapped + printf("MF:i:%d\n", m1->flag); + } else { + printf("MF:i:%d\t", m1->flag); + if (m1->flag) printf("AM:i:%d\tSM:i:%d\t", m1->alt_qual, se_mapq); + printf("NM:i:%d\tUQ:i:%d\tH0:i:%d\tH1:i:%d\n", m1->info1&0xf, m1->info2, m1->c[0], m1->c[1]); + } + } + if (ret > 0) + fprintf(stderr, "Truncated! Continue anyway.\n"); + maq_delete_maqmap(mm); +} + +int main(int argc, char *argv[]) +{ + gzFile fp; + if (argc == 1) { + fprintf(stderr, "Version: %s\n", PACKAGE_VERSION); + fprintf(stderr, "Usage: maq2sam <in.map> [<readGroup>]\n"); + return 1; + } + fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); + maq2tam_core(fp, argc > 2? argv[2] : 0); + gzclose(fp); + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/misc/md5.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,296 @@ +/* + * This code implements the MD5 message-digest algorithm. + * The algorithm is due to Ron Rivest. This code was + * written by Colin Plumb in 1993, no copyright is claimed. + * This code is in the public domain; do with it what you wish. + * + * Equivalent code is available from RSA Data Security, Inc. + * This code has been tested against that, and is equivalent, + * except that you don't need to include two pages of legalese + * with every copy. + * + * To compute the message digest of a chunk of bytes, declare an + * MD5Context structure, pass it to MD5Init, call MD5Update as + * needed on buffers full of bytes, and then call MD5Final, which + * will fill a supplied 16-byte array with the digest. + */ + +/* Brutally hacked by John Walker back from ANSI C to K&R (no + prototypes) to maintain the tradition that Netfone will compile + with Sun's original "cc". */ + +#include <string.h> +#include "md5.h" + +#ifndef HIGHFIRST +#define byteReverse(buf, len) /* Nothing */ +#else +/* + * Note: this code is harmless on little-endian machines. + */ +void byteReverse(buf, longs) + unsigned char *buf; unsigned longs; +{ + uint32_t t; + do { + t = (uint32_t) ((unsigned) buf[3] << 8 | buf[2]) << 16 | + ((unsigned) buf[1] << 8 | buf[0]); + *(uint32_t *) buf = t; + buf += 4; + } while (--longs); +} +#endif + +void MD5Transform(uint32_t buf[4], uint32_t in[16]); + + +/* + * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious + * initialization constants. + */ +void MD5Init(ctx) + struct MD5Context *ctx; +{ + ctx->buf[0] = 0x67452301; + ctx->buf[1] = 0xefcdab89; + ctx->buf[2] = 0x98badcfe; + ctx->buf[3] = 0x10325476; + + ctx->bits[0] = 0; + ctx->bits[1] = 0; +} + +/* + * Update context to reflect the concatenation of another buffer full + * of bytes. 
+ */ +void MD5Update(ctx, buf, len) + struct MD5Context *ctx; unsigned char *buf; unsigned len; +{ + uint32_t t; + + /* Update bitcount */ + + t = ctx->bits[0]; + if ((ctx->bits[0] = t + ((uint32_t) len << 3)) < t) + ctx->bits[1]++; /* Carry from low to high */ + ctx->bits[1] += len >> 29; + + t = (t >> 3) & 0x3f; /* Bytes already in shsInfo->data */ + + /* Handle any leading odd-sized chunks */ + + if (t) { + unsigned char *p = (unsigned char *) ctx->in + t; + + t = 64 - t; + if (len < t) { + memcpy(p, buf, len); + return; + } + memcpy(p, buf, t); + byteReverse(ctx->in, 16); + MD5Transform(ctx->buf, (uint32_t *) ctx->in); + buf += t; + len -= t; + } + /* Process data in 64-byte chunks */ + + while (len >= 64) { + memcpy(ctx->in, buf, 64); + byteReverse(ctx->in, 16); + MD5Transform(ctx->buf, (uint32_t *) ctx->in); + buf += 64; + len -= 64; + } + + /* Handle any remaining bytes of data. */ + + memcpy(ctx->in, buf, len); +} + +/* + * Final wrapup - pad to 64-byte boundary with the bit pattern + * 1 0* (64-bit count of bits processed, MSB-first) + */ +void MD5Final(digest, ctx) + unsigned char digest[16]; struct MD5Context *ctx; +{ + unsigned count; + unsigned char *p; + + /* Compute number of bytes mod 64 */ + count = (ctx->bits[0] >> 3) & 0x3F; + + /* Set the first char of padding to 0x80. 
This is safe since there is + always at least one byte free */ + p = ctx->in + count; + *p++ = 0x80; + + /* Bytes of padding needed to make 64 bytes */ + count = 64 - 1 - count; + + /* Pad out to 56 mod 64 */ + if (count < 8) { + /* Two lots of padding: Pad the first block to 64 bytes */ + memset(p, 0, count); + byteReverse(ctx->in, 16); + MD5Transform(ctx->buf, (uint32_t *) ctx->in); + + /* Now fill the next block with 56 bytes */ + memset(ctx->in, 0, 56); + } else { + /* Pad block to 56 bytes */ + memset(p, 0, count - 8); + } + byteReverse(ctx->in, 14); + + /* Append length in bits and transform */ + ((uint32_t *) ctx->in)[14] = ctx->bits[0]; + ((uint32_t *) ctx->in)[15] = ctx->bits[1]; + + MD5Transform(ctx->buf, (uint32_t *) ctx->in); + byteReverse((unsigned char *) ctx->buf, 4); + memcpy(digest, ctx->buf, 16); + memset(ctx, 0, sizeof(ctx)); /* In case it's sensitive */ +} + + +/* The four core functions - F1 is optimized somewhat */ + +/* #define F1(x, y, z) (x & y | ~x & z) */ +#define F1(x, y, z) (z ^ (x & (y ^ z))) +#define F2(x, y, z) F1(z, x, y) +#define F3(x, y, z) (x ^ y ^ z) +#define F4(x, y, z) (y ^ (x | ~z)) + +/* This is the central step in the MD5 algorithm. */ +#define MD5STEP(f, w, x, y, z, data, s) \ + ( w += f(x, y, z) + data, w = w<<s | w>>(32-s), w += x ) + +/* + * The core of the MD5 algorithm, this alters an existing MD5 hash to + * reflect the addition of 16 longwords of new data. MD5Update blocks + * the data and converts bytes into longwords for this routine. 
+ */ +void MD5Transform(buf, in) + uint32_t buf[4]; uint32_t in[16]; +{ + register uint32_t a, b, c, d; + + a = buf[0]; + b = buf[1]; + c = buf[2]; + d = buf[3]; + + MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7); + MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12); + MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17); + MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22); + MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7); + MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12); + MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17); + MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22); + MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7); + MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12); + MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17); + MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22); + MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7); + MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12); + MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17); + MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22); + + MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5); + MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9); + MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14); + MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20); + MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5); + MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9); + MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14); + MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20); + MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5); + MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9); + MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14); + MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20); + MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5); + MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9); + MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14); + MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20); + + MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4); + MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11); + MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16); + MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 
23); + MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4); + MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11); + MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16); + MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23); + MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4); + MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11); + MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16); + MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23); + MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4); + MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11); + MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16); + MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23); + + MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6); + MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10); + MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15); + MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21); + MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6); + MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10); + MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15); + MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21); + MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6); + MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10); + MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15); + MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21); + MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6); + MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10); + MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15); + MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21); + + buf[0] += a; + buf[1] += b; + buf[2] += c; + buf[3] += d; +} + +/* lh3: the following code is added by me */ + +#ifdef MD5SUM_MAIN +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#define HEX_STR "0123456789abcdef" + +static void md5_one(const char *fn) +{ + unsigned char buf[4096], digest[16]; + MD5_CTX md5; + int l; + FILE *fp; + + fp = strcmp(fn, "-")? 
fopen(fn, "r") : stdin; + if (fp == 0) { + fprintf(stderr, "md5sum: %s: No such file or directory\n", fn); + exit(1); + } + MD5Init(&md5); + while ((l = fread(buf, 1, 4096, fp)) > 0) + MD5Update(&md5, buf, l); + MD5Final(digest, &md5); + if (fp != stdin) fclose(fp); + for (l = 0; l < 16; ++l) + printf("%c%c", HEX_STR[digest[l]>>4&0xf], HEX_STR[digest[l]&0xf]); + printf(" %s\n", fn); +} +int main(int argc, char *argv[]) +{ + int i; + if (argc == 1) md5_one("-"); + else for (i = 1; i < argc; ++i) md5_one(argv[i]); + return 0; +} +#endif
/* File: PsiCLASS-1.0.2/samtools-0.1.19/misc/md5.h */
/*
  This file is adapted from a program in this page:

  http://www.fourmilab.ch/md5/

  The original source code does not work on 64-bit machines due to the
  wrong typedef "uint32". I also added prototypes.

  -lh3
 */

#ifndef MD5_H
#define MD5_H

#include <stdint.h>

/*  Byte-order selection.

    HIGHFIRST is defined by default, which makes md5.c compile and call
    its byteReverse function (required on big-endian machines).  That
    function is written so that it is an identity operation on a
    little-endian machine, so leaving HIGHFIRST defined there costs
    only time.  When the compiler announces a little-endian target via
    __LITTLE_ENDIAN__, HIGHFIRST is removed again and byteReverse
    becomes an empty macro.  New platforms therefore work regardless
    of their byte order.  */

#define HIGHFIRST

#if __LITTLE_ENDIAN__
#  undef HIGHFIRST
#endif

/* Running state of one MD5 computation.  MD5_CTX is an alias for the
   same type (needed to make RSAREF happy on some MS-DOS compilers). */
typedef struct MD5Context {
    uint32_t buf[4];        /* the four chaining words set up by MD5Init */
    uint32_t bits[2];       /* message length in bits, low word first */
    unsigned char in[64];   /* current, possibly partial, input block */
} MD5_CTX;

/* Initialise ctx for a new digest. */
void MD5Init(struct MD5Context *ctx);

/* Feed len bytes from buf into the digest. */
void MD5Update(struct MD5Context *ctx, unsigned char *buf, unsigned len);

/* Finish the computation and store the 16-byte result in digest. */
void MD5Final(unsigned char digest[16], struct MD5Context *ctx);

/*  Define CHECK_HARDWARE_PROPERTIES to have main.c verify
    byte order and uint32_t settings.  */
#define CHECK_HARDWARE_PROPERTIES

#endif /* !MD5_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/misc/md5fa.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,58 @@ +#include <stdio.h> +#include <zlib.h> +#include "md5.h" +#include "kseq.h" + +#define HEX_STR "0123456789abcdef" + +KSEQ_INIT(gzFile, gzread) + +static void md5_one(const char *fn) +{ + MD5_CTX md5_one, md5_all; + int l, i, k; + gzFile fp; + kseq_t *seq; + unsigned char unordered[16], digest[16]; + + for (l = 0; l < 16; ++l) unordered[l] = 0; + fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) { + fprintf(stderr, "md5fa: %s: No such file or directory\n", fn); + exit(1); + } + + MD5Init(&md5_all); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + for (i = k = 0; i < seq->seq.l; ++i) { + if (islower(seq->seq.s[i])) seq->seq.s[k++] = toupper(seq->seq.s[i]); + else if (isupper(seq->seq.s[i])) seq->seq.s[k++] = seq->seq.s[i]; + } + MD5Init(&md5_one); + MD5Update(&md5_one, (unsigned char*)seq->seq.s, k); + MD5Final(digest, &md5_one); + for (l = 0; l < 16; ++l) { + printf("%c%c", HEX_STR[digest[l]>>4&0xf], HEX_STR[digest[l]&0xf]); + unordered[l] ^= digest[l]; + } + printf(" %s %s\n", fn, seq->name.s); + MD5Update(&md5_all, (unsigned char*)seq->seq.s, k); + } + MD5Final(digest, &md5_all); + kseq_destroy(seq); + for (l = 0; l < 16; ++l) + printf("%c%c", HEX_STR[digest[l]>>4&0xf], HEX_STR[digest[l]&0xf]); + printf(" %s >ordered\n", fn); + for (l = 0; l < 16; ++l) + printf("%c%c", HEX_STR[unordered[l]>>4&0xf], HEX_STR[unordered[l]&0xf]); + printf(" %s >unordered\n", fn); +} + +int main(int argc, char *argv[]) +{ + int i; + if (argc == 1) md5_one("-"); + else for (i = 1; i < argc; ++i) md5_one(argv[i]); + return 0; +}
#!/usr/bin/perl -w

# File: PsiCLASS-1.0.2/samtools-0.1.19/misc/novo2sam.pl
#
# Convert Novoalign text output to SAM-style lines on stdout.
# Usage: novo2sam.pl [-p] <aln.novo>

# Contact: lh3
# Version: 0.1.3

#Modified by Zayed Albertyn(zayed.albertyn@gmail.com) & Colin Hercus(colin@novocraft.com)

#use strict;
#use warnings;
use Data::Dumper;
use Getopt::Std;

&novo2sam;
exit;

# mating($s1, $s2)
# Fill in the mate fields (columns 6..8) and the mate-related flag bits of
# two records that carry the same read name.  Both arguments are array
# references laid out like a SAM line and are modified in place.
sub mating {
  my ($s1, $s2) = @_;
  my $isize = 0;
  if ($s1->[2] ne '*' && $s1->[2] eq $s2->[2]) { # then calculate $isize
	my $x1 = ($s1->[1] & 0x10)? $s1->[3] + length($s1->[9]) : $s1->[3];
	my $x2 = ($s2->[1] & 0x10)? $s2->[3] + length($s2->[9]) : $s2->[3];
	$isize = $x2 - $x1;
  }
  # update mate coordinate
  if ($s2->[2] ne '*') {
	@$s1[6..8] = (($s2->[2] eq $s1->[2])? "=" : $s2->[2], $s2->[3], $isize);
	$s1->[1] |= 0x20 if ($s2->[1] & 0x10);
  } else {
	$s1->[1] |= 0x8;
  }
  if ($s1->[2] ne '*') {
	@$s2[6..8] = (($s1->[2] eq $s2->[2])? "=" : $s1->[2], $s1->[3], -$isize);
	$s2->[1] |= 0x20 if ($s1->[1] & 0x10);
  } else {
	$s2->[1] |= 0x8;
  }
}

# novo2sam()
# Main loop: parse each input line into a record, pair it with the
# previous record when the read names match, and print the records.
sub novo2sam {
  my %opts = ();
  getopts("p", \%opts);
  die("Usage: novo2sam.pl [-p] <aln.novo>\n") if (@ARGV == 0);
  my $is_paired = defined($opts{p});
  # core loop
  my @s1 = ();
  my @s2 = ();
  my ($s_last, $s_curr) = (\@s1, \@s2);
  while (<>) {
	next if (/^#/);
	next if (/(QC|NM)\s*$/ || /(R\s+\d+)\s*$/);
	&novo2sam_aux($_, $s_curr, $is_paired);
	if (@$s_last != 0 && $s_last->[0] eq $s_curr->[0]) {
	  &mating($s_last, $s_curr);
	  print join("\t", @$s_last), "\n";
	  print join("\t", @$s_curr), "\n";
	  @$s_last = (); @$s_curr = ();
	} else {
	  print join("\t", @$s_last), "\n" if (@$s_last != 0);
	  my $s = $s_last; $s_last = $s_curr; $s_curr = $s;
	}
  }
  print join("\t", @$s_last), "\n" if (@$s_last != 0);
}

# novo2sam_aux($line, $s, $is_paired)
# Parse one whitespace-separated input line into the array referenced by
# $s.  $s is left empty unless the fifth input column is 'U'.  Columns
# from index 13 onwards are treated as the list of variations.
sub novo2sam_aux {
  my ($line, $s, $is_paired) = @_;

  chomp($line);
  my @t = split(/\s+/, $line);
  my @variations = @t[13 .. $#t];
  @$s = ();
  return if ($t[4] ne 'U');
  my $len = length($t[2]);
  # read name
  $s->[0] = substr($t[0], 1);
  $s->[0] =~ s/\/[12]$//g;
  # initial flag (will be updated later)
  $s->[1] = 0;
  $s->[1] |= 1 | 1<<($t[1] eq 'L'? 6 : 7);
  $s->[1] |= 2 if ($t[10] eq '.');
  # read & quality
  if ($t[9] eq 'R') {
	$s->[9] = reverse($t[2]);
	$s->[10] = reverse($t[3]);
	$s->[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
  } else {
	$s->[9] = $t[2]; $s->[10] = $t[3];
  }
  # cigar
  my $cigarstring ="";
  if (scalar @variations ==0 ) {
	$s->[5] = $len . "M"; # IMPORTANT: this cigar is not correct for gapped alignment
  } else {
	#convert to correct CIGAR
	my $tmpstr = join" ",@variations ;
	if ( $tmpstr=~ /\+|\-/ ) {
	  $cigarstring = cigar_method($line,\@variations,$len);
	  $s->[5]=$cigarstring;
	} else {
	  $s->[5]=$len. "M";
	}
  }

  # coor
  $s->[2] = substr($t[7], 1); $s->[3] = $t[8];
  $s->[1] |= 0x10 if ($t[9] eq 'R');
  # mapQ
  $s->[4] = $t[5] > $t[6]? $t[5] : $t[6];
  # mate coordinate
  $s->[6] = '*'; $s->[7] = $s->[8] = 0;
  # aux
  push(@$s, "NM:i:".(@t-13));
  my $md = '';
  $md = mdtag($md,$line,\@variations,$len);
  push(@$s, "MD:Z:$md");

}

# mdtag($oldmd, $line, \@variations, $rdlen)
# Build the MD tag value from the variation tokens.  $oldmd and $line are
# accepted for the benefit of the commented-out debug output only.
sub mdtag {
  my $oldmd = shift;
  my $line = shift;
  my $ref =shift;
  my $rdlen  = shift;
  my @variations = @$ref;
  my $string="";
  my $mdtag="";
  my $t=1;           # next position expected in the variation coordinates
  my $q=1;           # next read base not yet accounted for
  my $deleteflag=0;  # true while inside a run of deleted bases
  my $len =0;
  foreach $string (@variations) {
	my ($indeltype,$insert) = indeltype($string);
	if ($indeltype eq "+") {
	  $len = length ($insert);
	  $q+=$len;
	  next;
	}
	# List-context match instead of "my $pos = $1 if ...": a conditional
	# "my" has undefined behaviour and can carry over the value of an
	# earlier iteration.
	my ($pos) = $string =~ /^(\d+)/;
	next unless defined $pos;   # token without a position: nothing to place
	$len = $pos - $t;
	if ($len !=0 || ($deleteflag eq 1 && $indeltype eq ">")) {
	  $mdtag.=$len;
	}
	$t+=$len;
	$q+=$len;
	if ($indeltype eq ">") {
	  $mdtag.=$insert;
	  $deleteflag=0;
	  $t+=1;
	  $q+=1;
	}
	if ($indeltype eq "-") {
	  my ($deletedbase) = $string =~ /\d+\-([A-Za-z]+)/;
	  $deletedbase = "" unless defined $deletedbase;
	  if ($deleteflag == 0 ) {
		$mdtag.="^";
	  }
	  $mdtag.=$deletedbase;
	  $deleteflag=1;
	  $t+=1;
	}
  }
  $len = $rdlen - $q + 1;
  if ($len > 0) {
	$mdtag.="$len";
  }
#  print "In:$line\n";
#  print "MD: OLD => NEW\nMD: $oldmd => $mdtag\n\n";

  return $mdtag;
}

# indeltype($string)
# Classify one variation token.  Returns ($type, $letters) where $type is
# ">" (letters before a '>'), "-" (token contains '-') or "+" (letters
# after a '+'); $type is undef when none of the patterns match.
sub indeltype {
  my $string = shift;
  my $insert="";
  my $indeltype;
  if ($string =~ /([A-Za-z]+)\>/) {
	$indeltype=">";
	$insert=$1;
  } elsif ($string =~ /\-/) {
	$indeltype="-";
  } elsif ($string =~ /\+([A-Za-z]+)/) {
	$indeltype="+";
	$insert=$1;
  }
  return ($indeltype,$insert);

}


# cigar_method($line, \@variations, $rdlen)
# Build a CIGAR string from the insertion/deletion tokens; tokens that
# contain '>' are ignored.  Runs of single-base D/I operations are merged
# by newcigar() at the end.
sub cigar_method {
  my $line = shift;
  my $ref =shift;
  my $rdlen  = shift;
  my @variations = @$ref;
  my $string="";
  my $type="";
  my $t =1;
  my $q=1;
  my $indeltype="";
  my $cigar=  "";
  my $insert = "";
  my $len=0;
  foreach $string (@variations) {
	next if $string =~  />/;
	# See mdtag(): avoid the undefined "my ... if" construct.
	my ($pos) = $string =~ /^(\d+)/;
	next unless defined $pos;

	if ($string =~ /\+([A-Za-z]+)/) {
	  $indeltype="+";
	  $insert = $1;
	}elsif ($string =~ /\-([A-Za-z]+)/) {
	  $indeltype="-";
	  $insert = $1;
	}
#print "$pos $indeltype $insert $t $q\n";
	$len = $pos - $t;
	if ( $len > 0) {
	  $cigar.=$len."M";
	}
	$t+=$len;
	$q+=$len;

	if ($indeltype eq "-") {
	  $cigar.="D";
	  $t++;
	}
	if ($indeltype eq "+") {
	  $len = length ($insert);
	  if ($len == 1) {
		$cigar.="I";
	  }
	  if ($len > 1) {
		$cigar.=$len."I";
	  }
	  $q+=$len;
	}
	$insert="";
  }
  $len= $rdlen - $q + 1;
  if ($len > 0) {
	$cigar.=$len."M";
  }

  $cigar = newcigar($cigar,'D');
  $cigar = newcigar($cigar,'I');

  #print "$line\n";
  #print "c CIGAR:\t$cigar\n\n";
  return $cigar;

}



# newcigar($cigar, $char)
# Replace every run of the bare letter $char in $cigar by "<run length>$char",
# e.g. "10MDD5M" with 'D' becomes "10M2D5M".
sub newcigar {
  my $cigar = shift;
  my $char = shift;
  my $new = "";
  my $copy = $cigar;
#print "$cigar\n";
  $copy =~ s/^($char+)/$1;/g;
#print "$copy\n";
  $copy =~ s/([^0-9$char])($char+)/$1;$2;/g;
#print "$copy\n";
  my @parts = split(/;/,$copy);
  my $el="";
  foreach $el (@parts) {
#print "$el\n";
	if ($el =~ /^$char+$/) {
	  $new.=length($el).$char;
	}else {
	  $new.=$el;
	}

  }
  return $new;
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/misc/plot-bamcheck Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,882 @@ +#!/usr/bin/env perl +# +# Author: petr.danecek@sanger +# + +use strict; +use warnings; +use Carp; + +my $opts = parse_params(); +parse_bamcheck($opts); +plot_qualities($opts); +plot_acgt_cycles($opts); +plot_gc($opts); +plot_gc_depth($opts); +plot_isize($opts); +plot_coverage($opts); +plot_mismatches_per_cycle($opts); +plot_indel_dist($opts); +plot_indel_cycles($opts); + +exit; + +#-------------------------------- + +sub error +{ + my (@msg) = @_; + if ( scalar @msg ) { confess @msg; } + die + "Usage: plot-bamcheck [OPTIONS] file.bam.bc\n", + " plot-bamcheck -p outdir/ file.bam.bc\n", + "Options:\n", + " -k, --keep-files Do not remove temporary files.\n", + " -p, --prefix <path> The output files prefix, add a slash to create new directory.\n", + " -r, --ref-stats <file.fa.gc> Optional reference stats file with expected GC content (created with -s).\n", + " -s, --do-ref-stats <file.fa> Calculate reference sequence GC for later use with -r\n", + " -t, --targets <file.tab> Restrict -s to the listed regions (tab-delimited chr,from,to. 1-based, inclusive)\n", + " -h, -?, --help This help message.\n", + "\n"; +} + + +sub parse_params +{ + $0 =~ s{^.+/}{}; + my $opts = { args=>join(' ',$0,@ARGV) }; + while (defined(my $arg=shift(@ARGV))) + { + if ( $arg eq '-k' || $arg eq '--keep-files' ) { $$opts{keep_files}=1; next; } + if ( $arg eq '-r' || $arg eq '--ref-stats' ) { $$opts{ref_stats}=shift(@ARGV); next; } + if ( $arg eq '-s' || $arg eq '--do-ref-stats' ) { $$opts{do_ref_stats}=shift(@ARGV); next; } + if ( $arg eq '-t' || $arg eq '--targets' ) { $$opts{targets}=shift(@ARGV); next; } + if ( $arg eq '-p' || $arg eq '--prefix' ) { $$opts{prefix}=shift(@ARGV); next; } + if ( $arg eq '-?' 
|| $arg eq '-h' || $arg eq '--help' ) { error(); } + if ( -e $arg ) { $$opts{bamcheck}=$arg; next; } + error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n"); + } + if ( exists($$opts{do_ref_stats }) ) { do_ref_stats($opts); exit; } + if ( !exists($$opts{bamcheck}) ) { error("No bamcheck file?\n") } + if ( !exists($$opts{prefix}) ) { error("Expected -p parameter.\n") } + if ( $$opts{prefix}=~m{/$} ) { `mkdir -p $$opts{prefix}`; } + elsif ( !($$opts{prefix}=~/-$/) ) { $$opts{prefix} .= '-'; } + return $opts; +} + + +# Creates GC stats for either the whole reference or only on target regions for exome QC +sub do_ref_stats +{ + my ($opts) = @_; + + + my %targets = (); + if ( exists($$opts{targets}) ) + { + my ($prev_chr,$prev_pos); + open(my $fh,'<',$$opts{targets}) or error("$$opts{targets}: $!"); + while (my $line=<$fh>) + { + if ( $line=~/^#/ ) { next; } + my ($chr,$from,$to) = split(/\s+/,$line); + chomp($to); + push @{$targets{$chr}}, $from,$to; + if ( !defined $prev_chr or $chr ne $prev_chr ) { $prev_chr=$chr; $prev_pos=$from } + if ( $prev_pos > $from ) { error("The file must be sorted: $$opts{targets}\n"); } + $prev_pos = $from; + } + close($fh); + } + + my $_len = 60; # for now do only standard fasta's with 60 bases per line + my %gc_counts = (); + my ($skip_chr,$pos,$ireg,$regions); + open(my $fh,'<',$$opts{do_ref_stats}) or error("$$opts{do_ref_stats}: $!"); + while (my $line=<$fh>) + { + if ( $line=~/^>/ ) + { + if ( !scalar %targets ) { next; } + + if ( !($line=~/>(\S+)/) ) { error("FIXME: could not determine chromosome name: $line"); } + if ( !exists($targets{$1}) ) { $skip_chr=$1; next; } + undef $skip_chr; + $pos = 0; + $ireg = 0; + $regions = $targets{$1}; + } + if ( defined $skip_chr ) { next; } + + # Only $_len sized lines are considered and no chopping for target regions. 
+ chomp($line); + my $len = length($line); + if ( $len ne $_len ) { next; } + + if ( scalar %targets ) + { + while ( $ireg<@$regions && $$regions[$ireg+1]<=$pos ) { $ireg += 2; } + $pos += $len; + if ( $ireg==@$regions ) { next; } + if ( $pos < $$regions[$ireg] ) { next; } + } + + my $gc_count = 0; + for (my $i=0; $i<$len; $i++) + { + my $base = substr($line,$i,1); + if ( $base eq 'g' || $base eq 'G' || $base eq 'c' || $base eq 'C' ) { $gc_count++; } + } + $gc_counts{$gc_count}++; + } + + print "# Generated by $$opts{args}\n"; + print "# The columns are: GC content bin, normalized frequency\n"; + my $max; + for my $count (values %gc_counts) + { + if ( !defined $max or $count>$max ) { $max=$count; } + } + for my $gc (sort {$a<=>$b} keys %gc_counts) + { + if ( $gc==0 ) { next; } + printf "%f\t%f\n", $gc*100./$_len, $gc_counts{$gc}/$max; + } +} + +sub plot +{ + my ($cmdfile) = @_; + my $cmd = "gnuplot $cmdfile"; + system($cmd); + if ( $? ) { error("The command exited with non-zero status $?:\n\t$cmd\n\n"); } +} + + +sub parse_bamcheck +{ + my ($opts) = @_; + open(my $fh,'<',$$opts{bamcheck}) or error("$$opts{bamcheck}: $!"); + my $line = <$fh>; + if ( !($line=~/^# This file was produced by bamcheck (\S+)/) ) { error("Sanity check failed: was this file generated by bamcheck?"); } + $$opts{dat}{version} = $1; + while ($line=<$fh>) + { + if ( $line=~/^#/ ) { next; } + my @items = split(/\t/,$line); + chomp($items[-1]); + if ( $items[0] eq 'SN' ) + { + $$opts{dat}{$items[1]} = splice(@items,2); + next; + } + push @{$$opts{dat}{$items[0]}}, [splice(@items,1)]; + } + close($fh); + + # Check sanity + if ( !exists($$opts{dat}{'sequences:'}) or !$$opts{dat}{'sequences:'} ) + { + error("Sanity check failed: no sequences found by bamcheck??\n"); + } +} + +sub older_than +{ + my ($opts,$version) = @_; + my ($year,$month,$day) = split(/-/,$version); + $version = $$opts{dat}{version}; + if ( !($version=~/\((\d+)-(\d+)-(\d+)\)$/) ) { return 1; } + if ( $1<$year ) { return 1; } + 
elsif ( $1>$year ) { return 0; } + if ( $2<$month ) { return 1; } + elsif ( $2>$month ) { return 0; } + if ( $3<$day ) { return 1; } + return 0; +} + +sub get_defaults +{ + my ($opts,$img_fname,%args) = @_; + + if ( !($img_fname=~/\.png$/i) ) { error("FIXME: currently only PNG supported. (Easy to extend.)\n"); } + + # Determine the gnuplot script file name + my $gp_file = $img_fname; + $gp_file =~ s{\.[^.]+$}{.gp}; + if ( !($gp_file=~/.gp$/) ) { $gp_file .= '.gp'; } + + # Determine the default title: + # 5446_6/5446_6.bam.bc.gp -> 5446_6 + # test.aaa.png -> test.aaa + if ( !($$opts{bamcheck}=~m{([^/]+?)(?:\.bam)?(?:\.bc)?$}i) ) { error("FIXME: Could not determine the title from [$img_fname]\n"); } + my $title = $1; + + my $dir = $gp_file; + $dir =~ s{/[^/]+$}{}; + if ( $dir && $dir ne $gp_file ) { `mkdir -p $dir`; } + + my $wh = exists($args{wh}) ? $args{wh} : '600,400'; + + open(my $fh,'>',$gp_file) or error("$gp_file: $!"); + return { + title => $title, + gp => $gp_file, + img => $img_fname, + fh => $fh, + terminal => qq[set terminal png size $wh truecolor], + grid => 'set grid xtics ytics y2tics back lc rgb "#cccccc"', + }; +} + +sub percentile +{ + my ($p,@vals) = @_; + my $N = 0; + for my $val (@vals) { $N += $val; } + my $n = $p*($N+1)/100.; + my $k = int($n); + my $d = $n-$k; + if ( $k<=0 ) { return 0; } + if ( $k>=$N ) { return scalar @vals-1; } + my $cnt; + for (my $i=0; $i<@vals; $i++) + { + $cnt += $vals[$i]; + if ( $cnt>=$k ) { return $i; } + } + error("FIXME: this should not happen [percentile]\n"); +} + +sub plot_qualities +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{FFQ}) or !@{$$opts{dat}{FFQ}} ) { return; } + + my $yrange = @{$$opts{dat}{FFQ}[0]} > 50 ? 
@{$$opts{dat}{FFQ}[0]} : 50; + my $is_paired = $$opts{dat}{'is paired:'}; + + # Average quality per cycle, forward and reverse reads in one plot + my $args = get_defaults($opts,"$$opts{prefix}quals.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set ylabel "Average Quality" + set xlabel "Cycle" + set yrange [0:$yrange] + set title "$$args{title}" + plot '-' using 1:2 with lines title 'Forward reads' ] . ($is_paired ? q[, '-' using 1:2 with lines title 'Reverse reads'] : '') . q[ + ]; + my (@fp75,@fp50,@fmean); + my (@lp75,@lp50,@lmean); + my ($fmax,$fmax_qual,$fmax_cycle); + my ($lmax,$lmax_qual,$lmax_cycle); + for my $cycle (@{$$opts{dat}{FFQ}}) + { + my $sum=0; my $n=0; + for (my $iqual=1; $iqual<@$cycle; $iqual++) + { + $sum += $$cycle[$iqual]*$iqual; + $n += $$cycle[$iqual]; + if ( !defined $fmax or $fmax<$$cycle[$iqual] ) { $fmax=$$cycle[$iqual]; $fmax_qual=$iqual; $fmax_cycle=$$cycle[0]; } + } + my $p25 = percentile(25,(@$cycle)[1..$#$cycle]); + my $p50 = percentile(50,(@$cycle)[1..$#$cycle]); + my $p75 = percentile(75,(@$cycle)[1..$#$cycle]); + if ( !$n ) { next; } + push @fp75, "$$cycle[0]\t$p25\t$p75\n"; + push @fp50, "$$cycle[0]\t$p50\n"; + push @fmean, sprintf "%d\t%.2f\n", $$cycle[0],$sum/$n; + printf $fh $fmean[-1]; + } + print $fh "end\n"; + if ( $is_paired ) + { + for my $cycle (@{$$opts{dat}{LFQ}}) + { + my $sum=0; my $n=0; + for (my $iqual=1; $iqual<@$cycle; $iqual++) + { + $sum += $$cycle[$iqual]*$iqual; + $n += $$cycle[$iqual]; + if ( !defined $lmax or $lmax<$$cycle[$iqual] ) { $lmax=$$cycle[$iqual]; $lmax_qual=$iqual; $lmax_cycle=$$cycle[0]; } + } + my $p25 = percentile(25,(@$cycle)[1..$#$cycle]); + my $p50 = percentile(50,(@$cycle)[1..$#$cycle]); + my $p75 = percentile(75,(@$cycle)[1..$#$cycle]); + if ( !$n ) { next; } + push @lp75, "$$cycle[0]\t$p25\t$p75\n"; + push @lp50, "$$cycle[0]\t$p50\n"; + push @lmean, sprintf "%d\t%.2f\n", $$cycle[0],$sum/$n; + printf $fh $lmean[-1]; + } + 
print $fh "end\n"; + } + close($fh); + plot($$args{gp}); + + + + # Average, mean and quality percentiles per cycle, forward and reverse reads in separate plots + $args = get_defaults($opts,"$$opts{prefix}quals2.png",wh=>'700,500'); + $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set multiplot + set rmargin 0 + set lmargin 0 + set tmargin 0 + set bmargin 0 + set origin 0.1,0.1 + set size 0.4,0.8 + set yrange [0:$yrange] + set ylabel "Quality" + set xlabel "Cycle (fwd reads)" + plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#cccccc" t '25-75th percentile' , '-' using 1:2 with lines lc rgb "#000000" t 'Median', '-' using 1:2 with lines lt 1 t 'Mean' + ]; + print $fh join('',@fp75),"end\n"; + print $fh join('',@fp50),"end\n"; + print $fh join('',@fmean),"end\n"; + if ( $is_paired ) + { + print $fh qq[ + set origin 0.55,0.1 + set size 0.4,0.8 + unset ytics + set y2tics mirror + set yrange [0:$yrange] + unset ylabel + set xlabel "Cycle (rev reads)" + set label "$$args{title}" at screen 0.5,0.95 center + plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#cccccc" t '25-75th percentile' , '-' using 1:2 with lines lc rgb "#000000" t 'Median', '-' using 1:2 with lines lt 2 t 'Mean' + ]; + print $fh join('',@lp75),"end\n"; + print $fh join('',@lp50),"end\n"; + print $fh join('',@lmean),"end\n"; + } + close($fh); + plot($$args{gp}); + + + + # Quality distribution per cycle, the distribution is for each cycle plotted as a separate curve + $args = get_defaults($opts,"$$opts{prefix}quals3.png",wh=>'600,600'); + $fh = $$args{fh}; + my $nquals = @{$$opts{dat}{FFQ}[0]}-1; + my $ncycles = @{$$opts{dat}{FFQ}}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set multiplot + set rmargin 0 + set lmargin 0 + set tmargin 0 + set bmargin 0 + set origin 0.15,0.52 + set size 0.8,0.4 + set title "$$args{title}" + set ylabel "Frequency (fwd reads)" + set label "Cycle $fmax_cycle" at $fmax_qual+1,$fmax + unset 
xlabel + set xrange [0:$nquals] + set format x "" + ]; + my @plots; + for (my $i=0; $i<$ncycles; $i++) { push @plots, q['-' using 1:2 with lines t ''] } + print $fh "plot ", join(",", @plots), "\n"; + for my $cycle (@{$$opts{dat}{FFQ}}) + { + for (my $iqual=1; $iqual<$nquals; $iqual++) { print $fh "$iqual\t$$cycle[$iqual]\n"; } + print $fh "end\n"; + } + if ( $is_paired ) + { + print $fh qq[ + set origin 0.15,0.1 + set size 0.8,0.4 + unset title + unset format + set xtics + set xlabel "Quality" + unset label + set label "Cycle $lmax_cycle" at $lmax_qual+1,$lmax + set ylabel "Frequency (rev reads)" + ]; + print $fh "plot ", join(",", @plots), "\n"; + for my $cycle (@{$$opts{dat}{LFQ}}) + { + for (my $iqual=1; $iqual<$nquals; $iqual++) + { + print $fh "$iqual\t$$cycle[$iqual]\n"; + } + print $fh "end\n"; + } + } + close($fh); + plot($$args{gp}); + + + # Heatmap qualitites + $args = get_defaults($opts,"$$opts{prefix}quals-hm.png", wh=>'600,500'); + $fh = $$args{fh}; + my $max = defined $lmax && $lmax > $fmax ? 
$lmax : $fmax; + my @ytics; + for my $cycle (@{$$opts{dat}{FFQ}}) { if ( $$cycle[0]%10==0 ) { push @ytics,qq["$$cycle[0]" $$cycle[0]]; } } + my $ytics = join(',', @ytics); + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + unset key + unset colorbox + set palette defined (0 0 0 0, 1 0 0 1, 3 0 1 0, 4 1 0 0, 6 1 1 1) + set cbrange [0:$max] + set yrange [0:$ncycles] + set xrange [0:$nquals] + set view map + set multiplot + set rmargin 0 + set lmargin 0 + set tmargin 0 + set bmargin 0 + set origin 0,0.46 + set size 0.95,0.6 + set obj 1 rectangle behind from first 0,0 to first $nquals,$ncycles + set obj 1 fillstyle solid 1.0 fillcolor rgbcolor "black" + set ylabel "Cycle (fwd reads)" offset character -1,0 + unset ytics + set ytics ($ytics) + unset xtics + set title "$$args{title}" + splot '-' matrix with image + ]; + for my $cycle (@{$$opts{dat}{FFQ}}) + { + for (my $iqual=1; $iqual<@$cycle; $iqual++) { print $fh "\t$$cycle[$iqual]"; } + print $fh "\n"; + } + print $fh "end\nend\n"; + @ytics = (); + for my $cycle (@{$$opts{dat}{LFQ}}) { if ( $$cycle[0]%10==0 ) { push @ytics,qq["$$cycle[0]" $$cycle[0]]; } } + $ytics = join(',', @ytics); + print $fh qq[ + set origin 0,0.03 + set size 0.95,0.6 + set ylabel "Cycle (rev reads)" offset character -1,0 + set xlabel "Base Quality" + unset title + unset ytics + set ytics ($ytics) + set xrange [0:$nquals] + set xtics + set colorbox vertical user origin first ($nquals+1),0 size screen 0.025,0.812 + set cblabel "Number of bases" + splot '-' matrix with image + ]; + for my $cycle (@{$$opts{dat}{LFQ}}) + { + for (my $iqual=1; $iqual<@$cycle; $iqual++) { print $fh "\t$$cycle[$iqual]"; } + print $fh "\n"; + } + print $fh "end\nend\n"; + close($fh); + plot($$args{gp}); +} + + +sub plot_acgt_cycles +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{GCC}) or !@{$$opts{dat}{GCC}} ) { return; } + + my $args = get_defaults($opts,"$$opts{prefix}acgt-cycles.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set 
output "$$args{img}" + $$args{grid} + set style line 1 linecolor rgb "green" + set style line 2 linecolor rgb "red" + set style line 3 linecolor rgb "black" + set style line 4 linecolor rgb "blue" + set style increment user + set ylabel "Base content [%]" + set xlabel "Read Cycle" + set yrange [0:100] + set title "$$args{title}" + plot '-' w l ti 'A', '-' w l ti 'C', '-' w l ti 'G', '-' w l ti 'T' + ]; + for my $base (1..4) + { + for my $cycle (@{$$opts{dat}{GCC}}) + { + print $fh $$cycle[0]+1,"\t",$$cycle[$base],"\n"; + } + print $fh "end\n"; + } + close($fh); + plot($$args{gp}); +} + + +sub plot_gc +{ + my ($opts) = @_; + + my $is_paired = $$opts{dat}{'is paired:'}; + my $args = get_defaults($opts,"$$opts{prefix}gc-content.png"); + my $fh = $$args{fh}; + my ($gcl_max,$gcf_max,$lmax,$fmax); + for my $gc (@{$$opts{dat}{GCF}}) { if ( !defined $gcf_max or $gcf_max<$$gc[1] ) { $gcf_max=$$gc[1]; $fmax=$$gc[0]; } } + for my $gc (@{$$opts{dat}{GCL}}) { if ( !defined $gcl_max or $gcl_max<$$gc[1] ) { $gcl_max=$$gc[1]; $lmax=$$gc[0]; } } + my $gcmax = $is_paired && $gcl_max > $gcf_max ? $lmax : $fmax; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set title "$$args{title}" + set ylabel "Normalized Frequency" + set xlabel "GC Content [%]" + set yrange [0:1.1] + set label sprintf("%.1f",$gcmax) at $gcmax,1 front offset 1,0 + plot ] + . (exists($$opts{ref_stats}) ? q['-' smooth csplines with lines lt 0 title 'Reference', ] : '') + . q['-' smooth csplines with lines lc 1 title 'First fragments' ] + . ($is_paired ? q[, '-' smooth csplines with lines lc 2 title 'Last fragments'] : '') + . 
q[ + ]; + if ( exists($$opts{ref_stats}) ) + { + open(my $ref,'<',$$opts{ref_stats}) or error("$$opts{ref_stats}: $!"); + while (my $line=<$ref>) { print $fh $line } + close($ref); + print $fh "end\n"; + } + for my $cycle (@{$$opts{dat}{GCF}}) { printf $fh "%d\t%f\n", $$cycle[0],$$cycle[1]/$gcf_max; } + print $fh "end\n"; + if ( $is_paired ) + { + for my $cycle (@{$$opts{dat}{GCL}}) { printf $fh "%d\t%f\n", $$cycle[0],$$cycle[1]/$gcl_max; } + print $fh "end\n"; + } + close($fh); + plot($$args{gp}); +} + + +sub plot_gc_depth +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{GCD}) or !@{$$opts{dat}{GCD}} ) { return; } + + # Find unique sequence percentiles for 30,40, and 50% GC content, just to draw x2tics. + my @tics = ( {gc=>30},{gc=>40},{gc=>50} ); + for my $gc (@{$$opts{dat}{GCD}}) + { + for my $tic (@tics) + { + my $diff = abs($$gc[0]-$$tic{gc}); + if ( !exists($$tic{pr}) or $diff<$$tic{diff} ) { $$tic{pr}=$$gc[1]; $$tic{diff}=$diff; } + } + } + + my @x2tics; + for my $tic (@tics) { push @x2tics, qq["$$tic{gc}" $$tic{pr}]; } + my $x2tics = join(',',@x2tics); + + my $args = get_defaults($opts,"$$opts{prefix}gc-depth.png", wh=>'600,500'); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set ylabel "Mapped depth" + set xlabel "Percentile of mapped sequence ordered by GC content" + set x2label "GC Content [%]" + set title "$$args{title}" + set x2tics ($x2tics) + set xtics nomirror + set xrange [0.1:99.9] + + plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#dedede" t '10-90th percentile' , \\ + '-' using 1:2:3 with filledcurve lt 1 lc rgb "#bbdeff" t '25-75th percentile' , \\ + '-' using 1:2 with lines lc rgb "#0084ff" t 'Median' + ]; + for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[2]\t$$gc[6]\n"; } print $fh "end\n"; + for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[3]\t$$gc[5]\n"; } print $fh "end\n"; + for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[4]\n"; } print $fh 
"end\n"; + close($fh); + plot($$args{gp}); +} + + +sub plot_isize +{ + my ($opts) = @_; + + if ( !$$opts{dat}{'is paired:'} or !exists($$opts{dat}{IS}) or !@{$$opts{dat}{IS}} ) { return; } + + my ($isize_max,$isize_cnt); + for my $isize (@{$$opts{dat}{IS}}) + { + if ( !defined $isize_max or $isize_cnt<$$isize[1] ) { $isize_cnt=$$isize[1]; $isize_max=$$isize[0]; } + } + + my $args = get_defaults($opts,"$$opts{prefix}insert-size.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set rmargin 5 + set label sprintf("%d",$isize_max) at $isize_max+10,$isize_cnt + set ylabel "Number of pairs" + set xlabel "Insert Size" + set title "$$args{title}" + plot \\ + '-' with lines lc rgb 'black' title 'All pairs', \\ + '-' with lines title 'Inward', \\ + '-' with lines title 'Outward', \\ + '-' with lines title 'Other' + ]; + for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[1]\n"; } print $fh "end\n"; + for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[2]\n"; } print $fh "end\n"; + for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[3]\n"; } print $fh "end\n"; + for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[4]\n"; } print $fh "end\n"; + close($fh); + plot($$args{gp}); +} + + +sub plot_coverage +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{COV}) or !@{$$opts{dat}{COV}} ) { return; } + + my @vals; + for my $cov (@{$$opts{dat}{COV}}) { push @vals,$$cov[2]; } + my $i = percentile(99.8,@vals); + my $p99 = $$opts{dat}{COV}[$i][1]; + + my $args = get_defaults($opts,"$$opts{prefix}coverage.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set ylabel "Number of mapped bases" + set xlabel "Coverage" + set style fill solid border -1 + set title "$$args{title}" + set xrange [:$p99] + plot '-' with lines notitle + ]; + for my $cov (@{$$opts{dat}{COV}}) + { + if ( $$cov[2]==0 ) { next; } + print $fh 
"$$cov[1]\t$$cov[2]\n"; + } + print $fh "end\n"; + close($fh); + plot($$args{gp}); +} + + +sub plot_mismatches_per_cycle +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{MPC}) or !@{$$opts{dat}{MPC}} ) { return; } + if ( older_than($opts,'2012-02-06') ) { plot_mismatches_per_cycle_old($opts); } + + my $nquals = @{$$opts{dat}{MPC}[0]} - 2; + my $ncycles = @{$$opts{dat}{MPC}}; + my ($style,$with); + if ( $ncycles>100 ) { $style = ''; $with = 'w l'; } + else { $style = 'set style data histogram; set style histogram rowstacked'; $with = ''; } + + my $args = get_defaults($opts,"$$opts{prefix}mism-per-cycle.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set style line 1 linecolor rgb "#e40000" + set style line 2 linecolor rgb "#ff9f00" + set style line 3 linecolor rgb "#eeee00" + set style line 4 linecolor rgb "#4ebd68" + set style line 5 linecolor rgb "#0061ff" + set style increment user + set key left top + $style + set ylabel "Number of mismatches" + set xlabel "Read Cycle" + set style fill solid border -1 + set title "$$args{title}" + set xrange [-1:$ncycles] + plot '-' $with ti 'Base Quality>30', \\ + '-' $with ti '30>=Q>20', \\ + '-' $with ti '20>=Q>10', \\ + '-' $with ti '10>=Q', \\ + '-' $with ti "N's" + ]; + for my $cycle (@{$$opts{dat}{MPC}}) + { + my $sum; for my $idx (31..$#$cycle) { $sum += $$cycle[$idx]; } + print $fh "$sum\n"; + } + print $fh "end\n"; + for my $cycle (@{$$opts{dat}{MPC}}) + { + my $sum; for my $idx (22..31) { $sum += $$cycle[$idx]; } + print $fh "$sum\n"; + } + print $fh "end\n"; + for my $cycle (@{$$opts{dat}{MPC}}) + { + my $sum; for my $idx (12..21) { $sum += $$cycle[$idx]; } + print $fh "$sum\n"; + } + print $fh "end\n"; + for my $cycle (@{$$opts{dat}{MPC}}) + { + my $sum; for my $idx (2..11) { $sum += $$cycle[$idx]; } + print $fh "$sum\n"; + } + print $fh "end\n"; + for my $cycle (@{$$opts{dat}{MPC}}) { print $fh "$$cycle[1]\n"; } + print $fh "end\n"; + close($fh); + 
plot($$args{gp}); +} + +sub plot_indel_dist +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{ID}) or !@{$$opts{dat}{ID}} ) { return; } + + my $args = get_defaults($opts,"$$opts{prefix}indel-dist.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set style line 1 linetype 1 linecolor rgb "red" + set style line 2 linetype 2 linecolor rgb "black" + set style line 3 linetype 3 linecolor rgb "green" + set style increment user + set ylabel "Indel count [log]" + set xlabel "Indel length" + set y2label "Insertions/Deletions ratio" + set log y + set y2tics nomirror + set ytics nomirror + set title "$$args{title}" + plot '-' w l ti 'Insertions', '-' w l ti 'Deletions', '-' axes x1y2 w l ti "Ins/Dels ratio" + ]; + for my $len (@{$$opts{dat}{ID}}) { print $fh "$$len[0]\t$$len[1]\n"; } print $fh "end\n"; + for my $len (@{$$opts{dat}{ID}}) { print $fh "$$len[0]\t$$len[2]\n"; } print $fh "end\n"; + for my $len (@{$$opts{dat}{ID}}) { printf $fh "%d\t%f\n", $$len[0],$$len[2]?$$len[1]/$$len[2]:0; } print $fh "end\n"; + close($fh); + plot($$args{gp}); +} + +sub plot_indel_cycles +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{IC}) or !@{$$opts{dat}{IC}} ) { return; } + + my $args = get_defaults($opts,"$$opts{prefix}indel-cycles.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set style line 1 linetype 1 linecolor rgb "red" + set style line 2 linetype 2 linecolor rgb "black" + set style line 3 linetype 3 linecolor rgb "green" + set style line 4 linetype 4 linecolor rgb "blue" + set style increment user + set ylabel "Indel count" + set xlabel "Read Cycle" + set title "$$args{title}" + plot '-' w l ti 'Insertions (fwd)', '' w l ti 'Insertions (rev)', '' w l ti 'Deletions (fwd)', '' w l ti 'Deletions (rev)' + ]; + for my $len (@{$$opts{dat}{IC}}) { print $fh "$$len[0]\t$$len[1]\n"; } print $fh "end\n"; + for my $len (@{$$opts{dat}{IC}}) { print $fh 
"$$len[0]\t$$len[2]\n"; } print $fh "end\n"; + for my $len (@{$$opts{dat}{IC}}) { print $fh "$$len[0]\t$$len[3]\n"; } print $fh "end\n"; + for my $len (@{$$opts{dat}{IC}}) { print $fh "$$len[0]\t$$len[4]\n"; } print $fh "end\n"; + close($fh); + plot($$args{gp}); +} + + + + + + + +sub has_values +{ + my ($opts,@tags) = @_; + for my $tag (@tags) + { + my (@lines) = `cat $$opts{bamcheck} | grep ^$tag | wc -l`; + chomp($lines[0]); + if ( $lines[0]<2 ) { return 0; } + } + return 1; +} + +sub plot_mismatches_per_cycle_old +{ + my ($opts) = @_; + + my $args = get_defaults($opts,"$$opts{prefix}mism-per-cycle.png"); + my ($nquals) = `grep ^MPC $$opts{bamcheck} | awk '\$2==1' | sed 's,\\t,\\n,g' | wc -l`; + my ($ncycles) = `grep ^MPC $$opts{bamcheck} | wc -l`; + chomp($nquals); + chomp($ncycles); + $nquals--; + $ncycles--; + my @gr0_15 = (2..17); + my @gr16_30 = (18..32); + my @gr31_n = (33..$nquals); + my $gr0_15 = '$'. join('+$',@gr0_15); + my $gr16_30 = '$'. join('+$',@gr16_30); + my $gr31_n = '$'. join('+$',@gr31_n); + + open(my $fh,'>',$$args{gp}) or error("$$args{gp}: $!"); + print $fh q[ + set terminal png size 600,400 truecolor font "DejaVuSansMono,9" + set output "] . $$args{img} . q[" + + set key left top + set style data histogram + set style histogram rowstacked + + set grid back lc rgb "#aaaaaa" + set ylabel "Number of mismatches" + set xlabel "Read Cycle" + set style fill solid border -1 + set title "] . $$args{title} . qq[" + set xrange [-1:$ncycles] + + plot '< grep ^MPC $$opts{bamcheck} | cut -f 2-' using ($gr31_n) ti 'Base Quality>30', '' using ($gr16_30) ti '30>=Q>15', '' using ($gr0_15) ti '15>=Q' + ]; + close($fh); + + plot($$args{gp}); +} + +
#!/usr/bin/perl -w
# (hg diff: --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/misc/psl2sam.pl Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,65 @@)

# Author: lh3

# Converts PSL alignment lines (read from the named files or STDIN) into
# SAM records, one per input line that starts with a digit.
#
# The alignment score follows the BLAST scoring scheme. Gap opens and gap
# extensions are counted from the last three columns (block sizes and the
# two block-start lists) because columns 5-8 do not seem to give what is
# needed here. No reference skip (N) is ever written to the CIGAR, as it is
# not easy to tell which gaps correspond to introns.

use strict;
use warnings;
use Getopt::Std;

# -a match score, -b mismatch penalty, -q gap open penalty, -r gap extension penalty
my %opts = (a=>1, b=>3, q=>5, r=>2);
getopts('a:b:q:r:', \%opts);
if (@ARGV == 0 && -t STDIN) {
	die("Usage: psl2sam.pl [-a $opts{a}] [-b $opts{b}] [-q $opts{q}] [-r $opts{r}] <in.psl>\n");
}

my ($sc_match, $sc_mismatch, $sc_gap_open, $sc_gap_ext) = @opts{qw(a b q r)};

while (my $line = <>) {
	next unless ($line =~ /^\d/);	# skip header and other non-alignment lines
	my @f = split(' ', $line);

	# Columns 10..12 are used as (size, start, end) on the query; for a
	# '-' strand hit the start/end pair is mirrored within the query size.
	my ($q_size, $q_start, $q_end) = @f[10, 11, 12];
	if ($f[8] eq '-') {
		($q_start, $q_end) = ($q_size - $q_end, $q_size - $q_start);
	}
	my $flag = ($f[8] eq '+') ? 0 : 16;

	my $cigar = '';
	$cigar .= $q_start . 'H' if ($q_start);	# 5'-end clipping

	my @blk_len = split(',', $f[18]);
	my @q_blk   = split(',', $f[19]);
	my @t_blk   = split(',', $f[20]);
	my ($q_anchor, $t_anchor) = ($q_blk[0], $t_blk[0]);
	my ($n_open, $n_ext) = (0, 0);

	for my $i (1 .. $f[17] - 1) {
		# distance between consecutive blocks on the query and on the reference
		my $q_gap = $q_blk[$i] - $q_blk[$i-1] - $blk_len[$i-1];
		my $t_gap = $t_blk[$i] - $t_blk[$i-1] - $blk_len[$i-1];
		if ($q_gap < $t_gap) {		# del: the reference gap is longer
			++$n_open;
			$n_ext += $t_gap - $q_gap;
			$cigar .= ($q_blk[$i] - $q_anchor) . 'M';
			$cigar .= ($t_gap - $q_gap) . 'D';
			($q_anchor, $t_anchor) = ($q_blk[$i], $t_blk[$i]);
		} elsif ($t_gap < $q_gap) {	# ins: the query gap is longer
			++$n_open;
			$n_ext += $q_gap - $t_gap;
			$cigar .= ($t_blk[$i] - $t_anchor) . 'M';
			$cigar .= ($q_gap - $t_gap) . 'I';
			($q_anchor, $t_anchor) = ($q_blk[$i], $t_blk[$i]);
		}
		# equal gaps: the blocks are merged into the running match
	}
	$cigar .= ($q_end - $q_anchor) . 'M';
	$cigar .= ($q_size - $q_end) . 'H' if ($q_size != $q_end);	# 3'-end clipping

	my $score = $sc_match * $f[0] - $sc_mismatch * $f[1]
	          - $sc_gap_open * $n_open - $sc_gap_ext * $n_ext;
	$score = 0 if ($score < 0);

	# name, flag, reference name, 1-based position, mapQ, CIGAR, then
	# placeholder mate/sequence/quality fields and the score tag
	print join("\t", $f[9], $flag, $f[13], $f[15] + 1, 0, $cigar,
	           '*', 0, 0, '*', '*', "AS:i:$score"), "\n";
}
#!/usr/bin/env luajit
-- (hg diff: --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/misc/r2plot.lua Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,83 @@)

-- Split the string on the Lua pattern `sep` (default: runs of whitespace).
-- When `n` is given, splitting stops after `n` fields and the unsplit
-- remainder is appended as one extra final field.
-- Returns an array of the fields.
function string:split(sep, n)
	local a, start = {}, 1;
	sep = sep or "%s+";
	repeat
		local b, e = self:find(sep, start);
		if b == nil then
			-- no further separator: the rest of the string is the last field
			table.insert(a, self:sub(start));
			break
		end
		a[#a+1] = self:sub(start, b - 1);
		start = e + 1;
		if n and #a == n then
			table.insert(a, self:sub(start));
			break
		end
	until start > #self;
	return a;
end

-- Open `fn` in `mode` ('r' by default).
--   nil      -> io.stdin
--   '-'      -> io.stdin when reading, io.stdout otherwise
--   *.gz     -> pipe through gzip
--   *.bz2    -> pipe through bzip2
--   other    -> io.open(fn, mode)
-- Returns the file handle; io.open/io.popen return nil on failure.
function io.xopen(fn, mode)
	mode = mode or 'r';
	if fn == nil then return io.stdin;
	elseif fn == '-' then return (mode == 'r' and io.stdin) or io.stdout;
	elseif fn:sub(-3) == '.gz' then return (mode == 'r' and io.popen('gzip -dc ' .. fn, 'r')) or io.popen('gzip > ' .. fn, 'w');
	-- the compressor was previously misspelled 'bgzip2', which is not a command
	elseif fn:sub(-4) == '.bz2' then return (mode == 'r' and io.popen('bzip2 -dc ' .. fn, 'r')) or io.popen('bzip2 > ' .. fn, 'w');
	else return io.open(fn, mode) end
end

local eps = {};

-- Write the PostScript helper procedure definitions to `fp` (default: stdout).
function eps.func(fp)
	fp = fp or io.stdout
	fp:write("/C { dup 255 and 255 div exch dup -8 bitshift 255 and 255 div 3 1 roll -16 bitshift 255 and 255 div 3 1 roll setrgbcolor } bind def\n")
	fp:write("/L { 4 2 roll moveto lineto } bind def\n")
	fp:write("/LX { dup 4 -1 roll exch moveto lineto } bind def\n")
	fp:write("/LY { dup 4 -1 roll moveto exch lineto } bind def\n")
	fp:write("/LS { 3 1 roll moveto show } bind def\n")
	fp:write("/RS { dup stringwidth pop 4 -1 roll exch sub 3 -1 roll moveto show } bind def\n")
	fp:write("/B { 4 copy 3 1 roll exch 6 2 roll 8 -2 roll moveto lineto lineto lineto closepath } bind def\n")
end

-- Select font `ft` at `size` points on `fp` (default: stdout) and define
-- FS (the font size) and FS4 (a quarter of it) for later use.
function eps.font(ft, size, fp)
	fp = fp or io.stdout
	fp:write(string.format('/FS %d def\n', size));
	fp:write('/FS4 FS 4 div def\n');
	fp:write('/' .. ft .. ' findfont FS scalefont setfont\n');
end

-- Size of one plotted cell, in PostScript units.
local scale = 8;

if #arg == 0 then
	print("Usage: r2plot.lua <in.txt>");
	os.exit(1)
end

local fp = io.xopen(arg[1]);
if fp == nil then
	io.stderr:write("r2plot.lua: failed to open '" .. arg[1] .. "'\n");
	os.exit(1)
end

-- The first input line holds the count used to size the bounding box.
local n = tonumber(fp:read());
if n == nil then
	io.stderr:write("r2plot.lua: the first line of '" .. arg[1] .. "' must be a number\n");
	os.exit(1)
end

print('%!PS-Adobe-3.0 EPSF-3.0');
print('%%' .. string.format('BoundingBox: -%d -%d %.3f %.3f\n', 10*scale, scale, (n+1)*scale, (n+1)*scale));
print(string.format('%.3f setlinewidth', scale));
print(string.format('/plot { setgray moveto 0 %d rlineto } def', scale));
print(string.format('/plothalf { setgray moveto 0 %.2f rlineto } def', scale/2));
eps.func();
eps.font('Helvetica', scale-1);

-- Each remaining line is: label <TAB> value <TAB> value ...
-- The label is right-aligned next to its column of cells; every value above
-- 0.01 becomes one cell whose gray level is 1 - value.
local i = 1;
for l in fp:lines() do
	local t = l:split('\t');
	print(string.format("%d %d FS4 add (%s) RS", (i-1)*scale-2, (i-1)*scale, t[1]));
	for j = 2, #t do
		if tonumber(t[j]) > 0.01 then
			print(string.format('%.2f %.2f %.2f plot stroke', (i-1+.5)*scale, (j-2)*scale, 1.-t[j]));
		end
	end
	i = i + 1;
end
if fp ~= io.stdin then fp:close() end

-- Gray-scale legend: 21 half-height cells going from white (1) to black (0).
for j = 1, 21 do
	print(string.format('%.2f %.2f %.2f plothalf stroke', -8*scale, (j-1) * scale/2, 1.-(j-1)/20));
end
print('showpage');
#!/usr/bin/perl -w
# (hg diff: --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/misc/sam2vcf.pl Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,270 @@)
#
# VCF specs: http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf3.3
#
# Contact: pd3@sanger
# Version: 2010-04-23

use strict;
use warnings;
use Carp;

my $opts = parse_params();
do_pileup_to_vcf($opts);

exit;

#---------------

# With arguments: croak with the given message.
# Without arguments: print the usage text and die.
sub error
{
    my (@msg) = @_;
    if ( scalar @msg ) { croak(@msg); }
    die
        "Usage: sam2vcf.pl [OPTIONS] < in.pileup > out.vcf\n",
        "Options:\n",
        " -h, -?, --help This help message.\n",
        " -i, --indels-only Ignore SNPs.\n",
        " -r, --refseq <file.fa> The reference sequence, required when indels are present.\n",
        " -R, --keep-ref Print reference alleles as well.\n",
        " -s, --snps-only Ignore indels.\n",
        " -t, --column-title <string> The column title.\n",
        "\n";
}


# Parse @ARGV into an options hash. Input is always STDIN, output STDOUT.
# Returns a reference to the hash.
sub parse_params
{
    my %opts = ();

    $opts{fh_in}  = *STDIN;
    $opts{fh_out} = *STDOUT;

    # Test definedness, not truth: a literal "0" argument must not end
    # parsing silently.
    while (defined(my $arg=shift(@ARGV)))
    {
        if ( $arg eq '-R' || $arg eq '--keep-ref' ) { $opts{keep_ref}=1; next; }
        if ( $arg eq '-r' || $arg eq '--refseq' ) { $opts{refseq}=shift(@ARGV); next; }
        if ( $arg eq '-t' || $arg eq '--column-title' ) { $opts{title}=shift(@ARGV); next; }
        if ( $arg eq '-s' || $arg eq '--snps-only' ) { $opts{snps_only}=1; next; }
        if ( $arg eq '-i' || $arg eq '--indels-only' ) { $opts{indels_only}=1; next; }
        if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }

        error("Unknown parameter \"$arg\". Run -h for help.\n");
    }
    return \%opts;
}

# Translate a consensus base (plain or IUPAC two-base code) into
# (ALT, genotype) relative to the reference base $ref.
sub iupac_to_gtype
{
    my ($ref,$base) = @_;
    my %iupac = (
            'K' => ['G','T'],
            'M' => ['A','C'],
            'S' => ['C','G'],
            'R' => ['A','G'],
            'W' => ['A','T'],
            'Y' => ['C','T'],
            );
    if ( !exists($iupac{$base}) )
    {
        if ( $base ne 'A' && $base ne 'C' && $base ne 'G' && $base ne 'T' ) { error("FIXME: what is this [$base]?\n"); }
        if ( $ref eq $base ) { return ('.','0/0'); }
        return ($base,'1/1');
    }
    my $gt = $iupac{$base};
    if ( $$gt[0] eq $ref  ) { return ($$gt[1],'0/1'); }
    elsif ( $$gt[1] eq $ref ) { return ($$gt[0],'0/1'); }
    return ("$$gt[0],$$gt[1]",'1/2');
}


# Translate one pileup indel allele:
#   "-SEQ" -> "D<length of SEQ>"
#   "+SEQ" -> "I<SEQ>"
#   "*"    -> undef (the reference allele)
# Anything else is a fatal error.
sub parse_indel
{
    my ($cons) = @_;
    # substr/length on the text after the sign replaces $', which slows
    # down every regex in the program on older perls.
    if ( $cons=~/^-/ ) { return 'D' . (length($cons)-1); }
    elsif ( $cons=~/^\+/ ) { return 'I' . substr($cons,1); }
    elsif ( $cons eq '*' ) { return undef; }
    error("FIXME: could not parse [$cons]\n");
}


# An example of the pileup format:
#   1       3000011 C       C       32      0       98      1       ^~,     A
#   1       3002155 *       +T/+T   53      119     52      5       +T      *       4       1       0
#   1       3003094 *       -TT/-TT 31      164     60      11      -TT     *       5       6       0
#   1       3073986 *       */-AAAAAAAAAAAAAA       3       3       45      9       *       -AAAAAAAAAAAAAA 7       2       0
#
# Reads pileup lines from $$opts{fh_in} and writes VCF 3.3 to $$opts{fh_out}.
sub do_pileup_to_vcf
{
    my ($opts) = @_;

    my $fh_in  = $$opts{fh_in};
    my $fh_out = $$opts{fh_out};
    my ($prev_chr,$prev_pos,$prev_ref);
    my $refseq;
    my $ignore_indels = $$opts{snps_only} ? 1 : 0;
    my $ignore_snps   = $$opts{indels_only} ? 1 : 0;
    my $keep_ref      = $$opts{keep_ref} ? 1 : 0;
    my $title         = exists($$opts{title}) ? $$opts{title} : 'data';

    print $fh_out
        qq[##fileformat=VCFv3.3\n],
        qq[##INFO=DP,1,Integer,"Total Depth"\n],
        qq[##FORMAT=GT,1,String,"Genotype"\n],
        qq[##FORMAT=GQ,1,Integer,"Genotype Quality"\n],
        qq[##FORMAT=DP,1,Integer,"Read Depth"\n],
        qq[#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$title\n]
        ;

    while (my $line=<$fh_in>)
    {
        chomp($line);
        my (@items) = split(/\t/,$line);
        if ( scalar @items<8 )
        {
            error("\nToo few columns, does not look like output of 'samtools pileup -c': $line\n");
        }
        my ($chr,$pos,$ref,$cons,$cons_qual,$snp_qual,$rms_qual,$depth,$a1,$a2) = @items;
        $ref  = uc($ref);
        $cons = uc($cons);

        my ($alt,$gt);
        if ( $ref eq '*' )
        {
            # An indel is involved.
            if ( $ignore_indels )
            {
                $prev_ref = $ref;
                $prev_pos = $pos;
                $prev_chr = $chr;
                next;
            }

            # The indel line itself carries no reference base: reuse the one
            # from the previous line when it was at the same site, otherwise
            # fetch it from the reference fasta.
            if (!defined $prev_chr || $chr ne $prev_chr || $pos ne $prev_pos)
            {
                if ( !$$opts{refseq} ) { error("Cannot do indels without the reference.\n"); }
                if ( !$refseq ) { $refseq = Fasta->new(file=>$$opts{refseq}); }
                $ref = $refseq->get_base($chr,$pos);
                $ref = uc($ref);
            }
            else { $ref = $prev_ref; }

            # One of the alleles can be a reference and it can come in arbitrary order. In some
            #   cases */* can be encountered. In such a case, look in the additional columns.
            my ($al1,$al2) = split(m{/},$cons);
            if ( $al1 eq $al2 && $al1 eq '*' ) { $al1=$a1; $al2=$a2; }
            my $alt1 = parse_indel($al1);
            my $alt2 = parse_indel($al2);
            if ( !$alt1 && !$alt2 ) { error("FIXME: could not parse indel:\n", $line); }
            if ( !$alt1 )
            {
                $alt=$alt2;
                $gt='0/1';
            }
            elsif ( !$alt2 )
            {
                $alt=$alt1;
                $gt='0/1';
            }
            elsif ( $alt1 eq $alt2 )
            {
                $alt="$alt1";
                $gt='1/1';
            }
            else
            {
                $alt="$alt1,$alt2";
                $gt='1/2';
            }
        }
        else
        {
            if ( $ignore_snps || (!$keep_ref && $ref eq $cons) )
            {
                $prev_ref = $ref;
                $prev_pos = $pos;
                $prev_chr = $chr;
                next;
            }

            # SNP
            ($alt,$gt) = iupac_to_gtype($ref,$cons);
        }

        print $fh_out "$chr\t$pos\t.\t$ref\t$alt\t$snp_qual\t0\tDP=$depth\tGT:GQ:DP\t$gt:$cons_qual:$depth\n";

        $prev_ref = $ref;
        $prev_pos = $pos;
        $prev_chr = $chr;
    }
}


#------------- Fasta --------------------
#
# Uses samtools to get a requested base from a fasta file. For efficiency, preloads
# a chunk to memory. The size of the cached sequence can be controlled by the 'size'
# parameter.
#
package Fasta;

use strict;
use warnings;
use Carp;

# Constructor. Arguments are key=>value pairs; 'file' is required, 'size'
# (number of bases cached per chunk) defaults to 10,000,000.
sub Fasta::new
{
    my ($class,@args) = @_;
    my $self = {@args};
    bless $self, ref($class) || $class;
    if ( !$$self{file} ) { $self->throw(qq[Missing the parameter "file"\n]); }
    $$self{chr}  = undef;
    $$self{from} = undef;
    $$self{to}   = undef;
    if ( !$$self{size} ) { $$self{size}=10_000_000; }
    return $self;
}

# Load the region $chr:$pos-($pos+size) via "samtools faidx" and cache it in
# $$self{chunk}, recording the chromosome and the coordinates reported in the
# fasta header line.
sub read_chunk
{
    my ($self,$chr,$pos) = @_;
    my $to = $pos + $$self{size};
    # List-form pipe open: the arguments never pass through a shell, so file
    # names containing spaces or metacharacters are handled correctly.
    my @cmd = ('samtools','faidx',$$self{file},"$chr:$pos-$to");
    open(my $fh,'-|',@cmd) or $self->throw("@cmd: $!");
    my @out = <$fh>;
    close($fh);
    if ( $? ) { $self->throw("@cmd: exited with non-zero status $?"); }
    my $line = shift(@out);
    if ( !defined $line ) { $self->throw("@cmd: no output"); }
    # \Q..\E: the sequence name must be matched literally, not as a pattern.
    if ( !($line=~/^>\Q$chr\E:(\d+)-(\d+)/) ) { $self->throw("Could not parse: $line"); }
    $$self{chr}  = $chr;
    $$self{from} = $1;
    $$self{to}   = $2;
    my $chunk = '';
    for my $seq (@out)
    {
        chomp($seq);
        $chunk .= $seq;
    }
    $$self{chunk} = $chunk;
    return;
}

# Return the base at $chr:$pos, (re)loading the cached chunk when the
# position falls outside of it.
sub get_base
{
    my ($self,$chr,$pos) = @_;
    if ( !$$self{chr} || $chr ne $$self{chr} || $pos<$$self{from} || $pos>$$self{to} )
    {
        $self->read_chunk($chr,$pos);
    }
    my $idx = $pos - $$self{from};
    return substr($$self{chunk},$idx,1);
}

# Report a fatal error from the caller's perspective.
sub throw
{
    my ($self,@msg) = @_;
    croak(@msg);
}
#!/usr/bin/perl -w
# (hg diff: --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/misc/samtools.pl Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,528 @@)

# Author: lh3

use strict;
use warnings;
use Getopt::Std;

my $version = '0.3.3';
&usage if (@ARGV < 1);

# The first argument selects the sub-command; the rest is left in @ARGV
# for that sub-command to parse.
my $command = shift(@ARGV);
my %func = (showALEN=>\&showALEN, pileup2fq=>\&pileup2fq, varFilter=>\&varFilter, plp2vcf=>\&plp2vcf,
            unique=>\&unique, uniqcmp=>\&uniqcmp, sra2hdr=>\&sra2hdr, sam2fq=>\&sam2fq);

die("Unknown command \"$command\".\n") if (!defined($func{$command}));
&{$func{$command}};
exit(0);

#
# showALEN
#

# Copy SAM records to STDOUT with one extra column inserted after the CIGAR:
# the sum of the lengths of all M and I operations in that CIGAR.
sub showALEN {
  die(qq/Usage: samtools.pl showALEN <in.sam>\n/) if (@ARGV == 0 && -t STDIN);
  while (<>) {
    my @t = split;
    next if (/^\@/ || @t < 11);
    my $l = 0;
    $_ = $t[5];
    s/(\d+)[MI]/$l+=$1/eg;	# used only for its side effect on $l
    print join("\t", @t[0..5]), "\t$l\t", join("\t", @t[6..$#t]), "\n";
  }
}

#
# varFilter
#

#
# Filtration code:
#
# d low depth
# D high depth
# W too many SNPs in a window (SNP only)
# G close to a high-quality indel (SNP only)
# Q low RMS mapping quality (SNP only)
# g close to another indel with higher quality (indel only)
# s low SNP quality (SNP only)
# i low indel quality (indel only)

# Filter a consensus pileup. Variants that pass are printed to STDOUT; with
# -p, filtered ones go to STDERR prefixed with their filtration code.
# Each staged record is [score, filter tag, deletion length, pileup fields...];
# the score is -1 for SNPs and the indel filtering score for indels.
sub varFilter {
  my %opts = (d=>3, D=>100, l=>30, Q=>25, q=>10, G=>25, s=>100, w=>10, W=>10, N=>2, p=>undef, S=>'', i=>'');
  getopts('pq:d:D:l:Q:w:W:N:G:S:i:', \%opts);
  die(qq/
Usage: samtools.pl varFilter [options] <in.cns-pileup>

Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
 -q INT minimum RMS mapping quality for gaps [$opts{q}]
 -d INT minimum read depth [$opts{d}]
 -D INT maximum read depth [$opts{D}]
 -S INT minimum SNP quality [$opts{S}]
 -i INT minimum indel quality [$opts{i}]

 -G INT min indel score for nearby SNP filtering [$opts{G}]
 -w INT SNP within INT bp around a gap to be filtered [$opts{w}]

 -W INT window size for filtering dense SNPs [$opts{W}]
 -N INT max number of SNPs in a window [$opts{N}]

 -l INT window size for filtering adjacent gaps [$opts{l}]

 -p print filtered variants
\n/) if (@ARGV == 0 && -t STDIN);

  # calculate the window size
  my ($ol, $ow, $oW) = ($opts{l}, $opts{w}, $opts{W});
  my $max_dist = $ol > $ow? $ol : $ow;
  $max_dist = $oW if ($max_dist < $oW);
  # the core loop
  my @staging; # (indel_filtering_score, flt_tag)
  while (<>) {
    my @t = split;
    next if (uc($t[2]) eq uc($t[3]) || $t[3] eq '*/*'); # skip non-var sites
    # clear the out-of-range elements
    while (@staging) {
      # Still on the same chromosome and the first element's window still affects this position?
      last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
      varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
    }
    my ($flt, $score) = (0, -1);
    # first a simple filter
    if ($t[7] < $opts{d}) {
      $flt = 2;
    } elsif ($t[7] > $opts{D}) {
      $flt = 3;
    }
    if ($t[2] eq '*') { # an indel
      if ($opts{i} && $opts{i}>$t[5]) { $flt = 8; }
    }
    elsif ($opts{S} && $opts{S}>$t[5]) { $flt = 7; } # SNP

    # site dependent filters
    my $len=0;
    if ($flt == 0) {
      if ($t[2] eq '*') { # an indel
        # If deletion, remember the length of the deletion
        my ($a,$b) = split(m{/},$t[3]);
        my $alen = length($a) - 1;
        my $blen = length($b) - 1;
        if ( $alen>$blen )
        {
          if ( substr($a,0,1) eq '-' ) { $len=$alen; }
        }
        elsif ( substr($b,0,1) eq '-' ) { $len=$blen; }

        $flt = 1 if ($t[6] < $opts{q});
        # filtering SNPs
        if ($t[5] >= $opts{G}) {
          for my $x (@staging) {
            # Is it a SNP and is it outside the SNP filter window?
            next if ($x->[0] >= 0 || $x->[4] + $x->[2] + $ow < $t[1]);
            $x->[1] = 5 if ($x->[1] == 0);
          }
        }
        # calculate the filtering score (different from indel quality)
        $score = $t[5];
        $score += $opts{s} * $t[10] if ($t[8] ne '*');
        $score += $opts{s} * $t[11] if ($t[9] ne '*');
        # check the staging list for indel filtering
        for my $x (@staging) {
          # Is it a SNP and is it outside the gap filter window
          next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ol < $t[1]);
          if ($x->[0] < $score) {
            $x->[1] = 6;
          } else {
            $flt = 6; last;
          }
        }
      } else { # a SNP
        $flt = 1 if ($t[6] < $opts{Q});
        # check adjacent SNPs
        my $k = 1;
        for my $x (@staging) {
          ++$k if ($x->[0] < 0 && $x->[4] + $x->[2] + $oW >= $t[1] && ($x->[1] == 0 || $x->[1] == 4 || $x->[1] == 5));
        }
        # filtering is necessary
        if ($k > $opts{N}) {
          $flt = 4;
          for my $x (@staging) {
            $x->[1] = 4 if ($x->[0] < 0 && $x->[4] + $x->[2] + $oW >= $t[1] && $x->[1] == 0);
          }
        } else { # then check gap filter
          for my $x (@staging) {
            next if ($x->[0] < 0 || $x->[4] + $x->[2] + $ow < $t[1]);
            if ($x->[0] >= $opts{G}) {
              $flt = 5; last;
            }
          }
        }
      }
    }
    push(@staging, [$score, $flt, $len, @t]);
  }
  # output the last few elements in the staging list
  while (@staging) {
    varFilter_aux(shift @staging, $opts{p});
  }
}

# Emit one staged record: to STDOUT when its filter tag is 0, otherwise
# (only when $is_print is set) to STDERR, prefixed with the one-letter
# filtration code selected by the tag.
sub varFilter_aux {
  my ($first, $is_print) = @_;
  if ($first->[1] == 0) {
    print join("\t", @$first[3 .. @$first-1]), "\n";
  } elsif ($is_print) {
    print STDERR join("\t", substr("UQdDWGgsiX", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
  }
}

#
# pileup2fq
#

# Turn a consensus pileup into FASTQ, one record per chromosome. Positions
# missing from the pileup become 'n' with quality '!'; bases failing the
# mapQ/depth thresholds, or lying near an indel, are written in lower case.
sub pileup2fq {
  my %opts = (d=>3, D=>255, Q=>25, G=>25, l=>10);
  getopts('d:D:Q:G:l:', \%opts);
  die(qq/
Usage: samtools.pl pileup2fq [options] <in.cns-pileup>

Options: -d INT minimum depth [$opts{d}]
 -D INT maximum depth [$opts{D}]
 -Q INT min RMS mapQ [$opts{Q}]
 -G INT minimum indel score [$opts{G}]
 -l INT indel filter winsize [$opts{l}]\n
/) if (@ARGV == 0 && -t STDIN);

  my ($last_chr, $seq, $qual, @gaps, $last_pos);
  my $_Q = $opts{Q};
  my $_d = $opts{d};
  my $_D = $opts{D};

  $last_chr = '';
  while (<>) {
    my @t = split;
    if ($last_chr ne $t[0]) {
      # a new chromosome starts: flush the previous one
      &p2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr);
      $last_chr = $t[0];
      $last_pos = 0;
      $seq = ''; $qual = '';
      @gaps = ();
    }
    if ($t[1] - $last_pos != 1) {
      # pad the positions skipped by the pileup
      $seq .= 'n' x ($t[1] - $last_pos - 1);
      $qual .= '!' x ($t[1] - $last_pos - 1);
    }
    if ($t[2] eq '*') {
      push(@gaps, $t[1]) if ($t[5] >= $opts{G});
    } else {
      $seq .= ($t[6] >= $_Q && $t[7] >= $_d && $t[7] <= $_D)? uc($t[3]) : lc($t[3]);
      my $q = $t[4] + 33;
      $q = 126 if ($q > 126);
      $qual .= chr($q);
    }
    $last_pos = $t[1];
  }
  # Flush the final chromosome. The guard mirrors the one inside the loop so
  # that empty input no longer yields a nameless, empty "@"/"+" record.
  &p2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr);
}

# Mask the neighbourhood of the recorded gaps, then print one FASTQ record.
# $seq and $qual are references to the sequence and quality strings.
sub p2q_post_process {
  my ($chr, $seq, $qual, $gaps, $l) = @_;
  &p2q_filter_gaps($seq, $gaps, $l);
  print "\@$chr\n"; &p2q_print_str($seq);
  print "+\n"; &p2q_print_str($qual);
}

# Lower-case, in place, the 2*$l characters of $$seq starting $l before each
# gap position (or at offset 0 when the gap is within $l of the start).
sub p2q_filter_gaps {
  my ($seq, $gaps, $l) = @_;
  for my $g (@$gaps) {
    my $x = $g > $l? $g - $l : 0;
    substr($$seq, $x, $l + $l) = lc(substr($$seq, $x, $l + $l));
  }
}

# Print the referenced string in lines of at most 60 characters.
sub p2q_print_str {
  my ($s) = @_;
  my $l = length($$s);
  for (my $i = 0; $i < $l; $i += 60) {
    print substr($$s, $i, 60), "\n";
  }
}

#
# sam2fq
#

# Distribute the reads of a SAM file round-robin over -n gzip-compressed
# FASTQ files named <prefix>.NNN.fq.gz. Reverse-strand reads are written
# reverse-complemented; "/1" and "/2" are appended to the names of reads
# carrying flag 0x40 / 0x80. Only the -p <prefix> with -n > 1 mode exists.
sub sam2fq {
  my %opts = (n=>20, p=>'');
  getopts('n:p:', \%opts);
  die("Usage: samtools.pl sam2fq [-n 20] [-p <prefix>] <inp.sam>\n") if (@ARGV == 0 && -t STDIN);
  if ($opts{p} && $opts{n} > 1) {
    my $pre = $opts{p};
    my @fh;
    for (0 .. $opts{n}-1) {
      # The prefix is passed as a %s argument: interpolating it into the
      # format string let a '%' in the prefix corrupt the file name.
      my $cmd = sprintf("gzip > %s.%.3d.fq.gz", $pre, $_);
      open($fh[$_], '|-', $cmd) || die("Failed to run \"$cmd\": $!\n");
    }
    my $i = 0;
    while (<>) {
      next if (/^@/);
      chomp;
      my @t = split("\t");
      next if ($t[9] eq '*');
      my ($name, $seq, $qual);
      if ($t[1] & 16) { # reverse strand
        $seq = reverse($t[9]);
        $qual = reverse($t[10]);
        $seq =~ tr/ACGTacgt/TGCAtgca/;
      } else {
        ($seq, $qual) = @t[9,10];
      }
      $name = $t[0];
      $name .= "/1" if ($t[1] & 0x40);
      $name .= "/2" if ($t[1] & 0x80);
      print {$fh[$i]} "\@$name\n$seq\n";
      if ($qual ne '*') {
        print {$fh[$i]} "+\n$qual\n";
      }
      $i = 0 if (++$i == $opts{n});
    }
    close($fh[$_]) for (0 .. $opts{n}-1);
  } else {
    die("To be implemented.\n");
  }
}

#
# sra2hdr
#

# This subroutine does not use an XML parser. It requires that the SRA
# XML files are properly formatted.
+sub sra2hdr { + my %opts = (); + getopts('', \%opts); + die("Usage: samtools.pl sra2hdr <SRA.prefix>\n") if (@ARGV == 0); + my $pre = $ARGV[0]; + my $fh; + # read sample + my $sample = 'UNKNOWN'; + open($fh, "$pre.sample.xml") || die; + while (<$fh>) { + $sample = $1 if (/<SAMPLE.*alias="([^"]+)"/i); + } + close($fh); + # read experiment + my (%exp2lib, $exp); + open($fh, "$pre.experiment.xml") || die; + while (<$fh>) { + if (/<EXPERIMENT.*accession="([^\s"]+)"/i) { + $exp = $1; + } elsif (/<LIBRARY_NAME>\s*(\S+)\s*<\/LIBRARY_NAME>/i) { + $exp2lib{$exp} = $1; + } + } + close($fh); + # read run + my ($run, @fn); + open($fh, "$pre.run.xml") || die; + while (<$fh>) { + if (/<RUN.*accession="([^\s"]+)"/i) { + $run = $1; @fn = (); + } elsif (/<EXPERIMENT_REF.*accession="([^\s"]+)"/i) { + print "\@RG\tID:$run\tSM:$sample\tLB:$exp2lib{$1}\n"; + } elsif (/<FILE.*filename="([^\s"]+)"/i) { + push(@fn, $1); + } elsif (/<\/RUN>/i) { + if (@fn == 1) { + print STDERR "$fn[0]\t$run\n"; + } else { + for (0 .. $#fn) { + print STDERR "$fn[$_]\t$run", "_", $_+1, "\n"; + } + } + } + } + close($fh); +} + +# +# unique +# + +sub unique { + my %opts = (f=>250.0, q=>5, r=>2, a=>1, b=>3); + getopts('Qf:q:r:a:b:m', \%opts); + die("Usage: samtools.pl unique [-f $opts{f}] <in.sam>\n") if (@ARGV == 0 && -t STDIN); + my $last = ''; + my $recal_Q = !defined($opts{Q}); + my $multi_only = defined($opts{m}); + my @a; + while (<>) { + my $score = -1; + print $_ if (/^\@/); + $score = $1 if (/AS:i:(\d+)/); + my @t = split("\t"); + next if (@t < 11); + if ($score < 0) { # AS tag is unavailable + my $cigar = $t[5]; + my ($mm, $go, $ge) = (0, 0, 0); + $cigar =~ s/(\d+)[ID]/++$go,$ge+=$1/eg; + $cigar = $t[5]; + $cigar =~ s/(\d+)M/$mm+=$1/eg; + $score = $mm * $opts{a} - $go * $opts{q} - $ge * $opts{r}; # no mismatches... 
+ } + $score = 1 if ($score < 1); + if ($t[0] ne $last) { + &unique_aux(\@a, $opts{f}, $recal_Q, $multi_only) if (@a); + $last = $t[0]; + } + push(@a, [$score, \@t]); + } + &unique_aux(\@a, $opts{f}, $recal_Q, $multi_only) if (@a); +} + +sub unique_aux { + my ($a, $fac, $is_recal, $multi_only) = @_; + my ($max, $max2, $max_i) = (0, 0, -1); + for (my $i = 0; $i < @$a; ++$i) { + if ($a->[$i][0] > $max) { + $max2 = $max; $max = $a->[$i][0]; $max_i = $i; + } elsif ($a->[$i][0] > $max2) { + $max2 = $a->[$i][0]; + } + } + if ($is_recal) { + if (!$multi_only || @$a > 1) { + my $q = int($fac * ($max - $max2) / $max + .499); + $q = 250 if ($q > 250); + $a->[$max_i][1][4] = $q < 250? $q : 250; + } + } + print join("\t", @{$a->[$max_i][1]}); + @$a = (); +} + +# +# uniqcmp: compare two SAM files +# + +sub uniqcmp { + my %opts = (q=>10, s=>100); + getopts('pq:s:', \%opts); + die("Usage: samtools.pl uniqcmp <in1.sam> <in2.sam>\n") if (@ARGV < 2); + my ($fh, %a); + warn("[uniqcmp] read the first file...\n"); + &uniqcmp_aux($ARGV[0], \%a, 0); + warn("[uniqcmp] read the second file...\n"); + &uniqcmp_aux($ARGV[1], \%a, 1); + warn("[uniqcmp] stats...\n"); + my @cnt; + $cnt[$_] = 0 for (0..9); + for my $x (keys %a) { + my $p = $a{$x}; + my $z; + if (defined($p->[0]) && defined($p->[1])) { + $z = ($p->[0][0] == $p->[1][0] && $p->[0][1] eq $p->[1][1] && abs($p->[0][2] - $p->[1][2]) < $opts{s})? 0 : 1; + if ($p->[0][3] >= $opts{q} && $p->[1][3] >= $opts{q}) { + ++$cnt[$z*3+0]; + } elsif ($p->[0][3] >= $opts{q}) { + ++$cnt[$z*3+1]; + } elsif ($p->[1][3] >= $opts{q}) { + ++$cnt[$z*3+2]; + } + print STDERR "$x\t$p->[0][1]:$p->[0][2]\t$p->[0][3]\t$p->[0][4]\t$p->[1][1]:$p->[1][2]\t$p->[1][3]\t$p->[1][4]\t", + $p->[0][5]-$p->[1][5], "\n" if ($z && defined($opts{p}) && ($p->[0][3] >= $opts{q} || $p->[1][3] >= $opts{q})); + } elsif (defined($p->[0])) { + ++$cnt[$p->[0][3]>=$opts{q}? 
6 : 7]; + print STDERR "$x\t$p->[0][1]:$p->[0][2]\t$p->[0][3]\t$p->[0][4]\t*\t0\t*\t", + $p->[0][5], "\n" if (defined($opts{p}) && $p->[0][3] >= $opts{q}); + } else { + print STDERR "$x\t*\t0\t*\t$p->[1][1]:$p->[1][2]\t$p->[1][3]\t$p->[1][4]\t", + -$p->[1][5], "\n" if (defined($opts{p}) && $p->[1][3] >= $opts{q}); + ++$cnt[$p->[1][3]>=$opts{q}? 8 : 9]; + } + } + print "Consistent (high, high): $cnt[0]\n"; + print "Consistent (high, low ): $cnt[1]\n"; + print "Consistent (low , high): $cnt[2]\n"; + print "Inconsistent (high, high): $cnt[3]\n"; + print "Inconsistent (high, low ): $cnt[4]\n"; + print "Inconsistent (low , high): $cnt[5]\n"; + print "Second missing (high): $cnt[6]\n"; + print "Second missing (low ): $cnt[7]\n"; + print "First missing (high): $cnt[8]\n"; + print "First missing (low ): $cnt[9]\n"; +} + +sub uniqcmp_aux { + my ($fn, $a, $which) = @_; + my $fh; + $fn = "samtools view $fn |" if ($fn =~ /\.bam/); + open($fh, $fn) || die; + while (<$fh>) { + my @t = split; + next if (@t < 11); +# my $l = ($t[5] =~ /^(\d+)S/)? $1 : 0; + my $l = 0; + my ($x, $nm) = (0, 0); + $nm = $1 if (/NM:i:(\d+)/); + $_ = $t[5]; + s/(\d+)[MI]/$x+=$1/eg; + @{$a->{$t[0]}[$which]} = (($t[1]&0x10)? 1 : 0, $t[2], $t[3]-$l, $t[4], "$x:$nm", $x - 4 * $nm); + } + close($fh); +} + +sub plp2vcf { + while (<>) { + my @t = split; + next if ($t[3] eq '*/*'); + if ($t[2] eq '*') { # indel + my @s = split("/", $t[3]); + my (@a, @b); + my ($ref, $alt); + for (@s) { + next if ($_ eq '*'); + if (/^-/) { + push(@a, 'N'.substr($_, 1)); + push(@b, 'N'); + } elsif (/^\+/) { + push(@a, 'N'); + push(@b, 'N'.substr($_, 1)); + } + } + if ($a[0] && $a[1]) { + if (length($a[0]) < length($a[1])) { + $ref = $a[1]; + $alt = ($b[0] . ('N' x (length($a[1]) - length($a[0])))) . ",$b[1]"; + } elsif (length($a[0]) > length($a[1])) { + $ref = $a[0]; + $alt = ($b[1] . ('N' x (length($a[0]) - length($a[1])))) . ",$b[0]"; + } else { + $ref = $a[0]; + $alt = ($b[0] eq $b[1])? 
$b[0] : "$b[0],$b[1]"; + } + } else { + $ref = $a[0]; $alt = $b[0]; + } + print join("\t", @t[0,1], '.', $ref, $alt, $t[5], '.', '.'), "\n"; + } else { # SNP + } + } +} + +# +# Usage +# + +sub usage { + die(qq/ +Program: samtools.pl (helper script for SAMtools) +Version: $version +Contact: Heng Li <lh3\@sanger.ac.uk>\n +Usage: samtools.pl <command> [<arguments>]\n +Command: varFilter filtering SNPs and short indels + pileup2fq generate fastq from `pileup -c' + showALEN print alignment length (ALEN) following CIGAR +\n/); +}
#!/usr/bin/perl -w

# Origin:  PsiCLASS-1.0.2/samtools-0.1.19/misc/soap2sam.pl
# Contact: lh3
# Version: 0.1.1
#
# Converts SOAP alignment records (read from the files named on the command
# line, or from STDIN) into SAM records written to STDOUT.

use strict;
use warnings;
use Getopt::Std;

# Run the converter only when this file is executed as a script; when it is
# loaded with `require`/`do` the subs are defined without touching STDIN.
unless (caller) {
  soap2sam();
  exit;
}

# mating($s1, $s2)
#   Fill in the mate fields (columns 7-9) and mate-related flag bits of two SAM
#   records that share a read name. Both arguments are array refs laid out as
#   SAM columns (0 = name, 1 = flag, 2 = reference, 3 = position, 9 = sequence)
#   and are modified in place.
sub mating {
  my ($s1, $s2) = @_;
  my $isize = 0;
  if ($s1->[2] ne '*' && $s1->[2] eq $s2->[2]) { # same reference: calculate $isize
    # For a read carrying flag bit 0x10 the coordinate used is position + read length.
    my $x1 = ($s1->[1] & 0x10)? $s1->[3] + length($s1->[9]) : $s1->[3];
    my $x2 = ($s2->[1] & 0x10)? $s2->[3] + length($s2->[9]) : $s2->[3];
    $isize = $x2 - $x1;
  }
  # update mate coordinate
  if ($s2->[2] ne '*') {
    @$s1[6..8] = (($s2->[2] eq $s1->[2])? "=" : $s2->[2], $s2->[3], $isize);
    $s1->[1] |= 0x20 if ($s2->[1] & 0x10); # mate is on the reverse strand
  } else {
    $s1->[1] |= 0x8;                       # mate has no reference
  }
  if ($s1->[2] ne '*') {
    @$s2[6..8] = (($s1->[2] eq $s2->[2])? "=" : $s1->[2], $s1->[3], -$isize);
    $s2->[1] |= 0x20 if ($s1->[1] & 0x10);
  } else {
    $s2->[1] |= 0x8;
  }
}

# soap2sam()
#   Command-line entry point. Option -p marks every record as properly paired.
#   Consecutive records with the same read name are treated as mates: they are
#   passed through mating() and printed together; any other record is printed
#   on its own once the following record shows it has no partner.
sub soap2sam {
  my %opts = ();
  getopts("p", \%opts);
  die("Usage: soap2sam.pl [-p] <aln.soap>\n") if (@ARGV == 0 && -t STDIN);
  my $is_paired = defined($opts{p});
  # core loop
  my @s1 = ();
  my @s2 = ();
  my ($s_last, $s_curr) = (\@s1, \@s2);
  while (<>) {
    # Strip high-bit bytes and control characters; this also removes spaces
    # and the newline but keeps TAB (\011), the field separator.
    s/[\177-\377]|[\000-\010]|[\012-\040]//g;
    next if (soap2sam_aux($_, $s_curr, $is_paired) < 0);
    if (@$s_last != 0 && $s_last->[0] eq $s_curr->[0]) {
      mating($s_last, $s_curr);
      print join("\t", @$s_last), "\n";
      print join("\t", @$s_curr), "\n";
      @$s_last = (); @$s_curr = ();
    } else {
      print join("\t", @$s_last), "\n" if (@$s_last != 0);
      # swap the two buffers so the current record becomes the pending one
      ($s_last, $s_curr) = ($s_curr, $s_last);
    }
  }
  print join("\t", @$s_last), "\n" if (@$s_last != 0);
}

# soap2sam_aux($line, $s, $is_paired)
#   Parse one SOAP line into the array referenced by $s (SAM column order,
#   followed by the NM:i and MD:Z tags).
#   Returns 0 on success, or -1 (leaving @$s untouched) when the line has fewer
#   than 9 fields, starts with whitespace, or has an empty/false read name.
sub soap2sam_aux {
  my ($line, $s, $is_paired) = @_;
  chomp($line);
  my @t = split(/\s+/, $line);
  return -1 if (@t < 9 || $line =~ /^\s/ || !$t[0]);
  @$s = ();
  # fix SOAP-2.1.x bugs: drop the 4th column when it is not a plain integer
  @t = @t[0..2,4..$#t] unless ($t[3] =~ /^\d+$/);
  # read name (without a trailing /1 or /2)
  $s->[0] = $t[0];
  $s->[0] =~ s/\/[12]$//g;
  # initial flag (will be updated later): 0x1 plus 0x40 for 'a' reads, 0x80 otherwise
  $s->[1] = 0;
  $s->[1] |= 1 | 1<<($t[4] eq 'a'? 6 : 7);
  $s->[1] |= 2 if ($is_paired);
  # read & quality (quality is truncated to the read length)
  $s->[9] = $t[1];
  $s->[10] = (length($t[2]) > length($t[1]))? substr($t[2], 0, length($t[1])) : $t[2];
  # cigar
  $s->[5] = length($s->[9]) . "M";
  # coor
  $s->[2] = $t[7]; $s->[3] = $t[8];
  $s->[1] |= 0x10 if ($t[6] eq '-');
  # mapQ
  $s->[4] = $t[3] == 1? 30 : 0;
  # mate coordinate
  $s->[6] = '*'; $s->[7] = $s->[8] = 0;
  # aux
  push(@$s, "NM:i:$t[9]");
  my $md = '';
  if ($t[9]) {
    # Collect [position, base] for every "<base>-><position>..." field.
    my @mismatches;
    for my $field (@t[10 .. $#t]) {
      push(@mismatches, [$2 + 0, $1]) if ($field =~ /^([ACGT])->(\d+)/i);
    }
    # Order by numeric position. (Sorting zero-padded strings, as before,
    # misplaces positions >= 1000 and yields negative MD run lengths.)
    @mismatches = sort { $a->[0] <=> $b->[0] || $a->[1] cmp $b->[1] } @mismatches;
    my $consumed = 0; # number of read bases already described by $md
    for my $mm (@mismatches) {
      my ($pos, $base) = @$mm;
      $md .= ($pos - $consumed) . $base;
      $consumed = $pos + 1;
    }
    $md .= length($t[1]) - $consumed;
  } else {
    $md = length($t[1]);
  }
  push(@$s, "MD:Z:$md");
  return 0;
}
#!/usr/bin/env python3

# Origin: PsiCLASS-1.0.2/samtools-0.1.19/misc/varfilter.py
#
# Author: lh3, converted to python and modified to add -C option by Aylwyn Scally
#
# About:
#   varfilter.py is a port of Heng's samtools.pl varFilter script into
#   python, with an additional -C INT option. This option sets a minimum
#   consensus score, above which the script will output a pileup line
#   wherever it _could have_ called a variant, even if none is actually
#   called (i.e. hom-ref positions). This is important if you want to
#   subsequently merge the calls with those for another individual to get a
#   synoptic view of calls at each site. Without this option, and in all
#   other respects, it behaves like samtools.pl varFilter.
#
# Aylwyn Scally as6@sanger.ac.uk


# Filtration code:
#
# C low CNS quality (hom-ref only)
# d low depth
# D high depth
# W too many SNPs in a window (SNP only)
# G close to a high-quality indel (SNP only)
# Q low RMS mapping quality (SNP only)
# g close to another indel with higher quality (indel only)
# s low SNP quality (SNP only)
# i low indel quality (indel only)


import sys
import getopt


def usage():
    """Print the command-line help to stdout."""
    # print() with a single argument behaves the same on Python 2 and 3.
    print('''usage: varfilter.py [options] [cns-pileup]

Options: -Q INT    minimum RMS mapping quality for SNPs
         -q INT    minimum RMS mapping quality for gaps
         -d INT    minimum read depth
         -D INT    maximum read depth
         -S INT    minimum SNP quality
         -i INT    minimum indel quality
         -C INT    minimum consensus quality for hom-ref sites

         -G INT    min indel score for nearby SNP filtering
         -w INT    SNP within INT bp around a gap to be filtered
         -s INT    score factor used in the indel filtering score

         -W INT    window size for filtering dense SNPs
         -N INT    max number of SNPs in a window

         -l INT    window size for filtering adjacent gaps

         -p        print filtered variants''')


def varFilter_aux(first, is_print):
    """Emit one staged record.

    `first` is a staging entry [score, flt, dlen, is_snp, <pileup fields...>].
    Records with flt == 0 go to stdout; filtered records go to stderr, prefixed
    with their one-letter filtration code, only when `is_print` is true.
    An IOError while writing (e.g. a closed pipe) ends the program quietly.
    """
    try:
        if first[1] == 0:
            sys.stdout.write("\t".join(first[4:]) + "\n")
        elif is_print:
            sys.stderr.write("\t".join(["UQdDWGgsiCX"[first[1]]] + first[4:]) + "\n")
    except IOError:
        sys.exit()


# Default settings (each can be overridden on the command line).
mindepth = 3
maxdepth = 100
gapgapwin = 30
minsnpmapq = 25
mingapmapq = 10
minindelscore = 25
scorefactor = 100
snpgapwin = 10
densesnpwin = 10
densesnps = 2
printfilt = False
minsnpq = 0
minindelq = 0
mincnsq = 0

# getopt specification; 's:' is included so the -s handler is reachable.
OPTION_SPEC = 'pq:d:D:l:Q:w:W:N:G:S:i:C:s:'

# Integer-valued flags and the setting each one overrides.
INT_OPTIONS = {
    '-d': 'mindepth',
    '-D': 'maxdepth',
    '-l': 'gapgapwin',
    '-Q': 'minsnpmapq',
    '-q': 'mingapmapq',
    '-G': 'minindelscore',
    '-s': 'scorefactor',
    '-w': 'snpgapwin',
    '-W': 'densesnpwin',
    '-C': 'mincnsq',
    '-N': 'densesnps',
    '-S': 'minsnpq',
    '-i': 'minindelq',
}


def parse_options(argv):
    """Parse `argv` (without the program name).

    Returns (cfg, args): `cfg` maps setting names to values, starting from the
    module-level defaults; `args` holds the positional arguments.
    Raises getopt.GetoptError for unknown flags and ValueError for a
    non-integer option value.
    """
    options, args = getopt.gnu_getopt(argv, OPTION_SPEC, [])
    cfg = {
        'mindepth': mindepth, 'maxdepth': maxdepth, 'gapgapwin': gapgapwin,
        'minsnpmapq': minsnpmapq, 'mingapmapq': mingapmapq,
        'minindelscore': minindelscore, 'scorefactor': scorefactor,
        'snpgapwin': snpgapwin, 'densesnpwin': densesnpwin,
        'densesnps': densesnps, 'printfilt': printfilt, 'minsnpq': minsnpq,
        'minindelq': minindelq, 'mincnsq': mincnsq,
    }
    for (oflag, oarg) in options:
        if oflag == '-p':
            cfg['printfilt'] = True
        else:
            cfg[INT_OPTIONS[oflag]] = int(oarg)
    return cfg, args


def filter_pileup(inp, cfg):
    """Filter the pileup lines of the iterable `inp` using the settings in `cfg`.

    Records are kept in a staging list while a later record could still change
    their filter code, then handed to varFilter_aux() in input order.
    """
    gapwin = cfg['gapgapwin']
    snpwin = cfg['snpgapwin']
    densewin = cfg['densesnpwin']
    is_print = cfg['printfilt']

    # calculate the window size
    max_dist = max(gapwin, snpwin, densewin)

    # staging entries: [score, flt, dlen, is_snp] + pileup fields
    staging = []
    for line in inp:
        t = line.strip().split()
        if not t:
            continue  # blank line: nothing to filter
        (flt, score) = (0, -1)
        # non-var sites
        if t[3] == '*/*':
            continue
        is_snp = t[2].upper() != t[3].upper()
        if not (is_snp or cfg['mincnsq']):
            continue
        pos = int(t[1])
        # clear the out-of-range elements
        while staging:
            # Still on the same chromosome and the first element's window still affects this position?
            if staging[0][4] == t[0] and int(staging[0][5]) + staging[0][2] + max_dist >= pos:
                break
            varFilter_aux(staging.pop(0), is_print)

        # first a simple filter
        if int(t[7]) < cfg['mindepth']:
            flt = 2
        elif int(t[7]) > cfg['maxdepth']:
            flt = 3
        if t[2] == '*':  # an indel
            if cfg['minindelq'] and cfg['minindelq'] > int(t[5]):
                flt = 8
        elif is_snp:
            if cfg['minsnpq'] and cfg['minsnpq'] > int(t[5]):
                flt = 7
        else:
            if cfg['mincnsq'] and cfg['mincnsq'] > int(t[4]):
                flt = 9

        # site dependent filters
        dlen = 0
        if flt == 0:
            # Only staged entries whose is_snp field is true take part below.
            variants = [y for y in staging if y[3]]
            if t[2] == '*':  # an indel
                # If deletion, remember the length of the deletion
                (a, b) = t[3].split('/')
                alen = len(a) - 1
                blen = len(b) - 1
                if alen > blen:
                    if a[0] == '-':
                        dlen = alen
                elif b[0] == '-':
                    dlen = blen

                if int(t[6]) < cfg['mingapmapq']:
                    flt = 1
                # filtering SNPs
                if int(t[5]) >= cfg['minindelscore']:
                    for x in variants:
                        # Is it a SNP and is it outside the SNP filter window?
                        if x[0] >= 0 or int(x[5]) + x[2] + snpwin < pos:
                            continue
                        if x[1] == 0:
                            x[1] = 5

                # calculate the filtering score (different from indel quality)
                score = int(t[5])
                if t[8] != '*':
                    score += cfg['scorefactor'] * int(t[10])
                if t[9] != '*':
                    score += cfg['scorefactor'] * int(t[11])
                # check the staging list for indel filtering
                for x in variants:
                    # Is it a SNP and is it outside the gap filter window
                    if x[0] < 0 or int(x[5]) + x[2] + gapwin < pos:
                        continue
                    if x[0] < score:
                        x[1] = 6
                    else:
                        flt = 6
                        break
            else:  # a SNP or hom-ref
                if int(t[6]) < cfg['minsnpmapq']:
                    flt = 1
                # check adjacent SNPs
                k = 1
                for x in variants:
                    if x[0] < 0 and int(x[5]) + x[2] + densewin >= pos and (x[1] == 0 or x[1] == 4 or x[1] == 5):
                        k += 1

                # filtering is necessary
                if k > cfg['densesnps']:
                    flt = 4
                    for x in variants:
                        if x[0] < 0 and int(x[5]) + x[2] + densewin >= pos and x[1] == 0:
                            x[1] = 4
                else:  # then check gap filter
                    for x in variants:
                        if x[0] < 0 or int(x[5]) + x[2] + snpwin < pos:
                            continue
                        if x[0] >= cfg['minindelscore']:
                            flt = 5
                            break

        staging.append([score, flt, dlen, is_snp] + t)

    # output the last few elements in the staging list
    while staging:
        varFilter_aux(staging.pop(0), is_print)


def main(argv=None):
    """Command-line entry point; `argv` defaults to sys.argv[1:].

    Reads the pileup from the first positional argument, or from stdin when
    none is given. Prints the usage and exits with status 2 on a bad option.
    """
    if argv is None:
        argv = sys.argv[1:]
    try:
        cfg, args = parse_options(argv)
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    if len(args) < 1:
        filter_pileup(sys.stdin, cfg)
    else:
        # the with-block closes the input file even if filtering fails
        with open(args[0]) as inp:
            filter_pileup(inp, cfg)


if __name__ == '__main__':
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/misc/vcfutils.lua Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,694 @@ +#!/usr/bin/env luajit + +----------------------------------- +-- BEGIN: routines from klib.lua -- +----------------------------------- + +-- Description: getopt() translated from the BSD getopt(); compatible with the default Unix getopt() +--[[ Example: + for o, a in os.getopt(arg, 'a:b') do + print(o, a) + end +]]-- +function os.getopt(args, ostr) + local arg, place = nil, 0; + return function () + if place == 0 then -- update scanning pointer + place = 1 + if #args == 0 or args[1]:sub(1, 1) ~= '-' then place = 0; return nil end + if #args[1] >= 2 then + place = place + 1 + if args[1]:sub(2, 2) == '-' then -- found "--" + table.remove(args, 1); + place = 0 + return nil; + end + end + end + local optopt = place <= #args[1] and args[1]:sub(place, place) or nil + place = place + 1; + local oli = optopt and ostr:find(optopt) or nil + if optopt == ':' or oli == nil then -- unknown option + if optopt == '-' then return nil end + if place > #args[1] then + table.remove(args, 1); + place = 0; + end + return '?'; + end + oli = oli + 1; + if ostr:sub(oli, oli) ~= ':' then -- do not need argument + arg = nil; + if place > #args[1] then + table.remove(args, 1); + place = 0; + end + else -- need an argument + if place <= #args[1] then -- no white space + arg = args[1]:sub(place); + else + table.remove(args, 1); + if #args == 0 then -- an option requiring argument is the last one + place = 0; + if ostr:sub(1, 1) == ':' then return ':' end + return '?'; + else arg = args[1] end + end + table.remove(args, 1); + place = 0; + end + return optopt, arg; + end +end + +-- Description: string split +function string:split(sep, n) + local a, start = {}, 1; + sep = sep or "%s+"; + repeat + local b, e = self:find(sep, start); + if b == nil then + table.insert(a, self:sub(start)); + break + end + a[#a+1] = self:sub(start, b - 1); + start = e + 
1; + if n and #a == n then + table.insert(a, self:sub(start)); + break + end + until start > #self; + return a; +end + +-- Description: smart file open +function io.xopen(fn, mode) + mode = mode or 'r'; + if fn == nil then return io.stdin; + elseif fn == '-' then return (mode == 'r' and io.stdin) or io.stdout; + elseif fn:sub(-3) == '.gz' then return (mode == 'r' and io.popen('gzip -dc ' .. fn, 'r')) or io.popen('gzip > ' .. fn, 'w'); + elseif fn:sub(-4) == '.bz2' then return (mode == 'r' and io.popen('bzip2 -dc ' .. fn, 'r')) or io.popen('bgzip2 > ' .. fn, 'w'); + else return io.open(fn, mode) end +end + +-- Description: log gamma function +-- Required by: math.lbinom() +-- Reference: AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245 +function math.lgamma(z) + local x; + x = 0.1659470187408462e-06 / (z+7); + x = x + 0.9934937113930748e-05 / (z+6); + x = x - 0.1385710331296526 / (z+5); + x = x + 12.50734324009056 / (z+4); + x = x - 176.6150291498386 / (z+3); + x = x + 771.3234287757674 / (z+2); + x = x - 1259.139216722289 / (z+1); + x = x + 676.5203681218835 / z; + x = x + 0.9999999999995183; + return math.log(x) - 5.58106146679532777 - z + (z-0.5) * math.log(z+6.5); +end + +-- Description: regularized incomplete gamma function +-- Dependent on: math.lgamma() +--[[ + Formulas are taken from Wiki, with additional input from Numerical + Recipes in C (for modified Lentz's algorithm) and AS245 + (http://lib.stat.cmu.edu/apstat/245). + + A good online calculator is available at: + + http://www.danielsoper.com/statcalc/calc23.aspx + + It calculates upper incomplete gamma function, which equals + math.igamma(s,z,true)*math.exp(math.lgamma(s)) +]]-- +function math.igamma(s, z, complement) + + local function _kf_gammap(s, z) + local sum, x = 1, 1; + for k = 1, 100 do + x = x * z / (s + k); + sum = sum + x; + if x / sum < 1e-14 then break end + end + return math.exp(s * math.log(z) - z - math.lgamma(s + 1.) 
+ math.log(sum)); + end + + local function _kf_gammaq(s, z) + local C, D, f, TINY; + f = 1. + z - s; C = f; D = 0.; TINY = 1e-290; + -- Modified Lentz's algorithm for computing continued fraction. See Numerical Recipes in C, 2nd edition, section 5.2 + for j = 1, 100 do + local d; + local a, b = j * (s - j), j*2 + 1 + z - s; + D = b + a * D; + if D < TINY then D = TINY end + C = b + a / C; + if C < TINY then C = TINY end + D = 1. / D; + d = C * D; + f = f * d; + if math.abs(d - 1) < 1e-14 then break end + end + return math.exp(s * math.log(z) - z - math.lgamma(s) - math.log(f)); + end + + if complement then + return ((z <= 1 or z < s) and 1 - _kf_gammap(s, z)) or _kf_gammaq(s, z); + else + return ((z <= 1 or z < s) and _kf_gammap(s, z)) or (1 - _kf_gammaq(s, z)); + end +end + +function math.brent(func, a, b, tol) + local gold1, gold2, tiny, max_iter = 1.6180339887, 0.3819660113, 1e-20, 100 + + local fa, fb = func(a, data), func(b, data) + if fb > fa then -- swap, such that f(a) > f(b) + a, b, fa, fb = b, a, fb, fa + end + local c = b + gold1 * (b - a) + local fc = func(c) -- golden section extrapolation + while fb > fc do + local bound = b + 100.0 * (c - b) -- the farthest point where we want to go + local r = (b - a) * (fb - fc) + local q = (b - c) * (fb - fa) + if math.abs(q - r) < tiny then -- avoid 0 denominator + tmp = q > r and tiny or 0.0 - tiny + else tmp = q - r end + u = b - ((b - c) * q - (b - a) * r) / (2.0 * tmp) -- u is the parabolic extrapolation point + if (b > u and u > c) or (b < u and u < c) then -- u lies between b and c + fu = func(u) + if fu < fc then -- (b,u,c) bracket the minimum + a, b, fa, fb = b, u, fb, fu + break + elseif fu > fb then -- (a,b,u) bracket the minimum + c, fc = u, fu + break + end + u = c + gold1 * (c - b) + fu = func(u) -- golden section extrapolation + elseif (c > u and u > bound) or (c < u and u < bound) then -- u lies between c and bound + fu = func(u) + if fu < fc then -- fb > fc > fu + b, c, u = c, u, c + gold1 * (c - 
b) + fb, fc, fu = fc, fu, func(u) + else -- (b,c,u) bracket the minimum + a, b, c = b, c, u + fa, fb, fc = fb, fc, fu + break + end + elseif (u > bound and bound > c) or (u < bound and bound < c) then -- u goes beyond the bound + u = bound + fu = func(u) + else -- u goes the other way around, use golden section extrapolation + u = c + gold1 * (c - b) + fu = func(u) + end + a, b, c = b, c, u + fa, fb, fc = fb, fc, fu + end + if a > c then a, c = c, a end -- swap + + -- now, a<b<c, fa>fb and fb<fc, move on to Brent's algorithm + local e, d = 0, 0 + local w, v, fw, fv + w, v = b, b + fw, fv = fb, fb + for iter = 1, max_iter do + local mid = 0.5 * (a + c) + local tol1 = tol * math.abs(b) + tiny + local tol2 = 2.0 * tol1 + if math.abs(b - mid) <= tol2 - 0.5 * (c - a) then return fb, b end -- found + if math.abs(e) > tol1 then + -- related to parabolic interpolation + local r = (b - w) * (fb - fv) + local q = (b - v) * (fb - fw) + local p = (b - v) * q - (b - w) * r + q = 2.0 * (q - r) + if q > 0.0 then p = 0.0 - p + else q = 0.0 - q end + eold, e = e, d + if math.abs(p) >= math.abs(0.5 * q * eold) or p <= q * (a - b) or p >= q * (c - b) then + e = b >= mid and a - b or c - b + d = gold2 * e + else + d, u = p / q, b + d -- actual parabolic interpolation happens here + if u - a < tol2 or c - u < tol2 then + d = mid > b and tol1 or 0.0 - tol1 + end + end + else -- golden section interpolation + e = b >= min and a - b or c - b + d = gold2 * e + end + u = fabs(d) >= tol1 and b + d or b + (d > 0.0 and tol1 or -tol1); + fu = func(u) + if fu <= fb then -- u is the minimum point so far + if u >= b then a = b + else c = b end + v, w, b = w, b, u + fv, fw, fb = fw, fb, fu + else -- adjust (a,c) and (u,v,w) + if u < b then a = u + else c = u end + if fu <= fw or w == b then + v, w = w, u + fv, fw = fw, fu + elseif fu <= fv or v == b or v == w then + v, fv = u, fu; + end + end + end + return fb, b +end + +matrix = {} + +-- Description: chi^2 test for contingency tables +-- Dependent 
on: math.igamma() +function matrix.chi2(a) + if #a == 2 and #a[1] == 2 then -- 2x2 table + local x, z + x = (a[1][1] + a[1][2]) * (a[2][1] + a[2][2]) * (a[1][1] + a[2][1]) * (a[1][2] + a[2][2]) + if x == 0 then return 0, 1, false end + z = a[1][1] * a[2][2] - a[1][2] * a[2][1] + z = (a[1][1] + a[1][2] + a[2][1] + a[2][2]) * z * z / x + return z, math.igamma(.5, .5 * z, true), true + else -- generic table + local rs, cs, n, m, N, z = {}, {}, #a, #a[1], 0, 0 + for i = 1, n do rs[i] = 0 end + for j = 1, m do cs[j] = 0 end + for i = 1, n do -- compute column sum and row sum + for j = 1, m do cs[j], rs[i] = cs[j] + a[i][j], rs[i] + a[i][j] end + end + for i = 1, n do N = N + rs[i] end + for i = 1, n do -- compute the chi^2 statistics + for j = 1, m do + local E = rs[i] * cs[j] / N; + z = z + (a[i][j] - E) * (a[i][j] - E) / E + end + end + return z, math.igamma(.5 * (n-1) * (m-1), .5 * z, true), true; + end +end + +--------------------------------- +-- END: routines from klib.lua -- +--------------------------------- + + +-------------------------- +-- BEGIN: misc routines -- +-------------------------- + +-- precompute an array for PL->probability conversion +-- @param m maximum PL +function algo_init_q2p(m) + local q2p = {} + for i = 0, m do + q2p[i] = math.pow(10, -i / 10) + end + return q2p +end + +-- given the haplotype frequency, compute r^2 +-- @param f 4 haplotype frequencies; f[] is 0-indexed. 
+-- @return r^2 +function algo_r2(f) + local p = { f[0] + f[1], f[0] + f[2] } + local D = f[0] * f[3] - f[1] * f[2] + return (p[1] == 0 or p[2] == 0 or 1-p[1] == 0 or 1-p[2] == 0) and 0 or D * D / (p[1] * p[2] * (1 - p[1]) * (1 - p[2])) +end + +-- parse a VCF line to get PL +-- @param q2p is computed by algo_init_q2p() +function text_parse_pl(t, q2p, parse_GT) + parse_GT = parse_GT == nil and true or false + local ht, gt, pl = {}, {}, {} + local s, j0 = t[9]:split(':'), 0 + for j = 1, #s do + if s[j] == 'PL' then j0 = j break end + end + local has_GT = (s[1] == 'GT' and parse_GT) and true or false + for i = 10, #t do + if j0 > 0 then + local s = t[i]:split(':') + local a, b = 1, s[j0]:find(',') + pl[#pl+1] = q2p[tonumber(s[j0]:sub(a, b - 1))] + a, b = b + 1, s[j0]:find(',', b + 1) + pl[#pl+1] = q2p[tonumber(s[j0]:sub(a, b - 1))] + a, b = b + 1, s[j0]:find(',', b + 1) + pl[#pl+1] = q2p[tonumber(s[j0]:sub(a, (b and b - 1) or nil))] + end + if has_GT then + if t[i]:sub(1, 1) ~= '.' then + local g = tonumber(t[i]:sub(1, 1)) + tonumber(t[i]:sub(3, 3)); + gt[#gt+1] = 1e-6; gt[#gt+1] = 1e-6; gt[#gt+1] = 1e-6 + gt[#gt - 2 + g] = 1 + ht[#ht+1] = tonumber(t[i]:sub(1, 1)); ht[#ht+1] = tonumber(t[i]:sub(3, 3)); + else + gt[#gt+1] = 1; gt[#gt+1] = 1; gt[#gt+1] = 1 + ht[#ht+1] = -1; ht[#ht+1] = -1; + end + end +-- print(t[i], pl[#pl-2], pl[#pl-1], pl[#pl], gt[#gt-2], gt[#gt-1], gt[#gt]) + end + if #pl == 0 then pl = nil end + local x = has_GT and { t[1], t[2], ht, gt, pl } or { t[1], t[2], nil, nil, pl } + return x +end + +-- Infer haplotype frequency +-- @param pdg genotype likelihoods P(D|g) generated by text_parse_pl(). pdg[] is 1-indexed. 
-- @param eps precision [1e-5]
-- @return 2-locus haplotype frequencies, 0-indexed array
--
-- pdg[1] and pdg[2] are flat arrays holding three values per sample for the
-- first and the second locus. The four frequencies start at 0.25 each and are
-- re-estimated for at most 100 rounds; iteration stops early once the largest
-- absolute change of any frequency drops below eps.
function algo_hapfreq2(pdg, eps)
    eps = eps or 1e-5
    local n, f = #pdg[1] / 3, {[0]=0.25, 0.25, 0.25, 0.25}
    for iter = 1, 100 do
        local F = {[0]=0, 0, 0, 0}
        for i = 0, n - 1 do
            -- per-sample triplets for locus 1 and locus 2, re-indexed from 0
            local p1, p2 = {[0]=pdg[1][i*3+1], pdg[1][i*3+2], pdg[1][i*3+3]}, {[0]=pdg[2][i*3+1], pdg[2][i*3+2], pdg[2][i*3+3]}
            local u = { [0]=
                f[0] * (f[0] * p1[0] * p2[0] + f[1] * p1[0] * p2[1] + f[2] * p1[1] * p2[0] + f[3] * p1[1] * p2[1]),
                f[1] * (f[0] * p1[0] * p2[1] + f[1] * p1[0] * p2[2] + f[2] * p1[1] * p2[1] + f[3] * p1[1] * p2[2]),
                f[2] * (f[0] * p1[1] * p2[0] + f[1] * p1[1] * p2[1] + f[2] * p1[2] * p2[0] + f[3] * p1[2] * p2[1]),
                f[3] * (f[0] * p1[1] * p2[1] + f[1] * p1[1] * p2[2] + f[2] * p1[2] * p2[1] + f[3] * p1[2] * p2[2])
            }
            -- normalise this sample's contribution so that F sums to 1 over all samples
            local s = u[0] + u[1] + u[2] + u[3]
            s = 1 / (s * n)
            F[0] = F[0] + u[0] * s
            F[1] = F[1] + u[1] * s
            F[2] = F[2] + u[2] * s
            F[3] = F[3] + u[3] * s
        end
        -- e: largest absolute change of a frequency in this round
        local e = 0
        for k = 0, 3 do
            e = math.abs(f[k] - F[k]) > e and math.abs(f[k] - F[k]) or e
        end
        for k = 0, 3 do f[k] = F[k] end
        if e < eps then break end
--      print(f[0], f[1], f[2], f[3])
    end
    return f
end

------------------------
-- END: misc routines --
------------------------


---------------------
-- BEGIN: commands --
---------------------

-- CMD vcf2bgl: convert PL tagged VCF to Beagle input --
-- Reads arg[1]; for every biallelic SNP whose FORMAT column mentions PL it
-- prints "chr:pos", REF, ALT and three likelihoods per sample.
function cmd_vcf2bgl()
    if #arg == 0 then
        print("\nUsage: vcf2bgl.lua <in.vcf>")
        print("\nNB: This command finds PL by matching /(\\d+),(\\d+),(\\d+)/.\n");
        os.exit(1)
    end

    -- lookup[q] = 10^(-q/10) formatted with four decimals.
    -- The '^' operator is used instead of math.pow, which is deprecated since Lua 5.3.
    local lookup = {}
    for i = 0, 10000 do lookup[i] = string.format("%.4f", 10 ^ (-i/10)) end
    -- PL values beyond the table would yield nil and break table.concat();
    -- at four decimals such likelihoods print as zero anyway.
    local function pl2prob(q) return lookup[q] or '0.0000' end

    local fp = io.xopen(arg[1])
    for l in fp:lines() do
        if l:sub(1, 2) == '##' then -- meta lines; do nothing
        elseif l:sub(1, 1) == '#' then -- sample lines
            local t, s = l:split('\t'), {}
            -- each sample name is repeated three times, once per likelihood column
            for i = 10, #t do s[#s+1] = t[i]; s[#s+1] = t[i]; s[#s+1] = t[i] end
            print('marker', 'alleleA', 'alleleB', table.concat(s, '\t'))
        else -- data line
            local t = l:split('\t');
            if t[5] ~= '.' and t[5]:find(",") == nil and #t[5] == 1 and #t[4] == 1 then -- biallelic SNP
                local z = {};
                if t[9]:find('PL') then
                    for i = 10, #t do
                        local AA, Aa, aa = t[i]:match('(%d+),(%d+),(%d+)')
                        AA = tonumber(AA); Aa = tonumber(Aa); aa = tonumber(aa);
                        if AA ~= nil then
                            z[#z+1] = pl2prob(AA); z[#z+1] = pl2prob(Aa); z[#z+1] = pl2prob(aa);
                        else z[#z+1] = 1; z[#z+1] = 1; z[#z+1] = 1; end -- no PL found: all three set to 1
                    end
                    print(t[1]..':'..t[2], t[4], t[5], table.concat(z, '\t'))
                elseif t[9]:find('GL') then
                    print('Error: not implemented')
                    os.exit(1)
                end
            end
        end
    end
    fp:close()
end

-- CMD bgl2vcf: convert Beagle output to VCF
-- Walks <in.phased> (arg[1]) and <in.gprobs> (arg[2]) in lock-step, one line each.
function cmd_bgl2vcf()
    if #arg < 2 then
        print('Usage: bgl2vcf.lua <in.phased> <in.gprobs>')
        os.exit(1)
    end

    local fpp = io.xopen(arg[1]);
    local fpg = io.xopen(arg[2]);
    for lg in fpg:lines() do
        local lp = fpp:read()
        if lp == nil then -- the phased file ran out before the gprobs file
            io.stderr:write('ERROR: "' .. arg[1] .. '" has fewer lines than "' .. arg[2] .. '"\n')
            os.exit(1)
        end
        local tp, tg, a = lp:split('%s'), lg:split('%s', 4), {}
        if tp[1] == 'I' then -- header line: every second column from the 3rd on is a sample name
            for i = 3, #tp, 2 do a[#a+1] = tp[i] end
            print('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', table.concat(a, '\t'))
        else
            local chr, pos = tg[1]:match('(%S+):(%d+)$')
            a = {chr, pos, '.', tg[2], tg[3], 30, '.', '.', 'GT'}
            for i = 3, #tp, 2 do
                -- an allele equal to tg[2] is written as 0, anything else as 1
                a[#a+1] = ((tp[i] == tg[2] and 0) or 1) .. '|' .. ((tp[i+1] == tg[2] and 0) or 1)
            end
            print(table.concat(a, '\t'))
        end
    end
    fpg:close(); fpp:close();
end

-- CMD freq: count alleles in each population
function cmd_freq()
    -- parse the command line
    local site_only = true; -- true: print per-site counts; false ('-s'): print the spectrum
    for c in os.getopt(arg, 's') do
        if c == 's' then site_only = false end
    end
    if #arg == 0 then
        print("\nUsage: vcfutils.lua freq [-s] <in.vcf> [samples.txt]\n")
        print("NB: 1) This command only considers biallelic variants.")
        print(" 2) Apply '-s' to get the allele frequency spectrum.")
        print(" 3) 'samples.txt' is TAB-delimited with each line consisting of sample and population.")
        print("")
        os.exit(1)
    end

    -- read the sample-population pairs
    local pop, sample = {}, {}
    if #arg > 1 then
        local fp = io.xopen(arg[2]);
        for l in fp:lines() do
            local s, p = l:match("^(%S+)%s+(%S+)"); -- sample, population pair
            if s ~= nil then -- skip blank/malformed lines instead of indexing a table with nil
                sample[s] = p; -- FIXME: check duplications
                if pop[p] then table.insert(pop[p], s)
                else pop[p] = {s} end
            end
        end
        fp:close();
    end
    pop['NA'] = {}

    -- parse VCF
    -- <in.vcf> is always the first argument. The previous test (#arg >= 2)
    -- silently read stdin whenever samples.txt was omitted. '-' selects stdin.
    local fp = (arg[1] ~= '-' and io.xopen(arg[1])) or io.stdin;
    local col, cnt = {}, {};
    for k in pairs(pop) do
        col[k], cnt[k] = {}, {[0]=0};
    end
    for l in fp:lines() do
        if l:sub(1, 2) == '##' then -- meta lines; do nothing
        elseif l:sub(1, 1) == '#' then -- the sample line
            local t, del_NA = l:split('\t'), true;
            for i = 10, #t do
                local k = sample[t[i]]
                if k == nil then -- sample without a population goes to 'NA'
                    k, del_NA = 'NA', false
                    table.insert(pop[k], t[i])
                end
                table.insert(col[k], i);
                -- two slots per sample, so #cnt[k] equals the number of chromosomes
                table.insert(cnt[k], 0);
                table.insert(cnt[k], 0);
            end
            if del_NA then pop['NA'], col['NA'], cnt['NA'] = nil, nil, nil end
        else -- data lines
            local t = l:split('\t');
            if t[5] ~= '.' and t[5]:find(",") == nil then -- biallelic
                if site_only == true then io.write(t[1], '\t', t[2], '\t', t[4], '\t', t[5]) end
                for k, v in pairs(col) do
                    local ac, an = 0, 0;
                    for i = 1, #v do
                        local a1, a2 = t[v[i]]:match("^(%d).(%d)");
                        if a1 ~= nil then ac, an = ac + a1 + a2, an + 2 end
                    end
                    if site_only == true then io.write('\t', k, ':', an, ':', ac) end
                    -- only sites where every sample of the population is genotyped enter the spectrum
                    if an == #cnt[k] then cnt[k][ac] = cnt[k][ac] + 1 end
                end
                if site_only == true then io.write('\n') end
            end
        end
    end
    fp:close();

    -- print
    if site_only == false then
        for k, v in pairs(cnt) do
            io.write(k .. "\t" .. #v);
            for i = 0, #v do io.write("\t" .. v[i]) end
            io.write('\n');
        end
    end
end

-- CMD vcf2chi2: chi-square test of allele counts between two groups of samples
function cmd_vcf2chi2()
    if #arg < 3 then
        print("Usage: vcfutils.lua vcf2chi2 <in.vcf> <group1.list> <group2.list>");
        os.exit(1)
    end

    local g = {};

    -- read the list of groups (first word of every line; blank lines are skipped)
    local fp = io.xopen(arg[2]);
    for l in fp:lines() do local x = l:match("^(%S+)"); if x then g[x] = 1 end end -- FIXME: check duplicate
    fp:close()
    fp = io.xopen(arg[3]);
    for l in fp:lines() do local x = l:match("^(%S+)"); if x then g[x] = 2 end end
    fp:close()

    -- process VCF
    fp = io.xopen(arg[1])
    local h = {{}, {}} -- h[i]: VCF column indices of the samples in group i
    for l in fp:lines() do
        if l:sub(1, 2) == '##' then print(l) -- meta lines; copied through unchanged
        elseif l:sub(1, 1) == '#' then -- sample lines
            local t = l:split('\t');
            for i = 10, #t do
                if g[t[i]] == 1 then table.insert(h[1], i)
                elseif g[t[i]] == 2 then table.insert(h[2], i) end
            end
            while #t > 8 do table.remove(t) end -- keep the 8 fixed columns only
            print(table.concat(t, "\t"))
        else -- data line
            local t = l:split('\t');
            if t[5] ~= '.' and t[5]:find(",") == nil then -- biallelic
                -- a[i] = {count of allele 0, count of allele 1} in group i
                local a = {{0, 0}, {0, 0}}
                for i = 1, 2 do
                    for _, k in pairs(h[i]) do
                        if t[k]:find("^0.0") then a[i][1] = a[i][1] + 2
                        elseif t[k]:find("^1.1") then a[i][2] = a[i][2] + 2
                        elseif t[k]:find("^0.1") or t[k]:find("^1.0") then
                            a[i][1], a[i][2] = a[i][1] + 1, a[i][2] + 1
                        end
                    end
                end
                local chi2, p, succ = matrix.chi2(a);
                while #t > 8 do table.remove(t) end
                --print(a[1][1], a[1][2], a[2][1], a[2][2], chi2, p);
                if succ then print(table.concat(t, "\t") .. ";PCHI2=" .. string.format("%.3g", p)
                    .. string.format(';AF1=%.4g;AF2=%.4g,%.4g', (a[1][2]+a[2][2]) / (a[1][1]+a[1][2]+a[2][1]+a[2][2]),
                    a[1][2]/(a[1][1]+a[1][2]), a[2][2]/(a[2][1]+a[2][2])))
                else print(table.concat(t, "\t")) end
            end
        end
    end
    fp:close()
end

-- CMD: compute r^2
-- For every biallelic site, prints r^2 against each of the previous w kept
-- sites (0 where no such site exists yet). '-g' uses field 4 of the parsed
-- record instead of field 5, '-h' uses field 3 (phased genotypes).
function cmd_r2()
    local w, is_ht, is_gt = 1, false, false
    for o, a in os.getopt(arg, 'w:hg') do
        if o == 'w' then w = tonumber(a)
        elseif o == 'h' then is_ht, is_gt = true, true
        elseif o == 'g' then is_gt = true
        end
    end
    if #arg == 0 then
        print("Usage: vcfutils.lua r2 [-hg] [-w 1] <in.vcf>")
        os.exit(1)
    end
    local stack, fp, q2p = {}, io.xopen(arg[1]), algo_init_q2p(1023)
    for l in fp:lines() do
        if l:sub(1, 1) ~= '#' then
            local t = l:split('\t')
            local x = text_parse_pl(t, q2p)
            if #t[5] == 1 and t[5] ~= '.' then -- biallelic
                local r2 = {}
                for k = 1, w do
                    if is_gt == false then -- use PL
                        if stack[k] then
                            local pdg = { stack[k][5], x[5] }
                            r2[#r2+1] = algo_r2(algo_hapfreq2(pdg))
                        else r2[#r2+1] = 0 end
                    elseif is_ht == false then -- use unphased GT
                        if stack[k] then
                            local pdg = { stack[k][4], x[4] }
                            r2[#r2+1] = algo_r2(algo_hapfreq2(pdg))
                        else r2[#r2+1] = 0 end
                    else -- use phased GT
                        if stack[k] then
                            -- count the four two-locus haplotypes directly
                            local f, ht = { [0]=0, 0, 0, 0 }, { stack[k][3], x[3] }
                            for i = 1, #ht[1] do
                                local j = ht[1][i] * 2 + ht[2][i]
                                f[j] = f[j] + 1
                            end
                            local sum = f[0] + f[1] + f[2] + f[3]
                            for m = 0, 3 do f[m] = f[m] / sum end
                            r2[#r2+1] = algo_r2(f)
                        else r2[#r2+1] = 0 end
                    end
                end
                for k = 1, #r2 do
                    r2[k] = string.format('%.3f', r2[k])
                end
                print(x[1], x[2], table.concat(r2, '\t'))
                -- keep at most w sites in the window
                if #stack == w then table.remove(stack, 1) end
                stack[#stack+1] = x
            end
        end
    end
    fp:close()
end

-------------------
-- END: commands --
-------------------


-------------------
-- MAIN FUNCTION --
-------------------

if #arg == 0 then
    print("\nUsage: vcfutils.lua <command> <arguments>\n")
    print("Command: freq count biallelic alleles in each population")
    print(" r2 compute r^2")
    print(" vcf2chi2 compute 1-degree chi-square between two groups of samples")
    print(" vcf2bgl convert PL annotated VCF to Beagle input")
    print(" bgl2vcf convert Beagle output to VCF")
    print("")
    os.exit(1)
end

-- the first argument selects the command; the remaining ones are left in 'arg' for it
local cmd = arg[1]
table.remove(arg, 1)
if cmd == 'vcf2bgl' then cmd_vcf2bgl()
elseif cmd == 'bgl2vcf' then cmd_bgl2vcf()
elseif cmd == 'freq' then cmd_freq()
elseif cmd == 'r2' then cmd_r2()
elseif cmd == 'vcf2chi2' then cmd_vcf2chi2()
else
    print('ERROR: unknown command "' .. cmd .. '"')
    os.exit(1)
end
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/misc/wgsim.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,419 @@ +/* The MIT License + + Copyright (c) 2008 Genome Research Ltd (GRL). + 2011 Heng Li <lh3@live.co.uk> + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* This program is separated from maq's read simulator with Colin + * Hercus' modification to allow longer indels. 
*/ + +#include <stdlib.h> +#include <math.h> +#include <time.h> +#include <assert.h> +#include <stdio.h> +#include <unistd.h> +#include <stdint.h> +#include <ctype.h> +#include <string.h> +#include <zlib.h> +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +#define PACKAGE_VERSION "0.3.0" + +const uint8_t nst_nt4_table[256] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +}; + +/* Simple normal random number generator, copied from genran.c */ + +double ran_normal() +{ + static int iset = 0; + static double gset; + double fac, rsq, v1, v2; + if (iset == 0) { + do { + v1 = 2.0 * drand48() - 1.0; + v2 = 2.0 * drand48() - 1.0; + rsq = v1 * v1 + v2 * v2; + } while (rsq >= 1.0 || rsq == 0.0); + fac = sqrt(-2.0 * log(rsq) / rsq); + gset = v1 * fac; + iset = 1; + return v2 * fac; + } else { + iset = 0; + return gset; + } +} + +/* wgsim */ + +enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000}; +typedef unsigned short mut_t; +static mut_t mutmsk = (mut_t)0xf000; + +typedef struct { + int l, m; /* length and maximum buffer size */ + mut_t *s; /* sequence */ +} mutseq_t; + +static double ERR_RATE = 0.02; +static double MUT_RATE = 0.001; +static double INDEL_FRAC = 0.15; +static double INDEL_EXTEND 
= 0.3; +static double MAX_N_RATIO = 0.1; + +void wgsim_mut_diref(const kseq_t *ks, int is_hap, mutseq_t *hap1, mutseq_t *hap2) +{ + int i, deleting = 0; + mutseq_t *ret[2]; + + ret[0] = hap1; ret[1] = hap2; + ret[0]->l = ks->seq.l; ret[1]->l = ks->seq.l; + ret[0]->m = ks->seq.m; ret[1]->m = ks->seq.m; + ret[0]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t)); + ret[1]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t)); + for (i = 0; i != ks->seq.l; ++i) { + int c; + c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)ks->seq.s[i]]; + if (deleting) { + if (drand48() < INDEL_EXTEND) { + if (deleting & 1) ret[0]->s[i] |= DELETE; + if (deleting & 2) ret[1]->s[i] |= DELETE; + continue; + } else deleting = 0; + } + if (c < 4 && drand48() < MUT_RATE) { // mutation + if (drand48() >= INDEL_FRAC) { // substitution + double r = drand48(); + c = (c + (int)(r * 3.0 + 1)) & 3; + if (is_hap || drand48() < 0.333333) { // hom + ret[0]->s[i] = ret[1]->s[i] = SUBSTITUTE|c; + } else { // het + ret[drand48()<0.5?0:1]->s[i] = SUBSTITUTE|c; + } + } else { // indel + if (drand48() < 0.5) { // deletion + if (is_hap || drand48() < 0.333333) { // hom-del + ret[0]->s[i] = ret[1]->s[i] = DELETE; + deleting = 3; + } else { // het-del + deleting = drand48()<0.5?1:2; + ret[deleting-1]->s[i] = DELETE; + } + } else { // insertion + int num_ins = 0, ins = 0; + do { + num_ins++; + ins = (ins << 2) | (int)(drand48() * 4.0); + } while (num_ins < 4 && drand48() < INDEL_EXTEND); + + if (is_hap || drand48() < 0.333333) { // hom-ins + ret[0]->s[i] = ret[1]->s[i] = (num_ins << 12) | (ins << 4) | c; + } else { // het-ins + ret[drand48()<0.5?0:1]->s[i] = (num_ins << 12) | (ins << 4) | c; + } + } + } + } + } +} +void wgsim_print_mutref(const char *name, const kseq_t *ks, mutseq_t *hap1, mutseq_t *hap2) +{ + int i; + for (i = 0; i != ks->seq.l; ++i) { + int c[3]; + c[0] = nst_nt4_table[(int)ks->seq.s[i]]; + c[1] = hap1->s[i]; c[2] = hap2->s[i]; + if (c[0] >= 4) continue; + if ((c[1] & mutmsk) != NOCHANGE || 
(c[2] & mutmsk) != NOCHANGE) { + printf("%s\t%d\t", name, i+1); + if (c[1] == c[2]) { // hom + if ((c[1]&mutmsk) == SUBSTITUTE) { // substitution + printf("%c\t%c\t-\n", "ACGTN"[c[0]], "ACGTN"[c[1]&0xf]); + } else if ((c[1]&mutmsk) == DELETE) { // del + printf("%c\t-\t-\n", "ACGTN"[c[0]]); + } else if (((c[1] & mutmsk) >> 12) <= 5) { // ins + printf("-\t"); + int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4; + while (n > 0) { + putchar("ACGTN"[ins & 0x3]); + ins >>= 2; + n--; + } + printf("\t-\n"); + } else assert(0); + } else { // het + if ((c[1]&mutmsk) == SUBSTITUTE || (c[2]&mutmsk) == SUBSTITUTE) { // substitution + printf("%c\t%c\t+\n", "ACGTN"[c[0]], "XACMGRSVTWYHKDBN"[1<<(c[1]&0x3)|1<<(c[2]&0x3)]); + } else if ((c[1]&mutmsk) == DELETE) { + printf("%c\t-\t+\n", "ACGTN"[c[0]]); + } else if ((c[2]&mutmsk) == DELETE) { + printf("%c\t-\t+\n", "ACGTN"[c[0]]); + } else if (((c[1] & mutmsk) >> 12) <= 4) { // ins1 + printf("-\t"); + int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4; + while (n > 0) { + putchar("ACGTN"[ins & 0x3]); + ins >>= 2; + n--; + } + printf("\t+\n"); + } else if (((c[2] & mutmsk) >> 12) <= 5) { // ins2 + printf("-\t"); + int n = (c[2]&mutmsk) >> 12, ins = c[2] >> 4; + while (n > 0) { + putchar("ACGTN"[ins & 0x3]); + ins >>= 2; + n--; + } + printf("\t+\n"); + } else assert(0); + } + } + } +} + +void wgsim_core(FILE *fpout1, FILE *fpout2, const char *fn, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r) +{ + kseq_t *ks; + mutseq_t rseq[2]; + gzFile fp_fa; + uint64_t tot_len, ii; + int i, l, n_ref; + char *qstr; + int size[2], Q, max_size; + uint8_t *tmp_seq[2]; + mut_t *target; + + l = size_l > size_r? size_l : size_r; + qstr = (char*)calloc(l+1, 1); + tmp_seq[0] = (uint8_t*)calloc(l+2, 1); + tmp_seq[1] = (uint8_t*)calloc(l+2, 1); + size[0] = size_l; size[1] = size_r; + max_size = size_l > size_r? size_l : size_r; + + Q = (ERR_RATE == 0.0)? 
'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33; + + fp_fa = gzopen(fn, "r"); + ks = kseq_init(fp_fa); + tot_len = n_ref = 0; + fprintf(stderr, "[%s] calculating the total length of the reference sequence...\n", __func__); + while ((l = kseq_read(ks)) >= 0) { + tot_len += l; + ++n_ref; + } + fprintf(stderr, "[%s] %d sequences, total length: %llu\n", __func__, n_ref, (long long)tot_len); + kseq_destroy(ks); + gzclose(fp_fa); + + fp_fa = gzopen(fn, "r"); + ks = kseq_init(fp_fa); + while ((l = kseq_read(ks)) >= 0) { + uint64_t n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5); + if (l < dist + 3 * std_dev) { + fprintf(stderr, "[%s] skip sequence '%s' as it is shorter than %d!\n", __func__, ks->name.s, dist + 3 * std_dev); + continue; + } + + // generate mutations and print them out + wgsim_mut_diref(ks, is_hap, rseq, rseq+1); + wgsim_print_mutref(ks->name.s, ks, rseq, rseq+1); + + for (ii = 0; ii != n_pairs; ++ii) { // the core loop + double ran; + int d, pos, s[2], is_flip = 0; + int n_sub[2], n_indel[2], n_err[2], ext_coor[2], j, k; + FILE *fpo[2]; + + do { // avoid boundary failure + ran = ran_normal(); + ran = ran * std_dev + dist; + d = (int)(ran + 0.5); + d = d > max_size? 
d : max_size; + pos = (int)((l - d + 1) * drand48()); + } while (pos < 0 || pos >= ks->seq.l || pos + d - 1 >= ks->seq.l); + + // flip or not + if (drand48() < 0.5) { + fpo[0] = fpout1; fpo[1] = fpout2; + s[0] = size[0]; s[1] = size[1]; + } else { + fpo[1] = fpout1; fpo[0] = fpout2; + s[1] = size[0]; s[0] = size[1]; + is_flip = 1; + } + + // generate the read sequences + target = rseq[drand48()<0.5?0:1].s; // haplotype from which the reads are generated + n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0; + +#define __gen_read(x, start, iter) do { \ + for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < ks->seq.l && k < s[x]; iter) { \ + int c = target[i], mut_type = c & mutmsk; \ + if (ext_coor[x] < 0) { \ + if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \ + ext_coor[x] = i; \ + } \ + if (mut_type == DELETE) ++n_indel[x]; \ + else if (mut_type == NOCHANGE || mut_type == SUBSTITUTE) { \ + tmp_seq[x][k++] = c & 0xf; \ + if (mut_type == SUBSTITUTE) ++n_sub[x]; \ + } else { \ + int n, ins; \ + ++n_indel[x]; \ + tmp_seq[x][k++] = c & 0xf; \ + for (n = mut_type>>12, ins = c>>4; n > 0 && k < s[x]; --n, ins >>= 2) \ + tmp_seq[x][k++] = ins & 0x3; \ + } \ + } \ + if (k != s[x]) ext_coor[x] = -10; \ + } while (0) + + __gen_read(0, pos, ++i); + __gen_read(1, pos + d - 1, --i); + for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 
3 - tmp_seq[1][k] : 4; // complement + if (ext_coor[0] < 0 || ext_coor[1] < 0) { // fail to generate the read(s) + --ii; + continue; + } + + // generate sequencing errors + for (j = 0; j < 2; ++j) { + int n_n = 0; + for (i = 0; i < s[j]; ++i) { + int c = tmp_seq[j][i]; + if (c >= 4) { // actually c should be never larger than 4 if everything is correct + c = 4; + ++n_n; + } else if (drand48() < ERR_RATE) { + // c = (c + (int)(drand48() * 3.0 + 1)) & 3; // random sequencing errors + c = (c + 1) & 3; // recurrent sequencing errors + ++n_err[j]; + } + tmp_seq[j][i] = c; + } + if ((double)n_n / s[j] > MAX_N_RATIO) break; + } + if (j < 2) { // too many ambiguous bases on one of the reads + --ii; + continue; + } + + // print + for (j = 0; j < 2; ++j) { + for (i = 0; i < s[j]; ++i) qstr[i] = Q; + qstr[i] = 0; + fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", ks->name.s, ext_coor[0]+1, ext_coor[1]+1, + n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1], + (long long)ii, j==0? 
is_flip+1 : 2-is_flip); + for (i = 0; i < s[j]; ++i) + fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]); + fprintf(fpo[j], "\n+\n%s\n", qstr); + } + } + free(rseq[0].s); free(rseq[1].s); + } + kseq_destroy(ks); + gzclose(fp_fa); + free(qstr); + free(tmp_seq[0]); free(tmp_seq[1]); +} + +static int simu_usage() +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Program: wgsim (short read simulator)\n"); + fprintf(stderr, "Version: %s\n", PACKAGE_VERSION); + fprintf(stderr, "Contact: Heng Li <lh3@sanger.ac.uk>\n\n"); + fprintf(stderr, "Usage: wgsim [options] <in.ref.fa> <out.read1.fq> <out.read2.fq>\n\n"); + fprintf(stderr, "Options: -e FLOAT base error rate [%.3f]\n", ERR_RATE); + fprintf(stderr, " -d INT outer distance between the two ends [500]\n"); + fprintf(stderr, " -s INT standard deviation [50]\n"); + fprintf(stderr, " -N INT number of read pairs [1000000]\n"); + fprintf(stderr, " -1 INT length of the first read [70]\n"); + fprintf(stderr, " -2 INT length of the second read [70]\n"); + fprintf(stderr, " -r FLOAT rate of mutations [%.4f]\n", MUT_RATE); + fprintf(stderr, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC); + fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND); + fprintf(stderr, " -S INT seed for random generator [-1]\n"); + fprintf(stderr, " -h haplotype mode\n"); + fprintf(stderr, "\n"); + return 1; +} + +int main(int argc, char *argv[]) +{ + int64_t N; + int dist, std_dev, c, size_l, size_r, is_hap = 0; + FILE *fpout1, *fpout2; + int seed = -1; + + N = 1000000; dist = 500; std_dev = 50; + size_l = size_r = 70; + while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:S:")) >= 0) { + switch (c) { + case 'd': dist = atoi(optarg); break; + case 's': std_dev = atoi(optarg); break; + case 'N': N = atoi(optarg); break; + case '1': size_l = atoi(optarg); break; + case '2': size_r = atoi(optarg); break; + case 'e': ERR_RATE = atof(optarg); break; + case 'r': MUT_RATE = atof(optarg); break; + case 'R': INDEL_FRAC = atof(optarg); 
break; + case 'X': INDEL_EXTEND = atof(optarg); break; + case 'S': seed = atoi(optarg); break; + case 'h': is_hap = 1; break; + } + } + if (argc - optind < 3) return simu_usage(); + fpout1 = fopen(argv[optind+1], "w"); + fpout2 = fopen(argv[optind+2], "w"); + if (!fpout1 || !fpout2) { + fprintf(stderr, "[wgsim] file open error\n"); + return 1; + } + srand48(seed > 0? seed : time(0)); + wgsim_core(fpout1, fpout2, argv[optind], is_hap, N, dist, std_dev, size_l, size_r); + + fclose(fpout1); fclose(fpout2); + return 0; +}
#!/usr/bin/perl -w
# File: PsiCLASS-1.0.2/samtools-0.1.19/misc/wgsim_eval.pl

# Contact: lh3
# Version: 0.1.5

use strict;
use warnings;
use Getopt::Std;

&wgsim_eval;
exit;

# Evaluate alignments of wgsim-simulated reads.
#
# Reads SAM from the files on the command line or from STDIN. The true
# coordinates are parsed from the read name (<chr>_<pos1>_<pos2>_...), and an
# alignment counts as correct when it is on the right chromosome and its end is
# within -g bases of the expected coordinate, with or without clipping added.
#   -p  also print wrong alignments with mapQ >= 10 to STDERR
#   -c  choose the expected coordinate from the first/second-read flag and the
#       strand (the "F3"/"R3" branches) instead of from the strand alone
#   -a  print cumulative counts per exact mapQ instead of per mapQ/10 bin
#   -g  maximum allowed coordinate difference [5]
sub wgsim_eval {
  my %opts = (g=>5);
  getopts('pcag:', \%opts);
  die("Usage: wgsim_eval.pl [-pca] [-g $opts{g}] <in.sam>\n") if (@ARGV == 0 && -t STDIN);
  my (@c0, @c1, %fnfp); # @c0/@c1: total/wrong alignments per mapQ bin; %fnfp: the same per exact mapQ
  my ($max_q, $flag) = (0, 0);
  my $gap = $opts{g};
  $flag |= 1 if (defined $opts{p});
  $flag |= 2 if (defined $opts{c});
  while (<>) {
    next if (/^\@/); # SAM header
    my @t = split("\t");
    next if (@t < 11);
    my $line = $_;
    my ($q, $is_correct, $chr, $left, $rght) = (int($t[4]/10), 1, $t[2], $t[3], $t[3]);
    $max_q = $q if ($q > $max_q);
    # right coordinate: add up the reference-consuming CIGAR operations
    my $cigar = $t[5];
    $cigar =~ s/(\d+)[MDN]/$rght+=$1,'x'/eg;
    --$rght;
    # correct for soft clipping
    # ($left, $rght) extend each end by its own clip; ($left0, $rght0) use the
    # clip of the opposite end
    my ($left0, $rght0) = ($left, $rght);
    $left -= $1 if ($cigar =~ /^(\d+)[SH]/);
    $rght += $1 if ($cigar =~ /(\d+)[SH]$/);
    $left0 -= $1 if ($cigar =~ /(\d+)[SH]$/);
    $rght0 += $1 if ($cigar =~ /^(\d+)[SH]/);
    # skip unmapped reads
    next if (($t[1]&0x4) || $chr eq '*');
    # parse read name and check
    if (my ($ref, $pos1, $pos2) = $t[0] =~ /^(\S+)_(\d+)_(\d+)_/) {
      if ($ref ne $chr) { # different chr
        $is_correct = 0;
      } else {
        if ($flag & 2) {
          if (($t[1]&0x40) && !($t[1]&0x10)) { # F3, forward
            $is_correct = 0 if (abs($pos1 - $left) > $gap && abs($pos1 - $left0) > $gap);
          } elsif (($t[1]&0x40) && ($t[1]&0x10)) { # F3, reverse
            $is_correct = 0 if (abs($pos2 - $rght) > $gap && abs($pos2 - $rght0) > $gap);
          } elsif (($t[1]&0x80) && !($t[1]&0x10)) { # R3, forward
            $is_correct = 0 if (abs($pos2 - $left) > $gap && abs($pos2 - $left0) > $gap);
          } else { # R3, reverse
            # Both tests use the same read-name coordinate, as in every other
            # branch; the second one used to compare the other coordinate.
            $is_correct = 0 if (abs($pos1 - $rght) > $gap && abs($pos1 - $rght0) > $gap);
          }
        } else {
          if ($t[1] & 0x10) { # reverse
            $is_correct = 0 if (abs($pos2 - $rght) > $gap && abs($pos2 - $rght0) > $gap); # in case of indels that are close to the end of a reads
          } else {
            $is_correct = 0 if (abs($pos1 - $left) > $gap && abs($pos1 - $left0) > $gap);
          }
        }
      }
    } else {
      warn("[wgsim_eval] read '$t[0]' was not generated by wgsim?\n");
      next;
    }
    ++$c0[$q];
    ++$c1[$q] unless ($is_correct);
    @{$fnfp{$t[4]}} = (0, 0) unless (defined $fnfp{$t[4]});
    ++$fnfp{$t[4]}[0];
    ++$fnfp{$t[4]}[1] unless ($is_correct);
    print STDERR $line if (($flag&1) && !$is_correct && $q > 0);
  }
  # print
  my ($cc0, $cc1) = (0, 0); # cumulative totals from the highest quality downwards
  if (!defined($opts{a})) {
    for (my $i = $max_q; $i >= 0; --$i) {
      $c0[$i] = 0 unless (defined $c0[$i]);
      $c1[$i] = 0 unless (defined $c1[$i]);
      $cc0 += $c0[$i]; $cc1 += $c1[$i];
      printf("%.2dx %12d / %-12d %12d %.3e\n", $i, $c1[$i], $c0[$i], $cc0, $cc1/$cc0) if ($cc0);
    }
  } else {
    for (reverse(sort {$a<=>$b} (keys %fnfp))) {
      next if ($_ == 0);
      $cc0 += $fnfp{$_}[0];
      $cc1 += $fnfp{$_}[1];
      print join("\t", $_, $cc0, $cc1), "\n";
    }
  }
}
#!/usr/bin/perl -w
# File: PsiCLASS-1.0.2/samtools-0.1.19/misc/zoom2sam.pl

# Contact: lh3
# Version: 0.1.0

use strict;
use warnings;
use Getopt::Std;

&zoom2sam;
exit;

# Length of the alignment of a SAM record (array ref).
# Records built by zoom2sam_aux() carry "*" as SEQ, so the length is taken
# from the "<len>M" CIGAR; if the CIGAR has another shape, fall back to the
# length of SEQ.
sub aln_len {
  my ($s) = @_;
  return ($s->[5] =~ /^(\d+)M$/)? $1 : length($s->[9]);
}

# Fill in the mate fields (columns 7-9) and the mate-related flag bits of two
# SAM records that belong to the same read pair. Both records are modified in
# place.
sub mating {
  my ($s1, $s2) = @_;
  my $isize = 0;
  if ($s1->[2] ne '*' && $s1->[2] eq $s2->[2]) { # then calculate $isize
    # for a reverse-strand read the coordinate past the end of the alignment is used
    my $x1 = ($s1->[1] & 0x10)? $s1->[3] + aln_len($s1) : $s1->[3];
    my $x2 = ($s2->[1] & 0x10)? $s2->[3] + aln_len($s2) : $s2->[3];
    $isize = $x2 - $x1;
  }
  # update mate coordinate
  if ($s2->[2] ne '*') {
    @$s1[6..8] = (($s2->[2] eq $s1->[2])? "=" : $s2->[2], $s2->[3], $isize);
    $s1->[1] |= 0x20 if ($s2->[1] & 0x10); # mate on the reverse strand
  } else {
    $s1->[1] |= 0x8; # mate unmapped
  }
  if ($s1->[2] ne '*') {
    @$s2[6..8] = (($s1->[2] eq $s2->[2])? "=" : $s1->[2], $s1->[3], -$isize);
    $s2->[1] |= 0x20 if ($s1->[1] & 0x10);
  } else {
    $s2->[1] |= 0x8;
  }
}

# Main loop: convert every input line to a SAM record, and pair up two
# consecutive records that share the same read name before printing them.
sub zoom2sam {
  my %opts = ();
  getopts("p", \%opts);
  die("Usage: zoom2sam.pl [-p] <readLen> <aln.zoom>
Warnings: This script only supports the default Illumina outputs.\n") if (@ARGV < 2);
  my $is_paired = defined($opts{p});
  my $len = shift(@ARGV);
  # core loop
  my @s1 = ();
  my @s2 = ();
  # two buffers are swapped so that the previous record is still available
  my ($s_last, $s_curr) = (\@s1, \@s2);
  while (<>) {
    &zoom2sam_aux($_, $s_curr, $is_paired, $len);
    if (@$s_last != 0 && $s_last->[0] eq $s_curr->[0]) {
      &mating($s_last, $s_curr);
      print join("\t", @$s_last), "\n";
      print join("\t", @$s_curr), "\n";
      @$s_last = (); @$s_curr = ();
    } else {
      print join("\t", @$s_last), "\n" if (@$s_last != 0);
      my $s = $s_last; $s_last = $s_curr; $s_curr = $s;
    }
  }
  print join("\t", @$s_last), "\n" if (@$s_last != 0);
}

# Convert one TAB-delimited input line into a SAM record stored in @$s.
# Columns used: read name, "<ref> ...:<0-based position>", strand, and a
# fourth column that is written out as the NM tag.
sub zoom2sam_aux {
  my ($line, $s, $is_paired, $len) = @_;
  chomp($line);
  my @t = split("\t", $line);
  @$s = ();
  # read name
  $s->[0] = $t[0];
  # initial flag (will be updated later)
  $s->[1] = 0;
  $s->[1] |= 1 | 1<<6 if ($s->[0] =~ /_F$/); # paired, first read
  $s->[1] |= 1 | 1<<7 if ($s->[0] =~ /_R$/); # paired, second read
  $s->[1] |= 2 if ($is_paired);
  # read & quality
  $s->[9] = "*"; $s->[10] = "*";
  # cigar
  $s->[5] = $len . "M";
  # coor
  my @s = split(/\s+/, $t[1]);
  $s->[2] = $s[0];
  # Take the position only from a successful match; after a failed match $1
  # would be undefined or left over from an earlier pattern.
  my ($pos0) = $t[1] =~ /:(\d+)$/;
  if (defined($pos0)) {
    $s->[3] = $pos0 + 1;
  } else {
    warn("[zoom2sam] cannot parse the coordinate of read '$t[0]'; POS is set to 0\n");
    $s->[3] = 0;
  }
  if ($s->[0] =~ /_[FR]$/) {
    # the strand flag depends on both the read suffix and the reported strand
    my $u = ($s->[0] =~ /_F$/)? 1 : 0;
    my $w = ($t[2] eq '+')? 1 : 0;
    $s->[1] |= 0x10 if ($u ^ $w);
    $s->[0] =~ s/_[FR]$//;
  } else {
    $s->[1] |= 0x10 if ($t[2] eq '-');
  }
  # mapQ
  $s->[4] = 30;
  # mate coordinate
  $s->[6] = '*'; $s->[7] = $s->[8] = 0;
  # aux
  push(@$s, "NM:i:$t[3]");
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/padding.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,479 @@ +#include <string.h> +#include <assert.h> +#include <unistd.h> +#include "kstring.h" +#include "sam_header.h" +#include "sam.h" +#include "bam.h" +#include "faidx.h" + +bam_header_t *bam_header_dup(const bam_header_t *h0); /*in sam.c*/ + +static void replace_cigar(bam1_t *b, int n, uint32_t *cigar) +{ + if (n != b->core.n_cigar) { + int o = b->core.l_qname + b->core.n_cigar * 4; + if (b->data_len + (n - b->core.n_cigar) * 4 > b->m_data) { + b->m_data = b->data_len + (n - b->core.n_cigar) * 4; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } + memmove(b->data + b->core.l_qname + n * 4, b->data + o, b->data_len - o); + memcpy(b->data + b->core.l_qname, cigar, n * 4); + b->data_len += (n - b->core.n_cigar) * 4; + b->core.n_cigar = n; + } else memcpy(b->data + b->core.l_qname, cigar, n * 4); +} + +#define write_cigar(_c, _n, _m, _v) do { \ + if (_n == _m) { \ + _m = _m? 
_m<<1 : 4; \ + _c = (uint32_t*)realloc(_c, _m * 4); \ + } \ + _c[_n++] = (_v); \ + } while (0) + +static void unpad_seq(bam1_t *b, kstring_t *s) +{ + int k, j, i; + int length; + uint32_t *cigar = bam1_cigar(b); + uint8_t *seq = bam1_seq(b); + // b->core.l_qseq gives length of the SEQ entry (including soft clips, S) + // We need the padded length after alignment from the CIGAR (excluding + // soft clips S, but including pads from CIGAR D operations) + length = 0; + for (k = 0; k < b->core.n_cigar; ++k) { + int op, ol; + op= bam_cigar_op(cigar[k]); + ol = bam_cigar_oplen(cigar[k]); + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF || op == BAM_CDEL) + length += ol; + } + ks_resize(s, length); + for (k = 0, s->l = 0, j = 0; k < b->core.n_cigar; ++k) { + int op, ol; + op = bam_cigar_op(cigar[k]); + ol = bam_cigar_oplen(cigar[k]); + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + for (i = 0; i < ol; ++i, ++j) s->s[s->l++] = bam1_seqi(seq, j); + } else if (op == BAM_CSOFT_CLIP) { + j += ol; + } else if (op == BAM_CHARD_CLIP) { + /* do nothing */ + } else if (op == BAM_CDEL) { + for (i = 0; i < ol; ++i) s->s[s->l++] = 0; + } else { + fprintf(stderr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam1_qname(b)); + assert(-1); + } + } + assert(length == s->l); +} + +int load_unpadded_ref(faidx_t *fai, char *ref_name, int ref_len, kstring_t *seq) +{ + char base; + char *fai_ref = 0; + int fai_ref_len = 0, k; + + fai_ref = fai_fetch(fai, ref_name, &fai_ref_len); + if (fai_ref_len != ref_len) { + fprintf(stderr, "[depad] ERROR: FASTA sequence %s length %i, expected %i\n", ref_name, fai_ref_len, ref_len); + free(fai_ref); + return -1; + } + ks_resize(seq, ref_len); + seq->l = 0; + for (k = 0; k < ref_len; ++k) { + base = fai_ref[k]; + if (base == '-' || base == '*') { + // Map gaps to null to match unpad_seq function + seq->s[seq->l++] = 0; + } else { + int i = bam_nt16_table[(int)base]; + if (i == 0 || i==16) { // 
Equals maps to 0, anything unexpected to 16 + fprintf(stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence %s\n", base, (int)base, ref_name); + free(fai_ref); + return -1; + } + seq->s[seq->l++] = i; + } + } + assert(ref_len == seq->l); + free(fai_ref); + return 0; +} + +int get_unpadded_len(faidx_t *fai, char *ref_name, int padded_len) +{ + char base; + char *fai_ref = 0; + int fai_ref_len = 0, k; + int bases=0, gaps=0; + + fai_ref = fai_fetch(fai, ref_name, &fai_ref_len); + if (fai_ref_len != padded_len) { + fprintf(stderr, "[depad] ERROR: FASTA sequence '%s' length %i, expected %i\n", ref_name, fai_ref_len, padded_len); + free(fai_ref); + return -1; + } + for (k = 0; k < padded_len; ++k) { + //fprintf(stderr, "[depad] checking base %i of %i or %i\n", k+1, ref_len, strlen(fai_ref)); + base = fai_ref[k]; + if (base == '-' || base == '*') { + gaps += 1; + } else { + int i = bam_nt16_table[(int)base]; + if (i == 0 || i==16) { // Equals maps to 0, anything unexpected to 16 + fprintf(stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence '%s'\n", base, (int)base, ref_name); + free(fai_ref); + return -1; + } + bases += 1; + } + } + free(fai_ref); + assert (padded_len == bases + gaps); + return bases; +} + +inline int * update_posmap(int *posmap, kstring_t ref) +{ + int i, k; + posmap = realloc(posmap, ref.m * sizeof(int)); + for (i = k = 0; i < ref.l; ++i) { + posmap[i] = k; + if (ref.s[i]) ++k; + } + return posmap; +} + +int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai) +{ + bam_header_t *h = 0; + bam1_t *b = 0; + kstring_t r, q; + int r_tid = -1; + uint32_t *cigar2 = 0; + int ret = 0, n2 = 0, m2 = 0, *posmap = 0; + + b = bam_init1(); + r.l = r.m = q.l = q.m = 0; r.s = q.s = 0; + int read_ret; + h = in->header; + while ((read_ret = samread(in, b)) >= 0) { // read one alignment from `in' + uint32_t *cigar = bam1_cigar(b); + n2 = 0; + if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam1_qname(b), 
h->target_name[b->core.tid]) == 0) { + // fprintf(stderr, "[depad] Found embedded reference '%s'\n", bam1_qname(b)); + r_tid = b->core.tid; + unpad_seq(b, &r); + if (h->target_len[r_tid] != r.l) { + fprintf(stderr, "[depad] ERROR: (Padded) length of '%s' is %d in BAM header, but %ld in embedded reference\n", bam1_qname(b), h->target_len[r_tid], r.l); + return -1; + } + if (fai) { + // Check the embedded reference matches the FASTA file + if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &q)) { + fprintf(stderr, "[depad] ERROR: Failed to load embedded reference '%s' from FASTA\n", h->target_name[b->core.tid]); + return -1; + } + assert(r.l == q.l); + int i; + for (i = 0; i < r.l; ++i) { + if (r.s[i] != q.s[i]) { + // Show gaps as ASCII 45 + fprintf(stderr, "[depad] ERROR: Embedded sequence and reference FASTA don't match for %s base %i, '%c' vs '%c'\n", + h->target_name[b->core.tid], i+1, + r.s[i] ? bam_nt16_rev_table[(int)r.s[i]] : 45, + q.s[i] ? bam_nt16_rev_table[(int)q.s[i]] : 45); + return -1; + } + } + } + write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH)); + replace_cigar(b, n2, cigar2); + posmap = update_posmap(posmap, r); + } else if (b->core.n_cigar > 0) { + int i, k, op; + if (b->core.tid < 0) { + fprintf(stderr, "[depad] ERROR: Read '%s' has CIGAR but no RNAME\n", bam1_qname(b)); + return -1; + } else if (b->core.tid == r_tid) { + ; // good case, reference available + //fprintf(stderr, "[depad] Have ref '%s' for read '%s'\n", h->target_name[b->core.tid], bam1_qname(b)); + } else if (fai) { + if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) { + fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]); + return -1; + } + posmap = update_posmap(posmap, r); + r_tid = b->core.tid; + // fprintf(stderr, "[depad] Loaded %s from FASTA file\n", h->target_name[b->core.tid]); + } else { + fprintf(stderr, "[depad] ERROR: 
Missing %s embedded reference sequence (and no FASTA file)\n", h->target_name[b->core.tid]); + return -1; + } + unpad_seq(b, &q); + if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) { + write_cigar(cigar2, n2, m2, cigar[0]); + } else if (bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP) { + write_cigar(cigar2, n2, m2, cigar[0]); + if (b->core.n_cigar > 2 && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP) { + write_cigar(cigar2, n2, m2, cigar[1]); + } + } + /* Determine CIGAR operator for each base in the aligned read */ + for (i = 0, k = b->core.pos; i < q.l; ++i, ++k) + q.s[i] = q.s[i]? (r.s[k]? BAM_CMATCH : BAM_CINS) : (r.s[k]? BAM_CDEL : BAM_CPAD); + /* Include any pads if starts with an insert */ + if (q.s[0] == BAM_CINS) { + for (k = 0; k+1 < b->core.pos && !r.s[b->core.pos - k - 1]; ++k); + if (k) write_cigar(cigar2, n2, m2, bam_cigar_gen(k, BAM_CPAD)); + } + /* Count consecutive CIGAR operators to turn into a CIGAR string */ + for (i = k = 1, op = q.s[0]; i < q.l; ++i) { + if (op != q.s[i]) { + write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op)); + op = q.s[i]; k = 1; + } else ++k; + } + write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op)); + if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CSOFT_CLIP) { + write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]); + } else if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CHARD_CLIP) { + if (b->core.n_cigar > 2 && bam_cigar_op(cigar[b->core.n_cigar-2]) == BAM_CSOFT_CLIP) { + write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-2]); + } + write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]); + } + /* Remove redundant P operators between M/X/=/D operators, e.g. 
5M2P10M -> 15M */ + int pre_op, post_op; + for (i = 2; i < n2; ++i) + if (bam_cigar_op(cigar2[i-1]) == BAM_CPAD) { + pre_op = bam_cigar_op(cigar2[i-2]); + post_op = bam_cigar_op(cigar2[i]); + /* Note don't need to check for X/= as code above will use M only */ + if ((pre_op == BAM_CMATCH || pre_op == BAM_CDEL) && (post_op == BAM_CMATCH || post_op == BAM_CDEL)) { + /* This is a redundant P operator */ + cigar2[i-1] = 0; // i.e. 0M + /* If had same operator either side, combine them in post_op */ + if (pre_op == post_op) { + /* If CIGAR M, could treat as simple integers since BAM_CMATCH is zero*/ + cigar2[i] = bam_cigar_gen(bam_cigar_oplen(cigar2[i-2]) + bam_cigar_oplen(cigar2[i]), post_op); + cigar2[i-2] = 0; // i.e. 0M + } + } + } + /* Remove the zero'd operators (0M) */ + for (i = k = 0; i < n2; ++i) + if (cigar2[i]) cigar2[k++] = cigar2[i]; + n2 = k; + replace_cigar(b, n2, cigar2); + b->core.pos = posmap[b->core.pos]; + if (b->core.mtid < 0 || b->core.mpos < 0) { + /* Nice case, no mate to worry about*/ + // fprintf(stderr, "[depad] Read '%s' mate not mapped\n", bam1_qname(b)); + /* TODO - Warning if FLAG says mate should be mapped? 
*/ + /* Clean up funny input where mate position is given but mate reference is missing: */ + b->core.mtid = -1; + b->core.mpos = -1; + } else if (b->core.mtid == b->core.tid) { + /* Nice case, same reference */ + // fprintf(stderr, "[depad] Read '%s' mate mapped to same ref\n", bam1_qname(b)); + b->core.mpos = posmap[b->core.mpos]; + } else { + /* Nasty case, Must load alternative posmap */ + // fprintf(stderr, "[depad] Loading reference '%s' temporarily\n", h->target_name[b->core.mtid]); + if (!fai) { + fprintf(stderr, "[depad] ERROR: Needed reference %s sequence for mate (and no FASTA file)\n", h->target_name[b->core.mtid]); + return -1; + } + /* Temporarily load the other reference sequence */ + if (load_unpadded_ref(fai, h->target_name[b->core.mtid], h->target_len[b->core.mtid], &r)) { + fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.mtid]); + return -1; + } + posmap = update_posmap(posmap, r); + b->core.mpos = posmap[b->core.mpos]; + /* Restore the reference and posmap*/ + if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) { + fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]); + return -1; + } + posmap = update_posmap(posmap, r); + } + } + samwrite(out, b); + } + if (read_ret < -1) { + fprintf(stderr, "[depad] truncated file.\n"); + ret = 1; + } + free(r.s); free(q.s); free(posmap); + bam_destroy1(b); + return ret; +} + +bam_header_t * fix_header(bam_header_t *old, faidx_t *fai) +{ + int i = 0, unpadded_len = 0; + bam_header_t *header = 0 ; + + header = bam_header_dup(old); + for (i = 0; i < old->n_targets; ++i) { + unpadded_len = get_unpadded_len(fai, old->target_name[i], old->target_len[i]); + if (unpadded_len < 0) { + fprintf(stderr, "[depad] ERROR getting unpadded length of '%s', padded length %i\n", old->target_name[i], old->target_len[i]); + } else { + header->target_len[i] = unpadded_len; + //fprintf(stderr, 
"[depad] Recalculating '%s' length %i -> %i\n", old->target_name[i], old->target_len[i], header->target_len[i]); + } + } + /* Duplicating the header allocated new buffer for header string */ + /* After modifying the @SQ lines it will only get smaller, since */ + /* the LN entries will be the same or shorter, and we'll remove */ + /* any MD entries (MD5 checksums). */ + assert(strlen(old->text) == strlen(header->text)); + assert (0==strcmp(old->text, header->text)); + const char *text; + text = old->text; + header->text[0] = '\0'; /* Resuse the allocated buffer */ + char * newtext = header->text; + char * end=NULL; + while (text[0]=='@') { + end = strchr(text, '\n'); + assert(end != 0); + if (text[1]=='S' && text[2]=='Q' && text[3]=='\t') { + /* TODO - edit the @SQ line here to remove MD and fix LN. */ + /* For now just remove the @SQ line, and samtools will */ + /* automatically generate a minimal replacement with LN. */ + /* However, that discards any other tags like AS, SP, UR. */ + //fprintf(stderr, "[depad] Removing @SQ line\n"); + } else { + /* Copy this line to the new header */ + strncat(newtext, text, end - text + 1); + } + text = end + 1; + } + assert (text[0]=='\0'); + /* Check we didn't overflow the buffer */ + assert (strlen(header->text) <= strlen(old->text)); + if (strlen(header->text) < header->l_text) { + //fprintf(stderr, "[depad] Reallocating header buffer\n"); + assert (newtext == header->text); + newtext = malloc(strlen(header->text) + 1); + strcpy(newtext, header->text); + free(header->text); + header->text = newtext; + header->l_text = strlen(newtext); + } + //fprintf(stderr, "[depad] Here is the new header (pending @SQ lines),\n\n%s\n(end)\n", header->text); + return header; +} + +static int usage(int is_long_help); + +int main_pad2unpad(int argc, char *argv[]) +{ + samfile_t *in = 0, *out = 0; + bam_header_t *h = 0; + faidx_t *fai = 0; + int c, is_bamin = 1, compress_level = -1, is_bamout = 1, is_long_help = 0; + char in_mode[5], 
out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0; + int ret=0; + + /* parse command-line options */ + strcpy(in_mode, "r"); strcpy(out_mode, "w"); + while ((c = getopt(argc, argv, "Sso:u1T:?")) >= 0) { + switch (c) { + case 'S': is_bamin = 0; break; + case 's': assert(compress_level == -1); is_bamout = 0; break; + case 'o': fn_out = strdup(optarg); break; + case 'u': assert(is_bamout == 1); compress_level = 0; break; + case '1': assert(is_bamout == 1); compress_level = 1; break; + case 'T': fn_ref = strdup(optarg); break; + case '?': is_long_help = 1; break; + default: return usage(is_long_help); + } + } + if (argc == optind) return usage(is_long_help); + + if (is_bamin) strcat(in_mode, "b"); + if (is_bamout) strcat(out_mode, "b"); + strcat(out_mode, "h"); + if (compress_level >= 0) { + char tmp[2]; + tmp[0] = compress_level + '0'; tmp[1] = '\0'; + strcat(out_mode, tmp); + } + + // Load FASTA reference (also needed for SAM -> BAM if missing header) + if (fn_ref) { + fn_list = samfaipath(fn_ref); + fai = fai_load(fn_ref); + } + // open file handlers + if ((in = samopen(argv[optind], in_mode, fn_list)) == 0) { + fprintf(stderr, "[depad] failed to open \"%s\" for reading.\n", argv[optind]); + ret = 1; + goto depad_end; + } + if (in->header == 0) { + fprintf(stderr, "[depad] failed to read the header from \"%s\".\n", argv[optind]); + ret = 1; + goto depad_end; + } + if (in->header->text == 0 || in->header->l_text == 0) { + fprintf(stderr, "[depad] Warning - failed to read any header text from \"%s\".\n", argv[optind]); + assert (0 == in->header->l_text); + assert (0 == in->header->text); + } + if (fn_ref) { + h = fix_header(in->header, fai); + } else { + fprintf(stderr, "[depad] Warning - reference lengths will not be corrected without FASTA reference\n"); + h = in->header; + } + if ((out = samopen(fn_out? fn_out : "-", out_mode, h)) == 0) { + fprintf(stderr, "[depad] failed to open \"%s\" for writing.\n", fn_out? 
fn_out : "standard output"); + ret = 1; + goto depad_end; + } + + // Do the depad + ret = bam_pad2unpad(in, out, fai); + +depad_end: + // close files, free and return + if (fai) fai_destroy(fai); + if (h != in->header) bam_header_destroy(h); + samclose(in); + samclose(out); + free(fn_list); free(fn_out); + return ret; +} + +static int usage(int is_long_help) +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools depad <in.bam>\n\n"); + fprintf(stderr, "Options: -s output is SAM (default is BAM)\n"); + fprintf(stderr, " -S input is SAM (default is BAM)\n"); + fprintf(stderr, " -u uncompressed BAM output (can't use with -s)\n"); + fprintf(stderr, " -1 fast compression BAM output (can't use with -s)\n"); + fprintf(stderr, " -T FILE reference sequence file [null]\n"); + fprintf(stderr, " -o FILE output file name [stdout]\n"); + fprintf(stderr, " -? longer help\n"); + fprintf(stderr, "\n"); + if (is_long_help) + fprintf(stderr, "Notes:\n\ +\n\ + 1. Requires embedded reference sequences (before the reads for that reference),\n\ + with the future aim to also support a FASTA padded reference sequence file.\n\ +\n\ + 2. The input padded alignment read's CIGAR strings must not use P or I operators.\n\ +\n"); + return 1; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/phase.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,687 @@ +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <stdint.h> +#include <math.h> +#include <zlib.h> +#include "bam.h" +#include "errmod.h" + +#include "kseq.h" +KSTREAM_INIT(gzFile, gzread, 16384) + +#define MAX_VARS 256 +#define FLIP_PENALTY 2 +#define FLIP_THRES 4 +#define MASK_THRES 3 + +#define FLAG_FIX_CHIMERA 0x1 +#define FLAG_LIST_EXCL 0x4 +#define FLAG_DROP_AMBI 0x8 + +typedef struct { + // configurations, initialized in the main function + int flag, k, min_baseQ, min_varLOD, max_depth; + // other global variables + int vpos_shift; + bamFile fp; + char *pre; + bamFile out[3]; + // alignment queue + int n, m; + bam1_t **b; +} phaseg_t; + +typedef struct { + int8_t seq[MAX_VARS]; // TODO: change to dynamic memory allocation! + int vpos, beg, end; + uint32_t vlen:16, single:1, flip:1, phase:1, phased:1, ambig:1; + uint32_t in:16, out:16; // in-phase and out-phase +} frag_t, *frag_p; + +#define rseq_lt(a,b) ((a)->vpos < (b)->vpos) + +#include "khash.h" +KHASH_SET_INIT_INT64(set64) +KHASH_MAP_INIT_INT64(64, frag_t) + +typedef khash_t(64) nseq_t; + +#include "ksort.h" +KSORT_INIT(rseq, frag_p, rseq_lt) + +static char nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; + +static inline uint64_t X31_hash_string(const char *s) +{ + uint64_t h = *s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; + return h; +} + +static void count1(int l, const uint8_t *seq, int *cnt) +{ + int i, j, n_ambi; + uint32_t z, x; + if (seq[l-1] == 0) return; // do nothing is the last base is ambiguous + for (i = n_ambi = 0; i < l; ++i) // collect ambiguous bases + if (seq[i] == 0) ++n_ambi; + if (l - n_ambi <= 1) return; // only one SNP + for (x = 0; x < 1u<<n_ambi; ++x) { // count + for (i = j = 0, z = 0; i < l; ++i) { + int c; + if (seq[i]) c = seq[i] - 1; + else { + c = x>>j&1; + ++j; + } + z = z<<1 | c; + 
} + ++cnt[z]; + } +} + +static int **count_all(int l, int vpos, nseq_t *hash) +{ + khint_t k; + int i, j, **cnt; + uint8_t *seq; + seq = calloc(l, 1); + cnt = calloc(vpos, sizeof(void*)); + for (i = 0; i < vpos; ++i) cnt[i] = calloc(1<<l, sizeof(int)); + for (k = 0; k < kh_end(hash); ++k) { + if (kh_exist(hash, k)) { + frag_t *f = &kh_val(hash, k); + if (f->vpos >= vpos || f->single) continue; // out of region; or singleton + if (f->vlen == 1) { // such reads should be flagged as deleted previously if everything is right + f->single = 1; + continue; + } + for (j = 1; j < f->vlen; ++j) { + for (i = 0; i < l; ++i) + seq[i] = j < l - 1 - i? 0 : f->seq[j - (l - 1 - i)]; + count1(l, seq, cnt[f->vpos + j]); + } + } + } + free(seq); + return cnt; +} + +// phasing +static int8_t *dynaprog(int l, int vpos, int **w) +{ + int *f[2], *curr, *prev, max, i; + int8_t **b, *h = 0; + uint32_t x, z = 1u<<(l-1), mask = (1u<<l) - 1; + f[0] = calloc(z, sizeof(int)); + f[1] = calloc(z, sizeof(int)); + b = calloc(vpos, sizeof(void*)); + prev = f[0]; curr = f[1]; + // fill the backtrack matrix + for (i = 0; i < vpos; ++i) { + int *wi = w[i], *tmp; + int8_t *bi; + bi = b[i] = calloc(z, 1); + /* In the following, x is the current state, which is the + * lexicographically smaller local haplotype. xc is the complement of + * x, or the larger local haplotype; y0 and y1 are the two predecessors + * of x. */ + for (x = 0; x < z; ++x) { // x0 is the smaller + uint32_t y0, y1, xc; + int c0, c1; + xc = ~x&mask; y0 = x>>1; y1 = xc>>1; + c0 = prev[y0] + wi[x] + wi[xc]; + c1 = prev[y1] + wi[x] + wi[xc]; + if (c0 > c1) bi[x] = 0, curr[x] = c0; + else bi[x] = 1, curr[x] = c1; + } + tmp = prev; prev = curr; curr = tmp; // swap + } + { // backtrack + uint32_t max_x = 0; + int which = 0; + h = calloc(vpos, 1); + for (x = 0, max = 0, max_x = 0; x < z; ++x) + if (prev[x] > max) max = prev[x], max_x = x; + for (i = vpos - 1, x = max_x; i >= 0; --i) { + h[i] = which? (~x&1) : (x&1); + which = b[i][x]? 
!which : which; + x = b[i][x]? (~x&mask)>>1 : x>>1; + } + } + // free + for (i = 0; i < vpos; ++i) free(b[i]); + free(f[0]); free(f[1]); free(b); + return h; +} + +// phase each fragment +static uint64_t *fragphase(int vpos, const int8_t *path, nseq_t *hash, int flip) +{ + khint_t k; + uint64_t *pcnt; + uint32_t *left, *rght, max; + left = rght = 0; max = 0; + pcnt = calloc(vpos, 8); + for (k = 0; k < kh_end(hash); ++k) { + if (kh_exist(hash, k)) { + int i, c[2]; + frag_t *f = &kh_val(hash, k); + if (f->vpos >= vpos) continue; + // get the phase + c[0] = c[1] = 0; + for (i = 0; i < f->vlen; ++i) { + if (f->seq[i] == 0) continue; + ++c[f->seq[i] == path[f->vpos + i] + 1? 0 : 1]; + } + f->phase = c[0] > c[1]? 0 : 1; + f->in = c[f->phase]; f->out = c[1 - f->phase]; + f->phased = f->in == f->out? 0 : 1; + f->ambig = (f->in && f->out && f->out < 3 && f->in <= f->out + 1)? 1 : 0; + // fix chimera + f->flip = 0; + if (flip && c[0] >= 3 && c[1] >= 3) { + int sum[2], m, mi, md; + if (f->vlen > max) { // enlarge the array + max = f->vlen; + kroundup32(max); + left = realloc(left, max * 4); + rght = realloc(rght, max * 4); + } + for (i = 0, sum[0] = sum[1] = 0; i < f->vlen; ++i) { // get left counts + if (f->seq[i]) { + int c = f->phase? 2 - f->seq[i] : f->seq[i] - 1; + ++sum[c == path[f->vpos + i]? 0 : 1]; + } + left[i] = sum[1]<<16 | sum[0]; + } + for (i = f->vlen - 1, sum[0] = sum[1] = 0; i >= 0; --i) { // get right counts + if (f->seq[i]) { + int c = f->phase? 2 - f->seq[i] : f->seq[i] - 1; + ++sum[c == path[f->vpos + i]? 
0 : 1]; + } + rght[i] = sum[1]<<16 | sum[0]; + } + // find the best flip point + for (i = m = 0, mi = -1, md = -1; i < f->vlen - 1; ++i) { + int a[2]; + a[0] = (left[i]&0xffff) + (rght[i+1]>>16&0xffff) - (rght[i+1]&0xffff) * FLIP_PENALTY; + a[1] = (left[i]>>16&0xffff) + (rght[i+1]&0xffff) - (rght[i+1]>>16&0xffff) * FLIP_PENALTY; + if (a[0] > a[1]) { + if (a[0] > m) m = a[0], md = 0, mi = i; + } else { + if (a[1] > m) m = a[1], md = 1, mi = i; + } + } + if (m - c[0] >= FLIP_THRES && m - c[1] >= FLIP_THRES) { // then flip + f->flip = 1; + if (md == 0) { // flip the tail + for (i = mi + 1; i < f->vlen; ++i) + if (f->seq[i] == 1) f->seq[i] = 2; + else if (f->seq[i] == 2) f->seq[i] = 1; + } else { // flip the head + for (i = 0; i <= mi; ++i) + if (f->seq[i] == 1) f->seq[i] = 2; + else if (f->seq[i] == 2) f->seq[i] = 1; + } + } + } + // update pcnt[] + if (!f->single) { + for (i = 0; i < f->vlen; ++i) { + int c; + if (f->seq[i] == 0) continue; + c = f->phase? 2 - f->seq[i] : f->seq[i] - 1; + if (c == path[f->vpos + i]) { + if (f->phase == 0) ++pcnt[f->vpos + i]; + else pcnt[f->vpos + i] += 1ull<<32; + } else { + if (f->phase == 0) pcnt[f->vpos + i] += 1<<16; + else pcnt[f->vpos + i] += 1ull<<48; + } + } + } + } + } + free(left); free(rght); + return pcnt; +} + +static uint64_t *genmask(int vpos, const uint64_t *pcnt, int *_n) +{ + int i, max = 0, max_i = -1, m = 0, n = 0, beg = 0, score = 0; + uint64_t *list = 0; + for (i = 0; i < vpos; ++i) { + uint64_t x = pcnt[i]; + int c[4], pre = score, s; + c[0] = x&0xffff; c[1] = x>>16&0xffff; c[2] = x>>32&0xffff; c[3] = x>>48&0xffff; + s = (c[1] + c[3] == 0)? -(c[0] + c[2]) : (c[1] + c[3] - 1); + if (c[3] > c[2]) s += c[3] - c[2]; + if (c[1] > c[0]) s += c[1] - c[0]; + score += s; + if (score < 0) score = 0; + if (pre == 0 && score > 0) beg = i; // change from zero to non-zero + if ((i == vpos - 1 || score == 0) && max >= MASK_THRES) { + if (n == m) { + m = m? 
m<<1 : 4; + list = realloc(list, m * 8); + } + list[n++] = (uint64_t)beg<<32 | max_i; + i = max_i; // reset i to max_i + score = 0; + } else if (score > max) max = score, max_i = i; + if (score == 0) max = 0; + } + *_n = n; + return list; +} + +// trim heading and tailing ambiguous bases; mark deleted and remove sequence +static int clean_seqs(int vpos, nseq_t *hash) +{ + khint_t k; + int ret = 0; + for (k = 0; k < kh_end(hash); ++k) { + if (kh_exist(hash, k)) { + frag_t *f = &kh_val(hash, k); + int beg, end, i; + if (f->vpos >= vpos) { + ret = 1; + continue; + } + for (i = 0; i < f->vlen; ++i) + if (f->seq[i] != 0) break; + beg = i; + for (i = f->vlen - 1; i >= 0; --i) + if (f->seq[i] != 0) break; + end = i + 1; + if (end - beg <= 0) kh_del(64, hash, k); + else { + if (beg != 0) memmove(f->seq, f->seq + beg, end - beg); + f->vpos += beg; f->vlen = end - beg; + f->single = f->vlen == 1? 1 : 0; + } + } + } + return ret; +} + +static void dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash) +{ + int i, is_flip, drop_ambi; + drop_ambi = g->flag & FLAG_DROP_AMBI; + is_flip = (drand48() < 0.5); + for (i = 0; i < g->n; ++i) { + int end, which; + uint64_t key; + khint_t k; + bam1_t *b = g->b[i]; + key = X31_hash_string(bam1_qname(b)); + end = bam_calend(&b->core, bam1_cigar(b)); + if (end > min_pos) break; + k = kh_get(64, hash, key); + if (k == kh_end(hash)) which = 3; + else { + frag_t *f = &kh_val(hash, k); + if (f->ambig) which = drop_ambi? 
2 : 3; + else if (f->phased && f->flip) which = 2; + else if (f->phased == 0) which = 3; + else { // phased and not flipped + char c = 'Y'; + which = f->phase; + bam_aux_append(b, "ZP", 'A', 1, (uint8_t*)&c); + } + if (which < 2 && is_flip) which = 1 - which; // increase the randomness + } + if (which == 3) which = (drand48() < 0.5); + bam_write1(g->out[which], b); + bam_destroy1(b); + g->b[i] = 0; + } + memmove(g->b, g->b + i, (g->n - i) * sizeof(void*)); + g->n -= i; +} + +static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t *hash) +{ + int i, j, n_seqs = kh_size(hash), n_masked = 0, min_pos; + khint_t k; + frag_t **seqs; + int8_t *path, *sitemask; + uint64_t *pcnt, *regmask; + + if (vpos == 0) return 0; + i = clean_seqs(vpos, hash); // i is true if hash has an element with its vpos >= vpos + min_pos = i? cns[vpos]>>32 : 0x7fffffff; + if (vpos == 1) { + printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1); + printf("M0\t%s\t%d\t%d\t%c\t%c\t%d\t0\t0\t0\t0\n//\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1, + "ACGTX"[cns[0]&3], "ACGTX"[cns[0]>>16&3], g->vpos_shift + 1); + for (k = 0; k < kh_end(hash); ++k) { + if (kh_exist(hash, k)) { + frag_t *f = &kh_val(hash, k); + if (f->vpos) continue; + f->flip = 0; + if (f->seq[0] == 0) f->phased = 0; + else f->phased = 1, f->phase = f->seq[0] - 1; + } + } + dump_aln(g, min_pos, hash); + ++g->vpos_shift; + return 1; + } + { // phase + int **cnt; + uint64_t *mask; + printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[vpos-1]>>32) + 1); + sitemask = calloc(vpos, 1); + cnt = count_all(g->k, vpos, hash); + path = dynaprog(g->k, vpos, cnt); + for (i = 0; i < vpos; ++i) free(cnt[i]); + free(cnt); + pcnt = fragphase(vpos, path, hash, 0); // do not fix chimeras when masking + mask = genmask(vpos, pcnt, &n_masked); + regmask = calloc(n_masked, 8); + for (i = 0; i < n_masked; ++i) { + regmask[i] = cns[mask[i]>>32]>>32<<32 | cns[(uint32_t)mask[i]]>>32; + for (j = 
mask[i]>>32; j <= (int32_t)mask[i]; ++j) + sitemask[j] = 1; + } + free(mask); + if (g->flag & FLAG_FIX_CHIMERA) { + free(pcnt); + pcnt = fragphase(vpos, path, hash, 1); + } + } + for (i = 0; i < n_masked; ++i) + printf("FL\t%s\t%d\t%d\n", chr, (int)(regmask[i]>>32) + 1, (int)regmask[i] + 1); + for (i = 0; i < vpos; ++i) { + uint64_t x = pcnt[i]; + int8_t c[2]; + c[0] = (cns[i]&0xffff)>>2 == 0? 4 : (cns[i]&3); + c[1] = (cns[i]>>16&0xffff)>>2 == 0? 4 : (cns[i]>>16&3); + printf("M%d\t%s\t%d\t%d\t%c\t%c\t%d\t%d\t%d\t%d\t%d\n", sitemask[i]+1, chr, (int)(cns[0]>>32) + 1, (int)(cns[i]>>32) + 1, "ACGTX"[c[path[i]]], "ACGTX"[c[1-path[i]]], + i + g->vpos_shift + 1, (int)(x&0xffff), (int)(x>>16&0xffff), (int)(x>>32&0xffff), (int)(x>>48&0xffff)); + } + free(path); free(pcnt); free(regmask); free(sitemask); + seqs = calloc(n_seqs, sizeof(void*)); + for (k = 0, i = 0; k < kh_end(hash); ++k) + if (kh_exist(hash, k) && kh_val(hash, k).vpos < vpos && !kh_val(hash, k).single) + seqs[i++] = &kh_val(hash, k); + n_seqs = i; + ks_introsort_rseq(n_seqs, seqs); + for (i = 0; i < n_seqs; ++i) { + frag_t *f = seqs[i]; + printf("EV\t0\t%s\t%d\t40\t%dM\t*\t0\t0\t", chr, f->vpos + 1 + g->vpos_shift, f->vlen); + for (j = 0; j < f->vlen; ++j) { + uint32_t c = cns[f->vpos + j]; + if (f->seq[j] == 0) putchar('N'); + else putchar("ACGT"[f->seq[j] == 1? 
(c&3) : (c>>16&3)]); + } + printf("\t*\tYP:i:%d\tYF:i:%d\tYI:i:%d\tYO:i:%d\tYS:i:%d\n", f->phase, f->flip, f->in, f->out, f->beg+1); + } + free(seqs); + printf("//\n"); + fflush(stdout); + g->vpos_shift += vpos; + dump_aln(g, min_pos, hash); + return vpos; +} + +static void update_vpos(int vpos, nseq_t *hash) +{ + khint_t k; + for (k = 0; k < kh_end(hash); ++k) { + if (kh_exist(hash, k)) { + frag_t *f = &kh_val(hash, k); + if (f->vpos < vpos) kh_del(64, hash, k); // TODO: if frag_t::seq is allocated dynamically, free it + else f->vpos -= vpos; + } + } +} + +static nseq_t *shrink_hash(nseq_t *hash) // TODO: to implement +{ + return hash; +} + +static int readaln(void *data, bam1_t *b) +{ + phaseg_t *g = (phaseg_t*)data; + int ret; + ret = bam_read1(g->fp, b); + if (ret < 0) return ret; + if (!(b->core.flag & (BAM_FUNMAP|BAM_FSECONDARY|BAM_FQCFAIL|BAM_FDUP)) && g->pre) { + if (g->n == g->m) { + g->m = g->m? g->m<<1 : 16; + g->b = realloc(g->b, g->m * sizeof(void*)); + } + g->b[g->n++] = bam_dup1(b); + } + return ret; +} + +static khash_t(set64) *loadpos(const char *fn, bam_header_t *h) +{ + gzFile fp; + kstream_t *ks; + int ret, dret; + kstring_t *str; + khash_t(set64) *hash; + + hash = kh_init(set64); + str = calloc(1, sizeof(kstring_t)); + fp = strcmp(fn, "-")? 
gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + ks = ks_init(fp); + while (ks_getuntil(ks, 0, str, &dret) >= 0) { + int tid = bam_get_tid(h, str->s); + if (tid >= 0 && dret != '\n') { + if (ks_getuntil(ks, 0, str, &dret) >= 0) { + uint64_t x = (uint64_t)tid<<32 | (atoi(str->s) - 1); + kh_put(set64, hash, x, &ret); + } else break; + } + if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); + if (dret < 0) break; + } + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + return hash; +} + +static int gl2cns(float q[16]) +{ + int i, j, min_ij; + float min, min2; + min = min2 = 1e30; min_ij = -1; + for (i = 0; i < 4; ++i) { + for (j = i; j < 4; ++j) { + if (q[i<<2|j] < min) min_ij = i<<2|j, min2 = min, min = q[i<<2|j]; + else if (q[i<<2|j] < min2) min2 = q[i<<2|j]; + } + } + return (min_ij>>2&3) == (min_ij&3)? 0 : 1<<18 | (min_ij>>2&3)<<16 | (min_ij&3) | (int)(min2 - min + .499) << 2; +} + +int main_phase(int argc, char *argv[]) +{ + extern void bam_init_header_hash(bam_header_t *header); + int c, tid, pos, vpos = 0, n, lasttid = -1, max_vpos = 0; + const bam_pileup1_t *plp; + bam_plp_t iter; + bam_header_t *h; + nseq_t *seqs; + uint64_t *cns = 0; + phaseg_t g; + char *fn_list = 0; + khash_t(set64) *set = 0; + errmod_t *em; + uint16_t *bases; + + memset(&g, 0, sizeof(phaseg_t)); + g.flag = FLAG_FIX_CHIMERA; + g.min_varLOD = 37; g.k = 13; g.min_baseQ = 13; g.max_depth = 256; + while ((c = getopt(argc, argv, "Q:eFq:k:b:l:D:A:")) >= 0) { + switch (c) { + case 'D': g.max_depth = atoi(optarg); break; + case 'q': g.min_varLOD = atoi(optarg); break; + case 'Q': g.min_baseQ = atoi(optarg); break; + case 'k': g.k = atoi(optarg); break; + case 'F': g.flag &= ~FLAG_FIX_CHIMERA; break; + case 'e': g.flag |= FLAG_LIST_EXCL; break; + case 'A': g.flag |= FLAG_DROP_AMBI; break; + case 'b': g.pre = strdup(optarg); break; + case 'l': fn_list = strdup(optarg); break; + } + } + if (argc == optind) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools phase 
[options] <in.bam>\n\n"); + fprintf(stderr, "Options: -k INT block length [%d]\n", g.k); + fprintf(stderr, " -b STR prefix of BAMs to output [null]\n"); + fprintf(stderr, " -q INT min het phred-LOD [%d]\n", g.min_varLOD); + fprintf(stderr, " -Q INT min base quality in het calling [%d]\n", g.min_baseQ); + fprintf(stderr, " -D INT max read depth [%d]\n", g.max_depth); +// fprintf(stderr, " -l FILE list of sites to phase [null]\n"); + fprintf(stderr, " -F do not attempt to fix chimeras\n"); + fprintf(stderr, " -A drop reads with ambiguous phase\n"); +// fprintf(stderr, " -e do not discover SNPs (effective with -l)\n"); + fprintf(stderr, "\n"); + return 1; + } + g.fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r"); + h = bam_header_read(g.fp); + if (fn_list) { // read the list of sites to phase + bam_init_header_hash(h); + set = loadpos(fn_list, h); + free(fn_list); + } else g.flag &= ~FLAG_LIST_EXCL; + if (g.pre) { // open BAMs to write + char *s = malloc(strlen(g.pre) + 20); + strcpy(s, g.pre); strcat(s, ".0.bam"); g.out[0] = bam_open(s, "w"); + strcpy(s, g.pre); strcat(s, ".1.bam"); g.out[1] = bam_open(s, "w"); + strcpy(s, g.pre); strcat(s, ".chimera.bam"); g.out[2] = bam_open(s, "w"); + for (c = 0; c <= 2; ++c) bam_header_write(g.out[c], h); + free(s); + } + + iter = bam_plp_init(readaln, &g); + g.vpos_shift = 0; + seqs = kh_init(64); + em = errmod_init(1. 
- 0.83); + bases = calloc(g.max_depth, 2); + printf("CC\n"); + printf("CC\tDescriptions:\nCC\n"); + printf("CC\t CC comments\n"); + printf("CC\t PS start of a phase set\n"); + printf("CC\t FL filtered region\n"); + printf("CC\t M[012] markers; 0 for singletons, 1 for phased and 2 for filtered\n"); + printf("CC\t EV supporting reads; SAM format\n"); + printf("CC\t // end of a phase set\nCC\n"); + printf("CC\tFormats of PS, FL and M[012] lines (1-based coordinates):\nCC\n"); + printf("CC\t PS chr phaseSetStart phaseSetEnd\n"); + printf("CC\t FL chr filterStart filterEnd\n"); + printf("CC\t M? chr PS pos allele0 allele1 hetIndex #supports0 #errors0 #supp1 #err1\n"); + printf("CC\nCC\n"); + fflush(stdout); + while ((plp = bam_plp_auto(iter, &tid, &pos, &n)) != 0) { + int i, k, c, tmp, dophase = 1, in_set = 0; + float q[16]; + if (tid < 0) break; + if (tid != lasttid) { // change of chromosome + g.vpos_shift = 0; + if (lasttid >= 0) { + seqs = shrink_hash(seqs); + phase(&g, h->target_name[lasttid], vpos, cns, seqs); + update_vpos(0x7fffffff, seqs); + } + lasttid = tid; + vpos = 0; + } + if (set && kh_get(set64, set, (uint64_t)tid<<32 | pos) != kh_end(set)) in_set = 1; + if (n > g.max_depth) continue; // do not proceed if the depth is too high + // fill the bases array and check if there is a variant + for (i = k = 0; i < n; ++i) { + const bam_pileup1_t *p = plp + i; + uint8_t *seq; + int q, baseQ, b; + if (p->is_del || p->is_refskip) continue; + baseQ = bam1_qual(p->b)[p->qpos]; + if (baseQ < g.min_baseQ) continue; + seq = bam1_seq(p->b); + b = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos)]; + if (b > 3) continue; + q = baseQ < p->b->core.qual? 
baseQ : p->b->core.qual; + if (q < 4) q = 4; + if (q > 63) q = 63; + bases[k++] = q<<5 | (int)bam1_strand(p->b)<<4 | b; + } + if (k == 0) continue; + errmod_cal(em, k, 4, bases, q); // compute genotype likelihood + c = gl2cns(q); // get the consensus + // tell if to proceed + if (set && (g.flag&FLAG_LIST_EXCL) && !in_set) continue; // not in the list + if (!in_set && (c&0xffff)>>2 < g.min_varLOD) continue; // not a variant + // add the variant + if (vpos == max_vpos) { + max_vpos = max_vpos? max_vpos<<1 : 128; + cns = realloc(cns, max_vpos * 8); + } + cns[vpos] = (uint64_t)pos<<32 | c; + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = plp + i; + uint64_t key; + khint_t k; + uint8_t *seq = bam1_seq(p->b); + frag_t *f; + if (p->is_del || p->is_refskip) continue; + if (p->b->core.qual == 0) continue; + // get the base code + c = nt16_nt4_table[(int)bam1_seqi(seq, p->qpos)]; + if (c == (cns[vpos]&3)) c = 1; + else if (c == (cns[vpos]>>16&3)) c = 2; + else c = 0; + // write to seqs + key = X31_hash_string(bam1_qname(p->b)); + k = kh_put(64, seqs, key, &tmp); + f = &kh_val(seqs, k); + if (tmp == 0) { // present in the hash table + if (vpos - f->vpos + 1 < MAX_VARS) { + f->vlen = vpos - f->vpos + 1; + f->seq[f->vlen-1] = c; + f->end = bam_calend(&p->b->core, bam1_cigar(p->b)); + } + dophase = 0; + } else { // absent + memset(f->seq, 0, MAX_VARS); + f->beg = p->b->core.pos; + f->end = bam_calend(&p->b->core, bam1_cigar(p->b)); + f->vpos = vpos, f->vlen = 1, f->seq[0] = c, f->single = f->phased = f->flip = f->ambig = 0; + } + } + if (dophase) { + seqs = shrink_hash(seqs); + phase(&g, h->target_name[tid], vpos, cns, seqs); + update_vpos(vpos, seqs); + cns[0] = cns[vpos]; + vpos = 0; + } + ++vpos; + } + if (tid >= 0) phase(&g, h->target_name[tid], vpos, cns, seqs); + bam_header_destroy(h); + bam_plp_destroy(iter); + bam_close(g.fp); + kh_destroy(64, seqs); + kh_destroy(set64, set); + free(cns); + errmod_destroy(em); + free(bases); + if (g.pre) { + for (c = 0; c <= 2; 
++c) bam_close(g.out[c]); + free(g.pre); free(g.b); + } + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/razf.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,853 @@ +/* + * RAZF : Random Access compressed(Z) File + * Version: 1.0 + * Release Date: 2008-10-27 + * + * Copyright 2008, Jue Ruan <ruanjue@gmail.com>, Heng Li <lh3@sanger.ac.uk> + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _NO_RAZF + +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include "razf.h" + + +#if ZLIB_VERNUM < 0x1221 +struct _gz_header_s { + int text; + uLong time; + int xflags; + int os; + Bytef *extra; + uInt extra_len; + uInt extra_max; + Bytef *name; + uInt name_max; + Bytef *comment; + uInt comm_max; + int hcrc; + int done; +}; +#warning "zlib < 1.2.2.1; RAZF writing is disabled." +#endif + +#define DEF_MEM_LEVEL 8 + +static inline uint32_t byte_swap_4(uint32_t v){ + v = ((v & 0x0000FFFFU) << 16) | (v >> 16); + return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); +} + +static inline uint64_t byte_swap_8(uint64_t v){ + v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32); + v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16); + return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8); +} + +static inline int is_big_endian(){ + int x = 0x01; + char *c = (char*)&x; + return (c[0] != 0x01); +} + +#ifndef _RZ_READONLY +static void add_zindex(RAZF *rz, int64_t in, int64_t out){ + if(rz->index->size == rz->index->cap){ + rz->index->cap = rz->index->cap * 1.5 + 2; + rz->index->cell_offsets = realloc(rz->index->cell_offsets, sizeof(int) * rz->index->cap); + rz->index->bin_offsets = realloc(rz->index->bin_offsets, sizeof(int64_t) * (rz->index->cap/RZ_BIN_SIZE + 1)); + } + if(rz->index->size % RZ_BIN_SIZE == 0) rz->index->bin_offsets[rz->index->size / RZ_BIN_SIZE] = out; + rz->index->cell_offsets[rz->index->size] = out - rz->index->bin_offsets[rz->index->size / RZ_BIN_SIZE]; + rz->index->size ++; +} + +static void save_zindex(RAZF *rz, int fd){ + int32_t i, v32; + int is_be; + is_be = is_big_endian(); + if(is_be) write(fd, &rz->index->size, sizeof(int)); + else { + v32 = byte_swap_4((uint32_t)rz->index->size); + write(fd, &v32, sizeof(uint32_t)); + } + v32 = rz->index->size / RZ_BIN_SIZE + 1; + if(!is_be){ + for(i=0;i<v32;i++) rz->index->bin_offsets[i] = 
byte_swap_8((uint64_t)rz->index->bin_offsets[i]); + for(i=0;i<rz->index->size;i++) rz->index->cell_offsets[i] = byte_swap_4((uint32_t)rz->index->cell_offsets[i]); + } + write(fd, rz->index->bin_offsets, sizeof(int64_t) * v32); + write(fd, rz->index->cell_offsets, sizeof(int32_t) * rz->index->size); +} +#endif + +#ifdef _USE_KNETFILE +static void load_zindex(RAZF *rz, knetFile *fp){ +#else +static void load_zindex(RAZF *rz, int fd){ +#endif + int32_t i, v32; + int is_be; + if(!rz->load_index) return; + if(rz->index == NULL) rz->index = malloc(sizeof(ZBlockIndex)); + is_be = is_big_endian(); +#ifdef _USE_KNETFILE + knet_read(fp, &rz->index->size, sizeof(int)); +#else + read(fd, &rz->index->size, sizeof(int)); +#endif + if(!is_be) rz->index->size = byte_swap_4((uint32_t)rz->index->size); + rz->index->cap = rz->index->size; + v32 = rz->index->size / RZ_BIN_SIZE + 1; + rz->index->bin_offsets = malloc(sizeof(int64_t) * v32); +#ifdef _USE_KNETFILE + knet_read(fp, rz->index->bin_offsets, sizeof(int64_t) * v32); +#else + read(fd, rz->index->bin_offsets, sizeof(int64_t) * v32); +#endif + rz->index->cell_offsets = malloc(sizeof(int) * rz->index->size); +#ifdef _USE_KNETFILE + knet_read(fp, rz->index->cell_offsets, sizeof(int) * rz->index->size); +#else + read(fd, rz->index->cell_offsets, sizeof(int) * rz->index->size); +#endif + if(!is_be){ + for(i=0;i<v32;i++) rz->index->bin_offsets[i] = byte_swap_8((uint64_t)rz->index->bin_offsets[i]); + for(i=0;i<rz->index->size;i++) rz->index->cell_offsets[i] = byte_swap_4((uint32_t)rz->index->cell_offsets[i]); + } +} + +#ifdef _RZ_READONLY +static RAZF* razf_open_w(int fd) +{ + fprintf(stderr, "[razf_open_w] Writing is not available with zlib ver < 1.2.2.1\n"); + return 0; +} +#else +static RAZF* razf_open_w(int fd){ + RAZF *rz; +#ifdef _WIN32 + setmode(fd, O_BINARY); +#endif + rz = calloc(1, sizeof(RAZF)); + rz->mode = 'w'; +#ifdef _USE_KNETFILE + rz->x.fpw = fd; +#else + rz->filedes = fd; +#endif + rz->stream = calloc(sizeof(z_stream), 
1); + rz->inbuf = malloc(RZ_BUFFER_SIZE); + rz->outbuf = malloc(RZ_BUFFER_SIZE); + rz->index = calloc(sizeof(ZBlockIndex), 1); + deflateInit2(rz->stream, RZ_COMPRESS_LEVEL, Z_DEFLATED, WINDOW_BITS + 16, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY); + rz->stream->avail_out = RZ_BUFFER_SIZE; + rz->stream->next_out = rz->outbuf; + rz->header = calloc(sizeof(gz_header), 1); + rz->header->os = 0x03; //Unix + rz->header->text = 0; + rz->header->time = 0; + rz->header->extra = malloc(7); + strncpy((char*)rz->header->extra, "RAZF", 4); + rz->header->extra[4] = 1; // obsolete field + // block size = RZ_BLOCK_SIZE, Big-Endian + rz->header->extra[5] = RZ_BLOCK_SIZE >> 8; + rz->header->extra[6] = RZ_BLOCK_SIZE & 0xFF; + rz->header->extra_len = 7; + rz->header->name = rz->header->comment = 0; + rz->header->hcrc = 0; + deflateSetHeader(rz->stream, rz->header); + rz->block_pos = rz->block_off = 0; + return rz; +} + +static void _razf_write(RAZF* rz, const void *data, int size){ + int tout; + rz->stream->avail_in = size; + rz->stream->next_in = (void*)data; + while(1){ + tout = rz->stream->avail_out; + deflate(rz->stream, Z_NO_FLUSH); + rz->out += tout - rz->stream->avail_out; + if(rz->stream->avail_out) break; +#ifdef _USE_KNETFILE + write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); +#else + write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); +#endif + rz->stream->avail_out = RZ_BUFFER_SIZE; + rz->stream->next_out = rz->outbuf; + if(rz->stream->avail_in == 0) break; + }; + rz->in += size - rz->stream->avail_in; + rz->block_off += size - rz->stream->avail_in; +} + +static void razf_flush(RAZF *rz){ + uint32_t tout; + if(rz->buf_len){ + _razf_write(rz, rz->inbuf, rz->buf_len); + rz->buf_off = rz->buf_len = 0; + } + if(rz->stream->avail_out){ +#ifdef _USE_KNETFILE + write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); +#else + write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); +#endif + rz->stream->avail_out = 
RZ_BUFFER_SIZE; + rz->stream->next_out = rz->outbuf; + } + while(1){ + tout = rz->stream->avail_out; + deflate(rz->stream, Z_FULL_FLUSH); + rz->out += tout - rz->stream->avail_out; + if(rz->stream->avail_out == 0){ +#ifdef _USE_KNETFILE + write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); +#else + write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); +#endif + rz->stream->avail_out = RZ_BUFFER_SIZE; + rz->stream->next_out = rz->outbuf; + } else break; + } + rz->block_pos = rz->out; + rz->block_off = 0; +} + +static void razf_end_flush(RAZF *rz){ + uint32_t tout; + if(rz->buf_len){ + _razf_write(rz, rz->inbuf, rz->buf_len); + rz->buf_off = rz->buf_len = 0; + } + while(1){ + tout = rz->stream->avail_out; + deflate(rz->stream, Z_FINISH); + rz->out += tout - rz->stream->avail_out; + if(rz->stream->avail_out < RZ_BUFFER_SIZE){ +#ifdef _USE_KNETFILE + write(rz->x.fpw, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); +#else + write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out); +#endif + rz->stream->avail_out = RZ_BUFFER_SIZE; + rz->stream->next_out = rz->outbuf; + } else break; + } +} + +static void _razf_buffered_write(RAZF *rz, const void *data, int size){ + int i, n; + while(1){ + if(rz->buf_len == RZ_BUFFER_SIZE){ + _razf_write(rz, rz->inbuf, rz->buf_len); + rz->buf_len = 0; + } + if(size + rz->buf_len < RZ_BUFFER_SIZE){ + for(i=0;i<size;i++) ((char*)rz->inbuf + rz->buf_len)[i] = ((char*)data)[i]; + rz->buf_len += size; + return; + } else { + n = RZ_BUFFER_SIZE - rz->buf_len; + for(i=0;i<n;i++) ((char*)rz->inbuf + rz->buf_len)[i] = ((char*)data)[i]; + size -= n; + data += n; + rz->buf_len += n; + } + } +} + +int razf_write(RAZF* rz, const void *data, int size){ + int ori_size, n; + int64_t next_block; + ori_size = size; + next_block = ((rz->in / RZ_BLOCK_SIZE) + 1) * RZ_BLOCK_SIZE; + while(rz->in + rz->buf_len + size >= next_block){ + n = next_block - rz->in - rz->buf_len; + _razf_buffered_write(rz, data, 
n); + data += n; + size -= n; + razf_flush(rz); + add_zindex(rz, rz->in, rz->out); + next_block = ((rz->in / RZ_BLOCK_SIZE) + 1) * RZ_BLOCK_SIZE; + } + _razf_buffered_write(rz, data, size); + return ori_size; +} +#endif + +/* gzip flag byte */ +#define ASCII_FLAG 0x01 /* bit 0 set: file probably ascii text */ +#define HEAD_CRC 0x02 /* bit 1 set: header CRC present */ +#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */ +#define ORIG_NAME 0x08 /* bit 3 set: original file name present */ +#define COMMENT 0x10 /* bit 4 set: file comment present */ +#define RESERVED 0xE0 /* bits 5..7: reserved */ + +static int _read_gz_header(unsigned char *data, int size, int *extra_off, int *extra_len){ + int method, flags, n, len; + if(size < 2) return 0; + if(data[0] != 0x1f || data[1] != 0x8b) return 0; + if(size < 4) return 0; + method = data[2]; + flags = data[3]; + if(method != Z_DEFLATED || (flags & RESERVED)) return 0; + n = 4 + 6; // Skip 6 bytes + *extra_off = n + 2; + *extra_len = 0; + if(flags & EXTRA_FIELD){ + if(size < n + 2) return 0; + len = ((int)data[n + 1] << 8) | data[n]; + n += 2; + *extra_off = n; + while(len){ + if(n >= size) return 0; + n ++; + len --; + } + *extra_len = n - (*extra_off); + } + if(flags & ORIG_NAME) while(n < size && data[n++]); + if(flags & COMMENT) while(n < size && data[n++]); + if(flags & HEAD_CRC){ + if(n + 2 > size) return 0; + n += 2; + } + return n; +} + +#ifdef _USE_KNETFILE +static RAZF* razf_open_r(knetFile *fp, int _load_index){ +#else +static RAZF* razf_open_r(int fd, int _load_index){ +#endif + RAZF *rz; + int ext_off, ext_len; + int n, is_be, ret; + int64_t end; + unsigned char c[] = "RAZF"; + rz = calloc(1, sizeof(RAZF)); + rz->mode = 'r'; +#ifdef _USE_KNETFILE + rz->x.fpr = fp; +#else +#ifdef _WIN32 + setmode(fd, O_BINARY); +#endif + rz->filedes = fd; +#endif + rz->stream = calloc(sizeof(z_stream), 1); + rz->inbuf = malloc(RZ_BUFFER_SIZE); + rz->outbuf = malloc(RZ_BUFFER_SIZE); + rz->end = rz->src_end = 
0x7FFFFFFFFFFFFFFFLL; +#ifdef _USE_KNETFILE + n = knet_read(rz->x.fpr, rz->inbuf, RZ_BUFFER_SIZE); +#else + n = read(rz->filedes, rz->inbuf, RZ_BUFFER_SIZE); +#endif + ret = _read_gz_header(rz->inbuf, n, &ext_off, &ext_len); + if(ret == 0){ + PLAIN_FILE: + rz->in = n; + rz->file_type = FILE_TYPE_PLAIN; + memcpy(rz->outbuf, rz->inbuf, n); + rz->buf_len = n; + free(rz->stream); + rz->stream = NULL; + return rz; + } + rz->header_size = ret; + ret = inflateInit2(rz->stream, -WINDOW_BITS); + if(ret != Z_OK){ inflateEnd(rz->stream); goto PLAIN_FILE;} + rz->stream->avail_in = n - rz->header_size; + rz->stream->next_in = rz->inbuf + rz->header_size; + rz->stream->avail_out = RZ_BUFFER_SIZE; + rz->stream->next_out = rz->outbuf; + rz->file_type = FILE_TYPE_GZ; + rz->in = rz->header_size; + rz->block_pos = rz->header_size; + rz->next_block_pos = rz->header_size; + rz->block_off = 0; + if(ext_len < 7 || memcmp(rz->inbuf + ext_off, c, 4) != 0) return rz; + if(((((unsigned char*)rz->inbuf)[ext_off + 5] << 8) | ((unsigned char*)rz->inbuf)[ext_off + 6]) != RZ_BLOCK_SIZE){ + fprintf(stderr, " -- WARNING: RZ_BLOCK_SIZE is not %d, treat source as gz file. 
in %s -- %s:%d --\n", RZ_BLOCK_SIZE, __FUNCTION__, __FILE__, __LINE__); + return rz; + } + rz->load_index = _load_index; + rz->file_type = FILE_TYPE_RZ; +#ifdef _USE_KNETFILE + if(knet_seek(fp, -16, SEEK_END) == -1){ +#else + if(lseek(fd, -16, SEEK_END) == -1){ +#endif + UNSEEKABLE: + rz->seekable = 0; + rz->index = NULL; + rz->src_end = rz->end = 0x7FFFFFFFFFFFFFFFLL; + } else { + is_be = is_big_endian(); + rz->seekable = 1; +#ifdef _USE_KNETFILE + knet_read(fp, &end, sizeof(int64_t)); +#else + read(fd, &end, sizeof(int64_t)); +#endif + if(!is_be) rz->src_end = (int64_t)byte_swap_8((uint64_t)end); + else rz->src_end = end; + +#ifdef _USE_KNETFILE + knet_read(fp, &end, sizeof(int64_t)); +#else + read(fd, &end, sizeof(int64_t)); +#endif + if(!is_be) rz->end = (int64_t)byte_swap_8((uint64_t)end); + else rz->end = end; + if(n > rz->end){ + rz->stream->avail_in -= n - rz->end; + n = rz->end; + } + if(rz->end > rz->src_end){ +#ifdef _USE_KNETFILE + knet_seek(fp, rz->in, SEEK_SET); +#else + lseek(fd, rz->in, SEEK_SET); +#endif + goto UNSEEKABLE; + } +#ifdef _USE_KNETFILE + knet_seek(fp, rz->end, SEEK_SET); + if(knet_tell(fp) != rz->end){ + knet_seek(fp, rz->in, SEEK_SET); +#else + if(lseek(fd, rz->end, SEEK_SET) != rz->end){ + lseek(fd, rz->in, SEEK_SET); +#endif + goto UNSEEKABLE; + } +#ifdef _USE_KNETFILE + load_zindex(rz, fp); + knet_seek(fp, n, SEEK_SET); +#else + load_zindex(rz, fd); + lseek(fd, n, SEEK_SET); +#endif + } + return rz; +} + +#ifdef _USE_KNETFILE +RAZF* razf_dopen(int fd, const char *mode){ + if (strstr(mode, "r")) fprintf(stderr,"[razf_dopen] implement me\n"); + else if(strstr(mode, "w")) return razf_open_w(fd); + return NULL; +} + +RAZF* razf_dopen2(int fd, const char *mode) +{ + fprintf(stderr,"[razf_dopen2] implement me\n"); + return NULL; +} +#else +RAZF* razf_dopen(int fd, const char *mode){ + if(strstr(mode, "r")) return razf_open_r(fd, 1); + else if(strstr(mode, "w")) return razf_open_w(fd); + else return NULL; +} + +RAZF* razf_dopen2(int fd, 
const char *mode) +{ + if(strstr(mode, "r")) return razf_open_r(fd, 0); + else if(strstr(mode, "w")) return razf_open_w(fd); + else return NULL; +} +#endif + +static inline RAZF* _razf_open(const char *filename, const char *mode, int _load_index){ + int fd; + RAZF *rz; + if(strstr(mode, "r")){ +#ifdef _USE_KNETFILE + knetFile *fd = knet_open(filename, "r"); + if (fd == 0) { + fprintf(stderr, "[_razf_open] fail to open %s\n", filename); + return NULL; + } +#else +#ifdef _WIN32 + fd = open(filename, O_RDONLY | O_BINARY); +#else + fd = open(filename, O_RDONLY); +#endif +#endif + if(fd < 0) return NULL; + rz = razf_open_r(fd, _load_index); + } else if(strstr(mode, "w")){ +#ifdef _WIN32 + fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0666); +#else + fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0666); +#endif + if(fd < 0) return NULL; + rz = razf_open_w(fd); + } else return NULL; + return rz; +} + +RAZF* razf_open(const char *filename, const char *mode){ + return _razf_open(filename, mode, 1); +} + +RAZF* razf_open2(const char *filename, const char *mode){ + return _razf_open(filename, mode, 0); +} + +int razf_get_data_size(RAZF *rz, int64_t *u_size, int64_t *c_size){ + int64_t n; + if(rz->mode != 'r' && rz->mode != 'R') return 0; + switch(rz->file_type){ + case FILE_TYPE_PLAIN: + if(rz->end == 0x7fffffffffffffffLL){ +#ifdef _USE_KNETFILE + if(knet_seek(rz->x.fpr, 0, SEEK_CUR) == -1) return 0; + n = knet_tell(rz->x.fpr); + knet_seek(rz->x.fpr, 0, SEEK_END); + rz->end = knet_tell(rz->x.fpr); + knet_seek(rz->x.fpr, n, SEEK_SET); +#else + if((n = lseek(rz->filedes, 0, SEEK_CUR)) == -1) return 0; + rz->end = lseek(rz->filedes, 0, SEEK_END); + lseek(rz->filedes, n, SEEK_SET); +#endif + } + *u_size = *c_size = rz->end; + return 1; + case FILE_TYPE_GZ: + return 0; + case FILE_TYPE_RZ: + if(rz->src_end == rz->end) return 0; + *u_size = rz->src_end; + *c_size = rz->end; + return 1; + default: + return 0; + } +} + +static int _razf_read(RAZF* rz, void *data, 
int size){ + int ret, tin; + if(rz->z_eof || rz->z_err) return 0; + if (rz->file_type == FILE_TYPE_PLAIN) { +#ifdef _USE_KNETFILE + ret = knet_read(rz->x.fpr, data, size); +#else + ret = read(rz->filedes, data, size); +#endif + if (ret == 0) rz->z_eof = 1; + return ret; + } + rz->stream->avail_out = size; + rz->stream->next_out = data; + while(rz->stream->avail_out){ + if(rz->stream->avail_in == 0){ + if(rz->in >= rz->end){ rz->z_eof = 1; break; } + if(rz->end - rz->in < RZ_BUFFER_SIZE){ +#ifdef _USE_KNETFILE + rz->stream->avail_in = knet_read(rz->x.fpr, rz->inbuf, rz->end -rz->in); +#else + rz->stream->avail_in = read(rz->filedes, rz->inbuf, rz->end -rz->in); +#endif + } else { +#ifdef _USE_KNETFILE + rz->stream->avail_in = knet_read(rz->x.fpr, rz->inbuf, RZ_BUFFER_SIZE); +#else + rz->stream->avail_in = read(rz->filedes, rz->inbuf, RZ_BUFFER_SIZE); +#endif + } + if(rz->stream->avail_in == 0){ + rz->z_eof = 1; + break; + } + rz->stream->next_in = rz->inbuf; + } + tin = rz->stream->avail_in; + ret = inflate(rz->stream, Z_BLOCK); + rz->in += tin - rz->stream->avail_in; + if(ret == Z_NEED_DICT || ret == Z_MEM_ERROR || ret == Z_DATA_ERROR){ + fprintf(stderr, "[_razf_read] inflate error: %d %s (at %s:%d)\n", ret, rz->stream->msg ? 
rz->stream->msg : "", __FILE__, __LINE__); + rz->z_err = 1; + break; + } + if(ret == Z_STREAM_END){ + rz->z_eof = 1; + break; + } + if ((rz->stream->data_type&128) && !(rz->stream->data_type&64)){ + rz->buf_flush = 1; + rz->next_block_pos = rz->in; + break; + } + } + return size - rz->stream->avail_out; +} + +int razf_read(RAZF *rz, void *data, int size){ + int ori_size, i; + ori_size = size; + while(size > 0){ + if(rz->buf_len){ + if(size < rz->buf_len){ + for(i=0;i<size;i++) ((char*)data)[i] = ((char*)rz->outbuf + rz->buf_off)[i]; + rz->buf_off += size; + rz->buf_len -= size; + data += size; + rz->block_off += size; + size = 0; + break; + } else { + for(i=0;i<rz->buf_len;i++) ((char*)data)[i] = ((char*)rz->outbuf + rz->buf_off)[i]; + data += rz->buf_len; + size -= rz->buf_len; + rz->block_off += rz->buf_len; + rz->buf_off = 0; + rz->buf_len = 0; + if(rz->buf_flush){ + rz->block_pos = rz->next_block_pos; + rz->block_off = 0; + rz->buf_flush = 0; + } + } + } else if(rz->buf_flush){ + rz->block_pos = rz->next_block_pos; + rz->block_off = 0; + rz->buf_flush = 0; + } + if(rz->buf_flush) continue; + rz->buf_len = _razf_read(rz, rz->outbuf, RZ_BUFFER_SIZE); + if(rz->z_eof && rz->buf_len == 0) break; + } + rz->out += ori_size - size; + return ori_size - size; +} + +int razf_skip(RAZF* rz, int size){ + int ori_size; + ori_size = size; + while(size > 0){ + if(rz->buf_len){ + if(size < rz->buf_len){ + rz->buf_off += size; + rz->buf_len -= size; + rz->block_off += size; + size = 0; + break; + } else { + size -= rz->buf_len; + rz->buf_off = 0; + rz->buf_len = 0; + rz->block_off += rz->buf_len; + if(rz->buf_flush){ + rz->block_pos = rz->next_block_pos; + rz->block_off = 0; + rz->buf_flush = 0; + } + } + } else if(rz->buf_flush){ + rz->block_pos = rz->next_block_pos; + rz->block_off = 0; + rz->buf_flush = 0; + } + if(rz->buf_flush) continue; + rz->buf_len = _razf_read(rz, rz->outbuf, RZ_BUFFER_SIZE); + if(rz->z_eof || rz->z_err) break; + } + rz->out += ori_size - size; + return 
ori_size - size; +} + +static void _razf_reset_read(RAZF *rz, int64_t in, int64_t out){ +#ifdef _USE_KNETFILE + knet_seek(rz->x.fpr, in, SEEK_SET); +#else + lseek(rz->filedes, in, SEEK_SET); +#endif + rz->in = in; + rz->out = out; + rz->block_pos = in; + rz->next_block_pos = in; + rz->block_off = 0; + rz->buf_flush = 0; + rz->z_eof = rz->z_err = 0; + inflateReset(rz->stream); + rz->stream->avail_in = 0; + rz->buf_off = rz->buf_len = 0; +} + +int64_t razf_jump(RAZF *rz, int64_t block_start, int block_offset){ + int64_t pos; + rz->z_eof = 0; + if(rz->file_type == FILE_TYPE_PLAIN){ + rz->buf_off = rz->buf_len = 0; + pos = block_start + block_offset; +#ifdef _USE_KNETFILE + knet_seek(rz->x.fpr, pos, SEEK_SET); + pos = knet_tell(rz->x.fpr); +#else + pos = lseek(rz->filedes, pos, SEEK_SET); +#endif + rz->out = rz->in = pos; + return pos; + } + if(block_start == rz->block_pos && block_offset >= rz->block_off) { + block_offset -= rz->block_off; + goto SKIP; // Needn't reset inflate + } + if(block_start == 0) block_start = rz->header_size; // Automaticly revist wrong block_start + _razf_reset_read(rz, block_start, 0); + SKIP: + if(block_offset) razf_skip(rz, block_offset); + return rz->block_off; +} + +int64_t razf_seek(RAZF* rz, int64_t pos, int where){ + int64_t idx; + int64_t seek_pos, new_out; + rz->z_eof = 0; + if (where == SEEK_CUR) pos += rz->out; + else if (where == SEEK_END) pos += rz->src_end; + if(rz->file_type == FILE_TYPE_PLAIN){ +#ifdef _USE_KNETFILE + knet_seek(rz->x.fpr, pos, SEEK_SET); + seek_pos = knet_tell(rz->x.fpr); +#else + seek_pos = lseek(rz->filedes, pos, SEEK_SET); +#endif + rz->buf_off = rz->buf_len = 0; + rz->out = rz->in = seek_pos; + return seek_pos; + } else if(rz->file_type == FILE_TYPE_GZ){ + if(pos >= rz->out) goto SKIP; + return rz->out; + } + if(pos == rz->out) return pos; + if(pos > rz->src_end) return rz->out; + if(!rz->seekable || !rz->load_index){ + if(pos >= rz->out) goto SKIP; + } + idx = pos / RZ_BLOCK_SIZE - 1; + seek_pos = (idx < 
0)? rz->header_size:(rz->index->cell_offsets[idx] + rz->index->bin_offsets[idx / RZ_BIN_SIZE]); + new_out = (idx + 1) * RZ_BLOCK_SIZE; + if(pos > rz->out && new_out <= rz->out) goto SKIP; + _razf_reset_read(rz, seek_pos, new_out); + SKIP: + razf_skip(rz, (int)(pos - rz->out)); + return rz->out; +} + +uint64_t razf_tell2(RAZF *rz) +{ + /* + if (rz->load_index) { + int64_t idx, seek_pos; + idx = rz->out / RZ_BLOCK_SIZE - 1; + seek_pos = (idx < 0)? rz->header_size:(rz->index->cell_offsets[idx] + rz->index->bin_offsets[idx / RZ_BIN_SIZE]); + if (seek_pos != rz->block_pos || rz->out%RZ_BLOCK_SIZE != rz->block_off) + fprintf(stderr, "[razf_tell2] inconsistent block offset: (%lld, %lld) != (%lld, %lld)\n", + (long long)seek_pos, (long long)rz->out%RZ_BLOCK_SIZE, (long long)rz->block_pos, (long long) rz->block_off); + } + */ + return (uint64_t)rz->block_pos<<16 | (rz->block_off&0xffff); +} + +int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where) +{ + if (where != SEEK_SET) return -1; + return razf_jump(rz, voffset>>16, voffset&0xffff); +} + +void razf_close(RAZF *rz){ + if(rz->mode == 'w'){ +#ifndef _RZ_READONLY + razf_end_flush(rz); + deflateEnd(rz->stream); +#ifdef _USE_KNETFILE + save_zindex(rz, rz->x.fpw); + if(is_big_endian()){ + write(rz->x.fpw, &rz->in, sizeof(int64_t)); + write(rz->x.fpw, &rz->out, sizeof(int64_t)); + } else { + uint64_t v64 = byte_swap_8((uint64_t)rz->in); + write(rz->x.fpw, &v64, sizeof(int64_t)); + v64 = byte_swap_8((uint64_t)rz->out); + write(rz->x.fpw, &v64, sizeof(int64_t)); + } +#else + save_zindex(rz, rz->filedes); + if(is_big_endian()){ + write(rz->filedes, &rz->in, sizeof(int64_t)); + write(rz->filedes, &rz->out, sizeof(int64_t)); + } else { + uint64_t v64 = byte_swap_8((uint64_t)rz->in); + write(rz->filedes, &v64, sizeof(int64_t)); + v64 = byte_swap_8((uint64_t)rz->out); + write(rz->filedes, &v64, sizeof(int64_t)); + } +#endif +#endif + } else if(rz->mode == 'r'){ + if(rz->stream) inflateEnd(rz->stream); + } + if(rz->inbuf) 
free(rz->inbuf); + if(rz->outbuf) free(rz->outbuf); + if(rz->header){ + free(rz->header->extra); + free(rz->header->name); + free(rz->header->comment); + free(rz->header); + } + if(rz->index){ + free(rz->index->bin_offsets); + free(rz->index->cell_offsets); + free(rz->index); + } + free(rz->stream); +#ifdef _USE_KNETFILE + if (rz->mode == 'r') + knet_close(rz->x.fpr); + if (rz->mode == 'w') + close(rz->x.fpw); +#else + close(rz->filedes); +#endif + free(rz); +} + +#endif
/* File: PsiCLASS-1.0.2/samtools-0.1.19/razf.h */
 /*-
 * RAZF : Random Access compressed(Z) File
 * Version: 1.0
 * Release Date: 2008-10-27
 *
 * Copyright 2008, Jue Ruan <ruanjue@gmail.com>, Heng Li <lh3@sanger.ac.uk>
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


#ifndef __RAZF_RJ_H
#define __RAZF_RJ_H

#include <stdint.h>
#include <stdio.h>
#include "zlib.h"

#ifdef _USE_KNETFILE
#include "knetfile.h"
#endif

/* With zlib older than 1.2.2.1 the library is built read-only and gz_header
 * is mapped to a locally declared stand-in type. */
#if ZLIB_VERNUM < 0x1221
#define _RZ_READONLY
struct _gz_header_s;
typedef struct _gz_header_s _gz_header;
#define gz_header _gz_header
#endif

#define WINDOW_BITS   15

/* Uncompressed bytes per indexed block. */
#ifndef RZ_BLOCK_SIZE
#define RZ_BLOCK_SIZE (1<<WINDOW_BITS)
#endif

/* Size in bytes of the input and output staging buffers. */
#ifndef RZ_BUFFER_SIZE
#define RZ_BUFFER_SIZE 4096
#endif

/* Compression level passed to deflateInit2(). */
#ifndef RZ_COMPRESS_LEVEL
#define RZ_COMPRESS_LEVEL 6
#endif

/* Number of consecutive blocks that share one 64-bit base offset (one "bin"). */
#define RZ_BIN_SIZE ((1LLU << 32) / RZ_BLOCK_SIZE)

/* Block index: the compressed offset of block i is
 * bin_offsets[i / RZ_BIN_SIZE] + cell_offsets[i]. */
typedef struct {
	uint32_t *cell_offsets; // indexed by block number i: offset relative to the bin base
	int64_t  *bin_offsets; // indexed by i / RZ_BIN_SIZE: base offset of the bin
	int size; // number of entries in use
	int cap; // number of entries allocated
} ZBlockIndex;
/* When the index is stored, all values are written in big-endian byte order. */

#define FILE_TYPE_RZ	1
#define FILE_TYPE_PLAIN	2
#define FILE_TYPE_GZ	3

typedef struct RandomAccessZFile  {
	char mode; /* 'w' : write mode; 'r' : read mode */
	int file_type;
	/* plain file or rz file; razf_read also supports plain files as input, in which case it works as a buffered read */
#ifdef _USE_KNETFILE
    union {
        knetFile *fpr;
        int fpw;
    } x;
#else
	int filedes; /* the file descriptor */
#endif
	z_stream *stream;
	ZBlockIndex *index;
	int64_t in, out, end, src_end;
	/* in: total number of bytes in; out: total number of bytes out; */
	/* end: the end of all data blocks, which is also the start of the index; src_end: the true end position in the uncompressed file */
	int buf_flush; // the buffer should be flushed; inflate is suspended until the buffer is empty
	int64_t block_pos, block_off, next_block_pos;
	/* block_pos: the start position of the current block in the compressed file */
	/* block_off: how many bytes have been read from the current block */
	void *inbuf, *outbuf;
	int header_size;
	gz_header *header;
	/* header is used to transfer inflate_state->mode from HEAD to TYPE after calling inflateReset */
	int buf_off, buf_len;
	int z_err, z_eof;
	int seekable;
	/* Indicates whether the source is seekable */
	int load_index;
	/* When zero, the block index is not loaded in read mode (the original note called this field has_index) */
} RAZF;

#ifdef __cplusplus
extern "C" {
#endif

	/* Primary API: open by descriptor or by name, write, read, seek by
	 * uncompressed offset, and close. */
	RAZF* razf_dopen(int data_fd, const char *mode);
	RAZF *razf_open(const char *fn, const char *mode);
	int razf_write(RAZF* rz, const void *data, int size);
	int razf_read(RAZF* rz, void *data, int size);
	int64_t razf_seek(RAZF* rz, int64_t pos, int where);
	void razf_close(RAZF* rz);

/* Current uncompressed position, read straight from the handle. */
#define razf_tell(rz) ((rz)->out)

	/* "2" variants: open without loading the block index, and tell/seek using
	 * a packed (block_pos << 16 | block_off) virtual offset. */
	RAZF* razf_open2(const char *filename, const char *mode);
	RAZF* razf_dopen2(int fd, const char *mode);
	uint64_t razf_tell2(RAZF *rz);
	int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where);

#ifdef __cplusplus
}
#endif

#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/razip.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,141 @@ +#include <stdio.h> +#include <fcntl.h> +#include <unistd.h> +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include "razf.h" + +#define WINDOW_SIZE 4096 + +static int razf_main_usage() +{ + printf("\n"); + printf("Usage: razip [options] [file] ...\n\n"); + printf("Options: -c write on standard output, keep original files unchanged\n"); + printf(" -d decompress\n"); + printf(" -l list compressed file contents\n"); + printf(" -b INT decompress at INT position in the uncompressed file\n"); + printf(" -s INT decompress INT bytes in the uncompressed file\n"); + printf(" -h give this help\n"); + printf("\n"); + return 0; +} + +static int write_open(const char *fn, int is_forced) +{ + int fd = -1; + char c; + if (!is_forced) { + if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0666)) < 0 && errno == EEXIST) { + printf("razip: %s already exists; do you wish to overwrite (y or n)? 
", fn); + scanf("%c", &c); + if (c != 'Y' && c != 'y') { + printf("razip: not overwritten\n"); + exit(1); + } + } + } + if (fd < 0) { + if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0) { + fprintf(stderr, "razip: %s: Fail to write\n", fn); + exit(1); + } + } + return fd; +} + +int main(int argc, char **argv) +{ + int c, compress, pstdout, is_forced; + RAZF *rz; + void *buffer; + long start, end, size; + + compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; + while((c = getopt(argc, argv, "cdlhfb:s:")) >= 0){ + switch(c){ + case 'h': return razf_main_usage(); + case 'd': compress = 0; break; + case 'c': pstdout = 1; break; + case 'l': compress = 2; break; + case 'b': start = atol(optarg); break; + case 's': size = atol(optarg); break; + case 'f': is_forced = 1; break; + } + } + if (size >= 0) end = start + size; + if(end >= 0 && end < start){ + fprintf(stderr, " -- Illegal region: [%ld, %ld] --\n", start, end); + return 1; + } + if(compress == 1){ + int f_src, f_dst = -1; + if(argc > optind){ + if((f_src = open(argv[optind], O_RDONLY)) < 0){ + fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]); + return 1; + } + if(pstdout){ + f_dst = fileno(stdout); + } else { + char *name = malloc(sizeof(strlen(argv[optind]) + 5)); + strcpy(name, argv[optind]); + strcat(name, ".rz"); + f_dst = write_open(name, is_forced); + if (f_dst < 0) return 1; + free(name); + } + } else if(pstdout){ + f_src = fileno(stdin); + f_dst = fileno(stdout); + } else return razf_main_usage(); + rz = razf_dopen(f_dst, "w"); + buffer = malloc(WINDOW_SIZE); + while((c = read(f_src, buffer, WINDOW_SIZE)) > 0) razf_write(rz, buffer, c); + razf_close(rz); // f_dst will be closed here + if (argc > optind && !pstdout) unlink(argv[optind]); + free(buffer); + close(f_src); + return 0; + } else { + if(argc <= optind) return razf_main_usage(); + if(compress == 2){ + rz = razf_open(argv[optind], "r"); + if(rz->file_type == FILE_TYPE_RZ) { + printf("%20s%20s%7s %s\n", 
"compressed", "uncompressed", "ratio", "name"); + printf("%20lld%20lld%6.1f%% %s\n", (long long)rz->end, (long long)rz->src_end, rz->end * 100.0f / rz->src_end, + argv[optind]); + } else fprintf(stdout, "%s is not a regular rz file\n", argv[optind]); + } else { + int f_dst; + if (argc > optind && !pstdout) { + char *name; + if (strstr(argv[optind], ".rz") - argv[optind] != strlen(argv[optind]) - 3) { + printf("razip: %s: unknown suffix -- ignored\n", argv[optind]); + return 1; + } + name = strdup(argv[optind]); + name[strlen(name) - 3] = '\0'; + f_dst = write_open(name, is_forced); + free(name); + } else f_dst = fileno(stdout); + rz = razf_open(argv[optind], "r"); + buffer = malloc(WINDOW_SIZE); + razf_seek(rz, start, SEEK_SET); + while(1){ + if(end < 0) c = razf_read(rz, buffer, WINDOW_SIZE); + else c = razf_read(rz, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start)); + if(c <= 0) break; + start += c; + write(f_dst, buffer, c); + if(end >= 0 && start >= end) break; + } + free(buffer); + if (!pstdout) unlink(argv[optind]); + } + razf_close(rz); + return 0; + } +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/sam.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,186 @@ +#include <string.h> +#include <unistd.h> +#include "faidx.h" +#include "sam.h" + +#define TYPE_BAM 1 +#define TYPE_READ 2 + +bam_header_t *bam_header_dup(const bam_header_t *h0) +{ + bam_header_t *h; + int i; + h = bam_header_init(); + *h = *h0; + h->hash = h->dict = h->rg2lib = 0; + h->text = (char*)calloc(h->l_text + 1, 1); + memcpy(h->text, h0->text, h->l_text); + h->target_len = (uint32_t*)calloc(h->n_targets, 4); + h->target_name = (char**)calloc(h->n_targets, sizeof(void*)); + for (i = 0; i < h->n_targets; ++i) { + h->target_len[i] = h0->target_len[i]; + h->target_name[i] = strdup(h0->target_name[i]); + } + return h; +} +static void append_header_text(bam_header_t *header, char* text, int len) +{ + int x = header->l_text + 1; + int y = header->l_text + len + 1; // 1 byte null + if (text == 0) return; + kroundup32(x); + kroundup32(y); + if (x < y) header->text = (char*)realloc(header->text, y); + strncpy(header->text + header->l_text, text, len); // we cannot use strcpy() here. + header->l_text += len; + header->text[header->l_text] = 0; +} + +int samthreads(samfile_t *fp, int n_threads, int n_sub_blks) +{ + if (!(fp->type&1) || (fp->type&2)) return -1; + bgzf_mt(fp->x.bam, n_threads, n_sub_blks); + return 0; +} + +samfile_t *samopen(const char *fn, const char *mode, const void *aux) +{ + samfile_t *fp; + fp = (samfile_t*)calloc(1, sizeof(samfile_t)); + if (strchr(mode, 'r')) { // read + fp->type |= TYPE_READ; + if (strchr(mode, 'b')) { // binary + fp->type |= TYPE_BAM; + fp->x.bam = strcmp(fn, "-")? 
bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); + if (fp->x.bam == 0) goto open_err_ret; + fp->header = bam_header_read(fp->x.bam); + } else { // text + fp->x.tamr = sam_open(fn); + if (fp->x.tamr == 0) goto open_err_ret; + fp->header = sam_header_read(fp->x.tamr); + if (fp->header->n_targets == 0) { // no @SQ fields + if (aux) { // check if aux is present + bam_header_t *textheader = fp->header; + fp->header = sam_header_read2((const char*)aux); + if (fp->header == 0) goto open_err_ret; + append_header_text(fp->header, textheader->text, textheader->l_text); + bam_header_destroy(textheader); + } + if (fp->header->n_targets == 0 && bam_verbose >= 1) + fprintf(stderr, "[samopen] no @SQ lines in the header.\n"); + } else if (bam_verbose >= 2) fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets); + } + } else if (strchr(mode, 'w')) { // write + fp->header = bam_header_dup((const bam_header_t*)aux); + if (strchr(mode, 'b')) { // binary + char bmode[3]; + int i, compress_level = -1; + for (i = 0; mode[i]; ++i) if (mode[i] >= '0' && mode[i] <= '9') break; + if (mode[i]) compress_level = mode[i] - '0'; + if (strchr(mode, 'u')) compress_level = 0; + bmode[0] = 'w'; bmode[1] = compress_level < 0? 0 : compress_level + '0'; bmode[2] = 0; + fp->type |= TYPE_BAM; + fp->x.bam = strcmp(fn, "-")? bam_open(fn, bmode) : bam_dopen(fileno(stdout), bmode); + if (fp->x.bam == 0) goto open_err_ret; + bam_header_write(fp->x.bam, fp->header); + } else { // text + // open file + fp->x.tamw = strcmp(fn, "-")? 
fopen(fn, "w") : stdout; + if (fp->x.tamw == 0) goto open_err_ret; + if (strchr(mode, 'X')) fp->type |= BAM_OFSTR<<2; + else if (strchr(mode, 'x')) fp->type |= BAM_OFHEX<<2; + else fp->type |= BAM_OFDEC<<2; + // write header + if (strchr(mode, 'h')) { + int i; + bam_header_t *alt; + // parse the header text + alt = bam_header_init(); + alt->l_text = fp->header->l_text; alt->text = fp->header->text; + sam_header_parse(alt); + alt->l_text = 0; alt->text = 0; + // check if there are @SQ lines in the header + fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); // FIXME: better to skip the trailing NULL + if (alt->n_targets) { // then write the header text without dumping ->target_{name,len} + if (alt->n_targets != fp->header->n_targets && bam_verbose >= 1) + fprintf(stderr, "[samopen] inconsistent number of target sequences. Output the text header.\n"); + } else { // then dump ->target_{name,len} + for (i = 0; i < fp->header->n_targets; ++i) + fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]); + } + bam_header_destroy(alt); + } + } + } + return fp; + +open_err_ret: + free(fp); + return 0; +} + +void samclose(samfile_t *fp) +{ + if (fp == 0) return; + if (fp->header) bam_header_destroy(fp->header); + if (fp->type & TYPE_BAM) bam_close(fp->x.bam); + else if (fp->type & TYPE_READ) sam_close(fp->x.tamr); + else fclose(fp->x.tamw); + free(fp); +} + +int samread(samfile_t *fp, bam1_t *b) +{ + if (fp == 0 || !(fp->type & TYPE_READ)) return -1; // not open for reading + if (fp->type & TYPE_BAM) return bam_read1(fp->x.bam, b); + else return sam_read1(fp->x.tamr, fp->header, b); +} + +int samwrite(samfile_t *fp, const bam1_t *b) +{ + if (fp == 0 || (fp->type & TYPE_READ)) return -1; // not open for writing + if (fp->type & TYPE_BAM) return bam_write1(fp->x.bam, b); + else { + char *s = bam_format1_core(fp->header, b, fp->type>>2&3); + int l = strlen(s); + fputs(s, fp->x.tamw); fputc('\n', fp->x.tamw); + free(s); + return l 
+ 1; + } +} + +int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *func_data) +{ + bam_plbuf_t *buf; + int ret; + bam1_t *b; + b = bam_init1(); + buf = bam_plbuf_init(func, func_data); + bam_plbuf_set_mask(buf, mask); + while ((ret = samread(fp, b)) >= 0) + bam_plbuf_push(b, buf); + bam_plbuf_push(0, buf); + bam_plbuf_destroy(buf); + bam_destroy1(b); + return 0; +} + +char *samfaipath(const char *fn_ref) +{ + char *fn_list = 0; + if (fn_ref == 0) return 0; + fn_list = calloc(strlen(fn_ref) + 5, 1); + strcat(strcpy(fn_list, fn_ref), ".fai"); + if (access(fn_list, R_OK) == -1) { // fn_list is unreadable + if (access(fn_ref, R_OK) == -1) { + fprintf(stderr, "[samfaipath] fail to read file %s.\n", fn_ref); + } else { + if (bam_verbose >= 3) fprintf(stderr, "[samfaipath] build FASTA index...\n"); + if (fai_build(fn_ref) == -1) { + fprintf(stderr, "[samfaipath] fail to build FASTA index.\n"); + free(fn_list); fn_list = 0; + } + } + } + return fn_list; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/sam.h Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,99 @@ +#ifndef BAM_SAM_H +#define BAM_SAM_H + +#include "bam.h" + +/*! + @header + + This file provides higher-level I/O routines and unifies the APIs + for SAM and BAM formats. These APIs are more convenient and + recommended. + + @copyright Genome Research Ltd. + */ + +/*! @typedef + @abstract SAM/BAM file handler + @field type type of the handler; bit 1 for BAM, bit 2 for reading and bits 3-4 for flag format + @field bam BAM file handler; valid if (type&1) == 1 + @field tamr SAM file handler for reading; valid if (type&3) == 2 + @field tamw SAM file handler for writing; valid if (type&3) == 0 + @field header header struct + */ +typedef struct { + int type; + union { + tamFile tamr; + bamFile bam; + FILE *tamw; + } x; + bam_header_t *header; +} samfile_t; + +#ifdef __cplusplus +extern "C" { +#endif + + /*! + @abstract Open a SAM/BAM file + + @param fn SAM/BAM file name; "-" is recognized as stdin (for + reading) or stdout (for writing). + + @param mode open mode /[rw](b?)(u?)(h?)([xX]?)/: 'r' for reading, + 'w' for writing, 'b' for BAM I/O, 'u' for uncompressed BAM output, + 'h' for outputting the header in SAM, 'x' for HEX flag and 'X' for + string flag. If 'b' is present, it must immediately follow 'r' or + 'w'. Valid modes are "r", "w", "wh", "wx", "whx", "wX", "whX", + "rb", "wb" and "wbu" exclusively. + + @param aux auxiliary data; if mode[0]=='w', aux points to + bam_header_t; if strcmp(mode, "rb")!=0 and @SQ header lines in SAM + are absent, aux points to the file name of the list of the reference; + aux is not used otherwise. If @SQ header lines are present in SAM, + aux is not used, either. + + @return SAM/BAM file handler, or 0 if the file cannot be opened + */ + samfile_t *samopen(const char *fn, const char *mode, const void *aux); + + /*! + @abstract Close a SAM/BAM handler + @param fp file handler to be closed; may be 0 + */ + void samclose(samfile_t *fp); + + /*! 
+ @abstract Read one alignment + @param fp file handler + @param b alignment + @return bytes read; -1 if fp is 0 or not open for reading + */ + int samread(samfile_t *fp, bam1_t *b); + + /*! + @abstract Write one alignment + @param fp file handler + @param b alignment + @return bytes written; -1 if fp is 0 or open for reading + */ + int samwrite(samfile_t *fp, const bam1_t *b); + + /*! + @abstract Get the pileup for a whole alignment file + @param fp file handler + @param mask mask transferred to bam_plbuf_set_mask() + @param func user defined function called in the pileup process + @param data user provided data for func() + */ + int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *data); + + /*! + @abstract Get the path of the FASTA index of a reference + @param fn_ref FASTA file name + @return heap-allocated string fn_ref + ".fai"; the index is built with fai_build() when it is not readable but fn_ref is; 0 if fn_ref is 0 or the index cannot be built + */ + char *samfaipath(const char *fn_ref); + /*! + @abstract Pass the BAM stream of fp to bgzf_mt() + @return 0 on success; -1 if fp is not a BAM handler opened for writing + */ + int samthreads(samfile_t *fp, int n_threads, int n_sub_blks); + +#ifdef __cplusplus +} +#endif + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/sam_header.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,810 @@ +#include "sam_header.h" +#include <stdio.h> +#include <string.h> +#include <ctype.h> +#include <stdlib.h> +#include <stdarg.h> + +#include "khash.h" +KHASH_MAP_INIT_STR(str, const char *) + +/* Singly linked list node. A HeaderDict is a list whose data pointers are HeaderLine*; HeaderLine::tags is a list whose data pointers are HeaderTag*. */ +struct _HeaderList +{ + struct _HeaderList *last; // Hack: Used and maintained only by list_append_to_end. Maintained in the root node only. + struct _HeaderList *next; + void *data; +}; +typedef struct _HeaderList list_t; +typedef list_t HeaderDict; + +/* One tag of a header line: a two-character key (not NUL terminated) and a heap-allocated value string. */ +typedef struct +{ + char key[2]; + char *value; +} +HeaderTag; + +/* One header line: the two-character record type following '@' and the list of its tags. */ +typedef struct +{ + char type[2]; + list_t *tags; +} +HeaderLine; + +/* NULL-terminated tag name tables per record type: o_ = optional, r_ = required, u_ = unique. */ +const char *o_hd_tags[] = {"SO","GO",NULL}; +const char *r_hd_tags[] = {"VN",NULL}; + +const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL}; +const char *r_sq_tags[] = {"SN","LN",NULL}; +const char *u_sq_tags[] = {"SN",NULL}; + +const char *o_rg_tags[] = {"CN","DS","DT","FO","KS","LB","PG","PI","PL","PU","SM",NULL}; +const char *r_rg_tags[] = {"ID",NULL}; +const char *u_rg_tags[] = {"ID",NULL}; + +const char *o_pg_tags[] = {"VN","CL",NULL}; +const char *r_pg_tags[] = {"ID",NULL}; + +/* Known record types. The index of a type in types[] (as returned by tag_exists) selects the matching row of optional_tags, required_tags and unique_tags; a NULL row means the type has no such tags. */ +const char *types[] = {"HD","SQ","RG","PG","CO",NULL}; +const char **optional_tags[] = {o_hd_tags,o_sq_tags,o_rg_tags,o_pg_tags,NULL,NULL}; +const char **required_tags[] = {r_hd_tags,r_sq_tags,r_rg_tags,r_pg_tags,NULL,NULL}; +const char **unique_tags[] = {NULL, u_sq_tags,u_rg_tags,NULL,NULL,NULL}; + + +/* printf-style diagnostic message written to stderr. */ +static void debug(const char *format, ...) +{ + va_list ap; + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); +} + +#if 0 +// Replaced by list_append_to_end +static list_t *list_prepend(list_t *root, void *data) +{ + list_t *l = malloc(sizeof(list_t)); + l->next = root; + l->data = data; + return l; +} +#endif + +// Relies on the root->last being correct. Do not use with the other list_* +// routines unless they are fixed to modify root->last as well. 
+static list_t *list_append_to_end(list_t *root, void *data) +{ + list_t *l = malloc(sizeof(list_t)); + l->last = l; + l->next = NULL; + l->data = data; + + if ( !root ) + return l; + + root->last->next = l; + root->last = l; + return root; +} + +static list_t *list_append(list_t *root, void *data) +{ + list_t *l = root; + while (l && l->next) + l = l->next; + if ( l ) + { + l->next = malloc(sizeof(list_t)); + l = l->next; + } + else + { + l = malloc(sizeof(list_t)); + root = l; + } + l->data = data; + l->next = NULL; + return root; +} + +static void list_free(list_t *root) +{ + list_t *l = root; + while (root) + { + l = root; + root = root->next; + free(l); + } +} + + + +// Look for a tag "XY" in a predefined const char *[] array. +static int tag_exists(const char *tag, const char **tags) +{ + int itag=0; + if ( !tags ) return -1; + while ( tags[itag] ) + { + if ( tags[itag][0]==tag[0] && tags[itag][1]==tag[1] ) return itag; + itag++; + } + return -1; +} + + + +// Mimics the behaviour of getline, except it returns pointer to the next chunk of the text +// or NULL if everything has been read. The lineptr should be freed by the caller. The +// newline character is stripped. 
+static const char *nextline(char **lineptr, size_t *n, const char *text) +{ + int len; + const char *to = text; + + if ( !*to ) return NULL; + + while ( *to && *to!='\n' && *to!='\r' ) to++; + len = to - text + 1; + + if ( *to ) + { + // Advance the pointer for the next call + if ( *to=='\n' ) to++; + else if ( *to=='\r' && *(to+1)=='\n' ) to+=2; + } + if ( !len ) + return to; + + if ( !*lineptr ) + { + *lineptr = malloc(len); + *n = len; + } + else if ( *n<len ) + { + *lineptr = realloc(*lineptr, len); + *n = len; + } + if ( !*lineptr ) { + debug("[nextline] Insufficient memory!\n"); + return 0; + } + + memcpy(*lineptr,text,len); + (*lineptr)[len-1] = 0; + + return to; +} + +// name points to "XY", value_from points to the first character of the value string and +// value_to points to the last character of the value string. +static HeaderTag *new_tag(const char *name, const char *value_from, const char *value_to) +{ + HeaderTag *tag = malloc(sizeof(HeaderTag)); + int len = value_to-value_from+1; + + tag->key[0] = name[0]; + tag->key[1] = name[1]; + tag->value = malloc(len+1); + memcpy(tag->value,value_from,len+1); + tag->value[len] = 0; + return tag; +} + +static HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key) +{ + list_t *tags = hline->tags; + while (tags) + { + HeaderTag *tag = tags->data; + if ( tag->key[0]==key[0] && tag->key[1]==key[1] ) return tag; + tags = tags->next; + } + return NULL; +} + + +// Return codes: +// 0 .. different types or unique tags differ or conflicting tags, cannot be merged +// 1 .. all tags identical -> no need to merge, drop one +// 2 .. the unique tags match and there are some conflicting tags (same tag, different value) -> error, cannot be merged nor duplicated +// 3 .. 
there are some missing complementary tags and no unique conflict -> can be merged into a single line +static int sam_header_compare_lines(HeaderLine *hline1, HeaderLine *hline2) +{ + HeaderTag *t1, *t2; + + if ( hline1->type[0]!=hline2->type[0] || hline1->type[1]!=hline2->type[1] ) + return 0; + + int itype = tag_exists(hline1->type,types); + if ( itype==-1 ) { + debug("[sam_header_compare_lines] Unknown type [%c%c]\n", hline1->type[0],hline1->type[1]); + return -1; // FIXME (lh3): error; I do not know how this will be handled in Petr's code + } + + if ( unique_tags[itype] ) + { + t1 = header_line_has_tag(hline1,unique_tags[itype][0]); + t2 = header_line_has_tag(hline2,unique_tags[itype][0]); + if ( !t1 || !t2 ) // this should never happen, the unique tags are required + return 2; + + if ( strcmp(t1->value,t2->value) ) + return 0; // the unique tags differ, cannot be merged + } + if ( !required_tags[itype] && !optional_tags[itype] ) + { + t1 = hline1->tags->data; + t2 = hline2->tags->data; + if ( !strcmp(t1->value,t2->value) ) return 1; // identical comments + return 0; + } + + int missing=0, itag=0; + while ( required_tags[itype] && required_tags[itype][itag] ) + { + t1 = header_line_has_tag(hline1,required_tags[itype][itag]); + t2 = header_line_has_tag(hline2,required_tags[itype][itag]); + if ( !t1 && !t2 ) + return 2; // this should never happen + else if ( !t1 || !t2 ) + missing = 1; // there is some tag missing in one of the hlines + else if ( strcmp(t1->value,t2->value) ) + { + if ( unique_tags[itype] ) + return 2; // the lines have a matching unique tag but have a conflicting tag + + return 0; // the lines contain conflicting tags, cannot be merged + } + itag++; + } + itag = 0; + while ( optional_tags[itype] && optional_tags[itype][itag] ) + { + t1 = header_line_has_tag(hline1,optional_tags[itype][itag]); + t2 = header_line_has_tag(hline2,optional_tags[itype][itag]); + if ( !t1 && !t2 ) + { + itag++; + continue; + } + if ( !t1 || !t2 ) + missing = 1; // 
there is some tag missing in one of the hlines + else if ( strcmp(t1->value,t2->value) ) + { + if ( unique_tags[itype] ) + return 2; // the lines have a matching unique tag but have a conflicting tag + + return 0; // the lines contain conflicting tags, cannot be merged + } + itag++; + } + if ( missing ) return 3; // there are some missing complementary tags with no conflicts, can be merged + return 1; +} + + +static HeaderLine *sam_header_line_clone(const HeaderLine *hline) +{ + list_t *tags; + HeaderLine *out = malloc(sizeof(HeaderLine)); + out->type[0] = hline->type[0]; + out->type[1] = hline->type[1]; + out->tags = NULL; + + tags = hline->tags; + while (tags) + { + HeaderTag *old = tags->data; + + HeaderTag *new = malloc(sizeof(HeaderTag)); + new->key[0] = old->key[0]; + new->key[1] = old->key[1]; + new->value = strdup(old->value); + out->tags = list_append(out->tags, new); + + tags = tags->next; + } + return out; +} + +static int sam_header_line_merge_with(HeaderLine *out_hline, const HeaderLine *tmpl_hline) +{ + list_t *tmpl_tags; + + if ( out_hline->type[0]!=tmpl_hline->type[0] || out_hline->type[1]!=tmpl_hline->type[1] ) + return 0; + + tmpl_tags = tmpl_hline->tags; + while (tmpl_tags) + { + HeaderTag *tmpl_tag = tmpl_tags->data; + HeaderTag *out_tag = header_line_has_tag(out_hline, tmpl_tag->key); + if ( !out_tag ) + { + HeaderTag *tag = malloc(sizeof(HeaderTag)); + tag->key[0] = tmpl_tag->key[0]; + tag->key[1] = tmpl_tag->key[1]; + tag->value = strdup(tmpl_tag->value); + out_hline->tags = list_append(out_hline->tags,tag); + } + tmpl_tags = tmpl_tags->next; + } + return 1; +} + + +static HeaderLine *sam_header_line_parse(const char *headerLine) +{ + HeaderLine *hline; + HeaderTag *tag; + const char *from, *to; + from = headerLine; + + if ( *from != '@' ) { + debug("[sam_header_line_parse] expected '@', got [%s]\n", headerLine); + return 0; + } + to = ++from; + + while (*to && *to!='\t') to++; + if ( to-from != 2 ) { + debug("[sam_header_line_parse] expected 
'@XY', got [%s]\nHint: The header tags must be tab-separated.\n", headerLine); + return 0; + } + + hline = malloc(sizeof(HeaderLine)); + hline->type[0] = from[0]; + hline->type[1] = from[1]; + hline->tags = NULL; + + int itype = tag_exists(hline->type, types); + + from = to; + while (*to && *to=='\t') to++; + if ( to-from != 1 ) { + debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from)); + free(hline); + return 0; + } + from = to; + while (*from) + { + while (*to && *to!='\t') to++; + + if ( !required_tags[itype] && !optional_tags[itype] ) + { + // CO is a special case, it can contain anything, including tabs + if ( *to ) { to++; continue; } + tag = new_tag(" ",from,to-1); + } + else + tag = new_tag(from,from+3,to-1); + + if ( header_line_has_tag(hline,tag->key) ) + debug("The tag '%c%c' present (at least) twice on line [%s]\n", tag->key[0],tag->key[1], headerLine); + hline->tags = list_append(hline->tags, tag); + + from = to; + while (*to && *to=='\t') to++; + if ( *to && to-from != 1 ) { + debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from)); + return 0; + } + + from = to; + } + return hline; +} + + +// Must be of an existing type, all tags must be recognised and all required tags must be present +static int sam_header_line_validate(HeaderLine *hline) +{ + list_t *tags; + HeaderTag *tag; + int itype, itag; + + // Is the type correct? + itype = tag_exists(hline->type, types); + if ( itype==-1 ) + { + debug("The type [%c%c] not recognised.\n", hline->type[0],hline->type[1]); + return 0; + } + + // Has all required tags? + itag = 0; + while ( required_tags[itype] && required_tags[itype][itag] ) + { + if ( !header_line_has_tag(hline,required_tags[itype][itag]) ) + { + debug("The tag [%c%c] required for [%c%c] not present.\n", required_tags[itype][itag][0],required_tags[itype][itag][1], + hline->type[0],hline->type[1]); + return 0; + } + itag++; + } + + // Are all tags recognised? 
+ tags = hline->tags; + while ( tags ) + { + tag = tags->data; + if ( !tag_exists(tag->key,required_tags[itype]) && !tag_exists(tag->key,optional_tags[itype]) ) + { + // Lower case tags are user-defined values. + if( !(islower(tag->key[0]) || islower(tag->key[1])) ) + { + // Neither is lower case, but tag was not recognized. + debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]); + // return 0; // Even unknown tags are allowed - for forward compatibility with new attributes + } + // else - allow user defined tag + } + tags = tags->next; + } + + return 1; +} + + +static void print_header_line(FILE *fp, HeaderLine *hline) +{ + list_t *tags = hline->tags; + HeaderTag *tag; + + fprintf(fp, "@%c%c", hline->type[0],hline->type[1]); + while (tags) + { + tag = tags->data; + + fprintf(fp, "\t"); + if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) + fprintf(fp, "%c%c:", tag->key[0],tag->key[1]); + fprintf(fp, "%s", tag->value); + + tags = tags->next; + } + fprintf(fp,"\n"); +} + + +static void sam_header_line_free(HeaderLine *hline) +{ + list_t *tags = hline->tags; + while (tags) + { + HeaderTag *tag = tags->data; + free(tag->value); + free(tag); + tags = tags->next; + } + list_free(hline->tags); + free(hline); +} + +void sam_header_free(void *_header) +{ + HeaderDict *header = (HeaderDict*)_header; + list_t *hlines = header; + while (hlines) + { + sam_header_line_free(hlines->data); + hlines = hlines->next; + } + list_free(header); +} + +HeaderDict *sam_header_clone(const HeaderDict *dict) +{ + HeaderDict *out = NULL; + while (dict) + { + HeaderLine *hline = dict->data; + out = list_append(out, sam_header_line_clone(hline)); + dict = dict->next; + } + return out; +} + +// Returns a newly allocated string +char *sam_header_write(const void *_header) +{ + const HeaderDict *header = (const HeaderDict*)_header; + char *out = NULL; + int len=0, nout=0; + const list_t *hlines; + + // Calculate the length of the string to allocate + hlines = 
header; + while (hlines) + { + len += 4; // @XY and \n + + HeaderLine *hline = hlines->data; + list_t *tags = hline->tags; + while (tags) + { + HeaderTag *tag = tags->data; + len += strlen(tag->value) + 1; // \t + if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) + len += strlen(tag->value) + 3; // XY: + tags = tags->next; + } + hlines = hlines->next; + } + + nout = 0; + out = malloc(len+1); + hlines = header; + while (hlines) + { + HeaderLine *hline = hlines->data; + + nout += sprintf(out+nout,"@%c%c",hline->type[0],hline->type[1]); + + list_t *tags = hline->tags; + while (tags) + { + HeaderTag *tag = tags->data; + nout += sprintf(out+nout,"\t"); + if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) + nout += sprintf(out+nout,"%c%c:", tag->key[0],tag->key[1]); + nout += sprintf(out+nout,"%s", tag->value); + tags = tags->next; + } + hlines = hlines->next; + nout += sprintf(out+nout,"\n"); + } + out[len] = 0; + return out; +} + +void *sam_header_parse2(const char *headerText) +{ + list_t *hlines = NULL; + HeaderLine *hline; + const char *text; + char *buf=NULL; + size_t nbuf = 0; + int tovalidate = 0; + + if ( !headerText ) + return 0; + + text = headerText; + while ( (text=nextline(&buf, &nbuf, text)) ) + { + hline = sam_header_line_parse(buf); + if ( hline && (!tovalidate || sam_header_line_validate(hline)) ) + // With too many (~250,000) reference sequences the header parsing was too slow with list_append. 
+ hlines = list_append_to_end(hlines, hline); + else + { + if (hline) sam_header_line_free(hline); + sam_header_free(hlines); + if ( buf ) free(buf); + return NULL; + } + } + if ( buf ) free(buf); + + return hlines; +} + +void *sam_header2tbl(const void *_dict, char type[2], char key_tag[2], char value_tag[2]) +{ + const HeaderDict *dict = (const HeaderDict*)_dict; + const list_t *l = dict; + khash_t(str) *tbl = kh_init(str); + khiter_t k; + int ret; + + if (_dict == 0) return tbl; // return an empty (not null) hash table + while (l) + { + HeaderLine *hline = l->data; + if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) + { + l = l->next; + continue; + } + + HeaderTag *key, *value; + key = header_line_has_tag(hline,key_tag); + value = header_line_has_tag(hline,value_tag); + if ( !key || !value ) + { + l = l->next; + continue; + } + + k = kh_get(str, tbl, key->value); + if ( k != kh_end(tbl) ) + debug("[sam_header_lookup_table] They key %s not unique.\n", key->value); + k = kh_put(str, tbl, key->value, &ret); + kh_value(tbl, k) = value->value; + + l = l->next; + } + return tbl; +} + +char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n) +{ + const HeaderDict *dict = (const HeaderDict*)_dict; + const list_t *l = dict; + int max, n; + char **ret; + + ret = 0; *_n = max = n = 0; + while (l) + { + HeaderLine *hline = l->data; + if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) + { + l = l->next; + continue; + } + + HeaderTag *key; + key = header_line_has_tag(hline,key_tag); + if ( !key ) + { + l = l->next; + continue; + } + + if (n == max) { + max = max? 
max<<1 : 4; + ret = realloc(ret, max * sizeof(void*)); + } + ret[n++] = key->value; + + l = l->next; + } + *_n = n; + return ret; +} + +void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **_key, const char **_value) +{ + list_t *l = iter; + if ( !l ) return NULL; + + while (l) + { + HeaderLine *hline = l->data; + if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) + { + l = l->next; + continue; + } + + HeaderTag *key, *value; + key = header_line_has_tag(hline,key_tag); + value = header_line_has_tag(hline,value_tag); + if ( !key && !value ) + { + l = l->next; + continue; + } + + *_key = key->value; + *_value = value->value; + return l->next; + } + return l; +} + +const char *sam_tbl_get(void *h, const char *key) +{ + khash_t(str) *tbl = (khash_t(str)*)h; + khint_t k; + k = kh_get(str, tbl, key); + return k == kh_end(tbl)? 0 : kh_val(tbl, k); +} + +int sam_tbl_size(void *h) +{ + khash_t(str) *tbl = (khash_t(str)*)h; + return h? 
kh_size(tbl) : 0; +} + +void sam_tbl_destroy(void *h) +{ + khash_t(str) *tbl = (khash_t(str)*)h; + kh_destroy(str, tbl); +} + +void *sam_header_merge(int n, const void **_dicts) +{ + const HeaderDict **dicts = (const HeaderDict**)_dicts; + HeaderDict *out_dict; + int idict, status; + + if ( n<2 ) return NULL; + + out_dict = sam_header_clone(dicts[0]); + + for (idict=1; idict<n; idict++) + { + const list_t *tmpl_hlines = dicts[idict]; + + while ( tmpl_hlines ) + { + list_t *out_hlines = out_dict; + int inserted = 0; + while ( out_hlines ) + { + status = sam_header_compare_lines(tmpl_hlines->data, out_hlines->data); + if ( status==0 ) + { + out_hlines = out_hlines->next; + continue; + } + + if ( status==2 ) + { + print_header_line(stderr,tmpl_hlines->data); + print_header_line(stderr,out_hlines->data); + debug("Conflicting lines, cannot merge the headers.\n"); + return 0; + } + if ( status==3 ) + sam_header_line_merge_with(out_hlines->data, tmpl_hlines->data); + + inserted = 1; + break; + } + if ( !inserted ) + out_dict = list_append(out_dict, sam_header_line_clone(tmpl_hlines->data)); + + tmpl_hlines = tmpl_hlines->next; + } + } + + return out_dict; +} + +char **sam_header2tbl_n(const void *dict, const char type[2], const char *tags[], int *n) +{ + int nout = 0; + char **out = NULL; + + *n = 0; + list_t *l = (list_t *)dict; + if ( !l ) return NULL; + + int i, ntags = 0; + while ( tags[ntags] ) ntags++; + + while (l) + { + HeaderLine *hline = l->data; + if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) + { + l = l->next; + continue; + } + out = (char**) realloc(out, sizeof(char*)*(nout+1)*ntags); + for (i=0; i<ntags; i++) + { + HeaderTag *key = header_line_has_tag(hline, tags[i]); + if ( !key ) + { + out[nout*ntags+i] = NULL; + continue; + } + out[nout*ntags+i] = key->value; + } + nout++; + l = l->next; + } + *n = nout; + return out; +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/sam_header.h Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,48 @@ +#ifndef __SAM_HEADER_H__ +#define __SAM_HEADER_H__ + +#ifdef __cplusplus +extern "C" { +#endif + + /* sam_header_parse2() parses header text into an opaque dictionary; it returns NULL for NULL or empty text and when a line cannot be parsed. sam_header_merge() returns a new dictionary, or NULL if n<2 or lines conflict. Dictionaries are released with sam_header_free(). */ + void *sam_header_parse2(const char *headerText); + void *sam_header_merge(int n, const void **dicts); + void sam_header_free(void *header); + char *sam_header_write(const void *headerDict); // returns a newly allocated string + + /* + // Usage example + // Note: the returned iterator is the node following the match, so it is + // NULL (and the loop below ends) when the match is the last header line. + const char *key, *val; + void *iter = sam_header_parse2(bam->header->text); + while ( iter = sam_header2key_val(iter, "RG","ID","SM", &key,&val) ) printf("%s\t%s\n", key,val); + */ + void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **key, const char **value); + char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n); + + /* + // Usage example + // The strings in tbl point into dict; only the table itself is freed. + int i, j, n; + const char *tags[] = {"SN","LN","UR","M5",NULL}; + void *dict = sam_header_parse2(bam->header->text); + char **tbl = sam_header2tbl_n(dict, "SQ", tags, &n); + for (i=0; i<n; i++) + { + for (j=0; j<4; j++) + if ( tbl[4*i+j] ) printf("\t%s", tbl[4*i+j]); + else printf("-"); + printf("\n"); + } + if (tbl) free(tbl); + */ + char **sam_header2tbl_n(const void *dict, const char type[2], const char *tags[], int *n); + + /* sam_header2tbl() builds a hash table from the value of key_tag to the value of value_tag over all lines of the given type; the table holds pointers into dict. */ + void *sam_header2tbl(const void *dict, char type[2], char key_tag[2], char value_tag[2]); + const char *sam_tbl_get(void *h, const char *key); + int sam_tbl_size(void *h); + void sam_tbl_destroy(void *h); + +#ifdef __cplusplus +} +#endif + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/sam_view.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,441 @@ +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <unistd.h> +#include <math.h> +#include <inttypes.h> +#include "sam_header.h" +#include "sam.h" +#include "faidx.h" +#include "kstring.h" +#include "khash.h" +KHASH_SET_INIT_STR(rg) + +// When counting records instead of printing them, +// data passed to the bam_fetch callback is encapsulated in this struct. +typedef struct { + bam_header_t *header; + int64_t *count; // int does overflow for very big BAMs +} count_func_data_t; + +typedef khash_t(rg) *rghash_t; + +// FIXME: we'd better use no global variables... +static rghash_t g_rghash = 0; +static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0, g_qual_scale = 0, g_min_qlen = 0; +static uint32_t g_subsam_seed = 0; +static double g_subsam_frac = -1.; +static char *g_library, *g_rg; +static void *g_bed; + +void *bed_read(const char *fn); +void bed_destroy(void *_h); +int bed_overlap(const void *_h, const char *chr, int beg, int end); + +static int process_aln(const bam_header_t *h, bam1_t *b) +{ + if (g_qual_scale > 1) { + int i; + uint8_t *qual = bam1_qual(b); + for (i = 0; i < b->core.l_qseq; ++i) { + int c = qual[i] * g_qual_scale; + qual[i] = c < 93? c : 93; + } + } + if (g_min_qlen > 0) { + int k, qlen = 0; + uint32_t *cigar = bam1_cigar(b); + for (k = 0; k < b->core.n_cigar; ++k) + if ((bam_cigar_type(bam_cigar_op(cigar[k]))&1) || bam_cigar_op(cigar[k]) == BAM_CHARD_CLIP) + qlen += bam_cigar_oplen(cigar[k]); + if (qlen < g_min_qlen) return 1; + } + if (b->core.qual < g_min_mapQ || ((b->core.flag & g_flag_on) != g_flag_on) || (b->core.flag & g_flag_off)) + return 1; + if (g_bed && b->core.tid >= 0 && !bed_overlap(g_bed, h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b)))) + return 1; + if (g_subsam_frac > 0.) 
{ + uint32_t k = __ac_X31_hash_string(bam1_qname(b)) + g_subsam_seed; + if ((double)(k&0xffffff) / 0x1000000 >= g_subsam_frac) return 1; + } + if (g_rg || g_rghash) { + uint8_t *s = bam_aux_get(b, "RG"); + if (s) { + if (g_rg) return (strcmp(g_rg, (char*)(s + 1)) == 0)? 0 : 1; + if (g_rghash) { + khint_t k = kh_get(rg, g_rghash, (char*)(s + 1)); + return (k != kh_end(g_rghash))? 0 : 1; + } + } + } + if (g_library) { + const char *p = bam_get_library((bam_header_t*)h, b); + return (p && strcmp(p, g_library) == 0)? 0 : 1; + } + return 0; +} + +static char *drop_rg(char *hdtxt, rghash_t h, int *len) +{ + char *p = hdtxt, *q, *r, *s; + kstring_t str; + memset(&str, 0, sizeof(kstring_t)); + while (1) { + int toprint = 0; + q = strchr(p, '\n'); + if (q == 0) q = p + strlen(p); + if (q - p < 3) break; // the line is too short; then stop + if (strncmp(p, "@RG\t", 4) == 0) { + int c; + khint_t k; + if ((r = strstr(p, "\tID:")) != 0) { + r += 4; + for (s = r; *s != '\0' && *s != '\n' && *s != '\t'; ++s); + c = *s; *s = '\0'; + k = kh_get(rg, h, r); + *s = c; + if (k != kh_end(h)) toprint = 1; + } + } else toprint = 1; + if (toprint) { + kputsn(p, q - p, &str); kputc('\n', &str); + } + p = q + 1; + } + *len = str.l; + return str.s; +} + +// callback function for bam_fetch() that prints nonskipped records +static int view_func(const bam1_t *b, void *data) +{ + if (!process_aln(((samfile_t*)data)->header, (bam1_t*)b)) + samwrite((samfile_t*)data, b); + return 0; +} + +// callback function for bam_fetch() that counts nonskipped records +static int count_func(const bam1_t *b, void *data) +{ + if (!process_aln(((count_func_data_t*)data)->header, (bam1_t*)b)) { + (*((count_func_data_t*)data)->count)++; + } + return 0; +} + +static int usage(int is_long_help); + +int main_samview(int argc, char *argv[]) +{ + int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, compress_level = -1, is_bamout = 0, is_count = 0; + int of_type = BAM_OFDEC, is_long_help = 0, n_threads = 0; + 
int64_t count = 0; + samfile_t *in = 0, *out = 0; + char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0, *fn_rg = 0, *q; + + /* parse command-line options */ + strcpy(in_mode, "r"); strcpy(out_mode, "w"); + while ((c = getopt(argc, argv, "SbBct:h1Ho:q:f:F:ul:r:xX?T:R:L:s:Q:@:m:")) >= 0) { + switch (c) { + case 's': + if ((g_subsam_seed = strtol(optarg, &q, 10)) != 0) { + srand(g_subsam_seed); + g_subsam_seed = rand(); + } + g_subsam_frac = strtod(q, &q); + break; + case 'm': g_min_qlen = atoi(optarg); break; + case 'c': is_count = 1; break; + case 'S': is_bamin = 0; break; + case 'b': is_bamout = 1; break; + case 't': fn_list = strdup(optarg); is_bamin = 0; break; + case 'h': is_header = 1; break; + case 'H': is_header_only = 1; break; + case 'o': fn_out = strdup(optarg); break; + case 'f': g_flag_on = strtol(optarg, 0, 0); break; + case 'F': g_flag_off = strtol(optarg, 0, 0); break; + case 'q': g_min_mapQ = atoi(optarg); break; + case 'u': compress_level = 0; break; + case '1': compress_level = 1; break; + case 'l': g_library = strdup(optarg); break; + case 'L': g_bed = bed_read(optarg); break; + case 'r': g_rg = strdup(optarg); break; + case 'R': fn_rg = strdup(optarg); break; + case 'x': of_type = BAM_OFHEX; break; + case 'X': of_type = BAM_OFSTR; break; + case '?': is_long_help = 1; break; + case 'T': fn_ref = strdup(optarg); is_bamin = 0; break; + case 'B': bam_no_B = 1; break; + case 'Q': g_qual_scale = atoi(optarg); break; + case '@': n_threads = strtol(optarg, 0, 0); break; + default: return usage(is_long_help); + } + } + if (compress_level >= 0) is_bamout = 1; + if (is_header_only) is_header = 1; + if (is_bamout) strcat(out_mode, "b"); + else { + if (of_type == BAM_OFHEX) strcat(out_mode, "x"); + else if (of_type == BAM_OFSTR) strcat(out_mode, "X"); + } + if (is_bamin) strcat(in_mode, "b"); + if (is_header) strcat(out_mode, "h"); + if (compress_level >= 0) { + char tmp[2]; + tmp[0] = compress_level + '0'; tmp[1] = '\0'; + strcat(out_mode, 
tmp); + } + if (argc == optind) return usage(is_long_help); // potential memory leak... + + // read the list of read groups + if (fn_rg) { + FILE *fp_rg; + char buf[1024]; + int ret; + g_rghash = kh_init(rg); + fp_rg = fopen(fn_rg, "r"); + while (!feof(fp_rg) && fscanf(fp_rg, "%s", buf) > 0) // this is not a good style, but bear me... + kh_put(rg, g_rghash, strdup(buf), &ret); // we'd better check duplicates... + fclose(fp_rg); + } + + // generate the fn_list if necessary + if (fn_list == 0 && fn_ref) fn_list = samfaipath(fn_ref); + // open file handlers + if ((in = samopen(argv[optind], in_mode, fn_list)) == 0) { + fprintf(stderr, "[main_samview] fail to open \"%s\" for reading.\n", argv[optind]); + ret = 1; + goto view_end; + } + if (in->header == 0) { + fprintf(stderr, "[main_samview] fail to read the header from \"%s\".\n", argv[optind]); + ret = 1; + goto view_end; + } + if (g_rghash) { // FIXME: I do not know what "bam_header_t::n_text" is for... + char *tmp; + int l; + tmp = drop_rg(in->header->text, g_rghash, &l); + free(in->header->text); + in->header->text = tmp; + in->header->l_text = l; + } + if (!is_count && (out = samopen(fn_out? fn_out : "-", out_mode, in->header)) == 0) { + fprintf(stderr, "[main_samview] fail to open \"%s\" for writing.\n", fn_out? 
fn_out : "standard output"); + ret = 1; + goto view_end; + } + if (n_threads > 1) samthreads(out, n_threads, 256); + if (is_header_only) goto view_end; // no need to print alignments + + if (argc == optind + 1) { // convert/print the entire file + bam1_t *b = bam_init1(); + int r; + while ((r = samread(in, b)) >= 0) { // read one alignment from `in' + if (!process_aln(in->header, b)) { + if (!is_count) samwrite(out, b); // write the alignment to `out' + count++; + } + } + if (r < -1) { + fprintf(stderr, "[main_samview] truncated file.\n"); + ret = 1; + } + bam_destroy1(b); + } else { // retrieve alignments in specified regions + int i; + bam_index_t *idx = 0; + if (is_bamin) idx = bam_index_load(argv[optind]); // load BAM index + if (idx == 0) { // index is unavailable + fprintf(stderr, "[main_samview] random alignment retrieval only works for indexed BAM files.\n"); + ret = 1; + goto view_end; + } + for (i = optind + 1; i < argc; ++i) { + int tid, beg, end, result; + bam_parse_region(in->header, argv[i], &tid, &beg, &end); // parse a region in the format like `chr2:100-200' + if (tid < 0) { // reference name is not found + fprintf(stderr, "[main_samview] region \"%s\" specifies an unknown reference name. 
Continue anyway.\n", argv[i]); + continue; + } + // fetch alignments + if (is_count) { + count_func_data_t count_data = { in->header, &count }; + result = bam_fetch(in->x.bam, idx, tid, beg, end, &count_data, count_func); + } else + result = bam_fetch(in->x.bam, idx, tid, beg, end, out, view_func); + if (result < 0) { + fprintf(stderr, "[main_samview] retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file\n", argv[i]); + ret = 1; + break; + } + } + bam_index_destroy(idx); // destroy the BAM index + } + +view_end: + if (is_count && ret == 0) + printf("%" PRId64 "\n", count); + + // close files, free and return + free(fn_list); free(fn_ref); free(fn_out); free(g_library); free(g_rg); free(fn_rg); + if (g_bed) bed_destroy(g_bed); + if (g_rghash) { + khint_t k; + for (k = 0; k < kh_end(g_rghash); ++k) + if (kh_exist(g_rghash, k)) free((char*)kh_key(g_rghash, k)); + kh_destroy(rg, g_rghash); + } + samclose(in); + if (!is_count) + samclose(out); + return ret; +} + +static int usage(int is_long_help) +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools view [options] <in.bam>|<in.sam> [region1 [...]]\n\n"); + fprintf(stderr, "Options: -b output BAM\n"); + fprintf(stderr, " -h print header for the SAM output\n"); + fprintf(stderr, " -H print header only (no alignments)\n"); + fprintf(stderr, " -S input is SAM\n"); + fprintf(stderr, " -u uncompressed BAM output (force -b)\n"); + fprintf(stderr, " -1 fast compression (force -b)\n"); + fprintf(stderr, " -x output FLAG in HEX (samtools-C specific)\n"); + fprintf(stderr, " -X output FLAG in string (samtools-C specific)\n"); + fprintf(stderr, " -c print only the count of matching records\n"); + fprintf(stderr, " -B collapse the backward CIGAR operation\n"); + fprintf(stderr, " -@ INT number of BAM compression threads [0]\n"); + fprintf(stderr, " -L FILE output alignments overlapping the input BED FILE [null]\n"); + fprintf(stderr, " -t FILE list of reference names and lengths (force -S) 
[null]\n"); + fprintf(stderr, " -T FILE reference sequence file (force -S) [null]\n"); + fprintf(stderr, " -o FILE output file name [stdout]\n"); + fprintf(stderr, " -R FILE list of read groups to be outputted [null]\n"); + fprintf(stderr, " -f INT required flag, 0 for unset [0]\n"); + fprintf(stderr, " -F INT filtering flag, 0 for unset [0]\n"); + fprintf(stderr, " -q INT minimum mapping quality [0]\n"); + fprintf(stderr, " -l STR only output reads in library STR [null]\n"); + fprintf(stderr, " -r STR only output reads in read group STR [null]\n"); + fprintf(stderr, " -s FLOAT fraction of templates to subsample; integer part as seed [-1]\n"); + fprintf(stderr, " -? longer help\n"); + fprintf(stderr, "\n"); + if (is_long_help) + fprintf(stderr, "Notes:\n\ +\n\ + 1. By default, this command assumes the file on the command line is in\n\ + the BAM format and it prints the alignments in SAM. If `-t' is\n\ + applied, the input file is assumed to be in the SAM format. The\n\ + file supplied with `-t' is SPACE/TAB delimited with the first two\n\ + fields of each line consisting of the reference name and the\n\ + corresponding sequence length. The `.fai' file generated by `faidx'\n\ + can be used here. This file may be empty if reads are unaligned.\n\ +\n\ + 2. SAM->BAM conversion: `samtools view -bT ref.fa in.sam.gz'.\n\ +\n\ + 3. BAM->SAM conversion: `samtools view in.bam'.\n\ +\n\ + 4. A region should be presented in one of the following formats:\n\ + `chr1', `chr2:1,000' and `chr3:1000-2,000'. When a region is\n\ + specified, the input alignment file must be an indexed BAM file.\n\ +\n\ + 5. Option `-u' is preferred over `-b' when the output is piped to\n\ + another samtools command.\n\ +\n\ + 6. 
In a string FLAG, each character represents one bit with\n\ + p=0x1 (paired), P=0x2 (properly paired), u=0x4 (unmapped),\n\ + U=0x8 (mate unmapped), r=0x10 (reverse), R=0x20 (mate reverse)\n\ + 1=0x40 (first), 2=0x80 (second), s=0x100 (not primary), \n\ + f=0x200 (failure) and d=0x400 (duplicate). Note that `-x' and\n\ + `-X' are samtools-C specific. Picard and older samtools do not\n\ + support HEX or string flags.\n\ +\n"); + return 1; +} + +int main_import(int argc, char *argv[]) +{ + int argc2, ret; + char **argv2; + if (argc != 4) { + fprintf(stderr, "Usage: bamtk import <in.ref_list> <in.sam> <out.bam>\n"); + return 1; + } + argc2 = 6; + argv2 = calloc(6, sizeof(char*)); + argv2[0] = "import", argv2[1] = "-o", argv2[2] = argv[3], argv2[3] = "-bt", argv2[4] = argv[1], argv2[5] = argv[2]; + ret = main_samview(argc2, argv2); + free(argv2); + return ret; +} + +int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 9, 14, 1, 6, 5, 13, 3, 11, 7, 15 }; + +int main_bam2fq(int argc, char *argv[]) +{ + bamFile fp; + bam_header_t *h; + bam1_t *b; + int8_t *buf; + int max_buf, c, no12 = 0; + while ((c = getopt(argc, argv, "n")) > 0) + if (c == 'n') no12 = 1; + if (argc == 1) { + fprintf(stderr, "Usage: samtools bam2fq <in.bam>\n"); + return 1; + } + fp = strcmp(argv[optind], "-")? 
bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r"); + if (fp == 0) return 1; + h = bam_header_read(fp); + b = bam_init1(); + buf = 0; + max_buf = 0; + while (bam_read1(fp, b) >= 0) { + int i, qlen = b->core.l_qseq; + uint8_t *seq; + putchar('@'); fputs(bam1_qname(b), stdout); + if (no12) putchar('\n'); + else { + if ((b->core.flag & 0x40) && !(b->core.flag & 0x80)) puts("/1"); + else if ((b->core.flag & 0x80) && !(b->core.flag & 0x40)) puts("/2"); + else putchar('\n'); + } + if (max_buf < qlen + 1) { + max_buf = qlen + 1; + kroundup32(max_buf); + buf = realloc(buf, max_buf); + } + buf[qlen] = 0; + seq = bam1_seq(b); + for (i = 0; i < qlen; ++i) + buf[i] = bam1_seqi(seq, i); + if (b->core.flag & 16) { // reverse complement + for (i = 0; i < qlen>>1; ++i) { + int8_t t = seq_comp_table[buf[qlen - 1 - i]]; + buf[qlen - 1 - i] = seq_comp_table[buf[i]]; + buf[i] = t; + } + if (qlen&1) buf[i] = seq_comp_table[buf[i]]; + } + for (i = 0; i < qlen; ++i) + buf[i] = bam_nt16_rev_table[buf[i]]; + puts((char*)buf); + puts("+"); + seq = bam1_qual(b); + for (i = 0; i < qlen; ++i) + buf[i] = 33 + seq[i]; + if (b->core.flag & 16) { // reverse + for (i = 0; i < qlen>>1; ++i) { + int8_t t = buf[qlen - 1 - i]; + buf[qlen - 1 - i] = buf[i]; + buf[i] = t; + } + } + puts((char*)buf); + } + free(buf); + bam_destroy1(b); + bam_header_destroy(h); + bam_close(fp); + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/sample.c Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,107 @@ +#include <stdlib.h> +#include <string.h> +#include "sample.h" +#include "khash.h" +KHASH_MAP_INIT_STR(sm, int) + +bam_sample_t *bam_smpl_init(void) +{ + bam_sample_t *s; + s = calloc(1, sizeof(bam_sample_t)); + s->rg2smid = kh_init(sm); + s->sm2id = kh_init(sm); + return s; +} + +void bam_smpl_destroy(bam_sample_t *sm) +{ + int i; + khint_t k; + khash_t(sm) *rg2smid = (khash_t(sm)*)sm->rg2smid; + if (sm == 0) return; + for (i = 0; i < sm->n; ++i) free(sm->smpl[i]); + free(sm->smpl); + for (k = kh_begin(rg2smid); k != kh_end(rg2smid); ++k) + if (kh_exist(rg2smid, k)) free((char*)kh_key(rg2smid, k)); + kh_destroy(sm, sm->rg2smid); + kh_destroy(sm, sm->sm2id); + free(sm); +} + +static void add_pair(bam_sample_t *sm, khash_t(sm) *sm2id, const char *key, const char *val) +{ + khint_t k_rg, k_sm; + int ret; + khash_t(sm) *rg2smid = (khash_t(sm)*)sm->rg2smid; + k_rg = kh_get(sm, rg2smid, key); + if (k_rg != kh_end(rg2smid)) return; // duplicated @RG-ID + k_rg = kh_put(sm, rg2smid, strdup(key), &ret); + k_sm = kh_get(sm, sm2id, val); + if (k_sm == kh_end(sm2id)) { // absent + if (sm->n == sm->m) { + sm->m = sm->m? 
sm->m<<1 : 1; + sm->smpl = realloc(sm->smpl, sizeof(void*) * sm->m); + } + sm->smpl[sm->n] = strdup(val); + k_sm = kh_put(sm, sm2id, sm->smpl[sm->n], &ret); + kh_val(sm2id, k_sm) = sm->n++; + } + kh_val(rg2smid, k_rg) = kh_val(sm2id, k_sm); +} + +int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt) +{ + const char *p = txt, *q, *r; + kstring_t buf, first_sm; + int n = 0; + khash_t(sm) *sm2id = (khash_t(sm)*)sm->sm2id; + if (txt == 0) { + add_pair(sm, sm2id, fn, fn); + return 0; + } + memset(&buf, 0, sizeof(kstring_t)); + memset(&first_sm, 0, sizeof(kstring_t)); + while ((q = strstr(p, "@RG")) != 0) { + p = q + 3; + r = q = 0; + if ((q = strstr(p, "\tID:")) != 0) q += 4; + if ((r = strstr(p, "\tSM:")) != 0) r += 4; + if (r && q) { + char *u, *v; + int oq, or; + for (u = (char*)q; *u && *u != '\t' && *u != '\n'; ++u); + for (v = (char*)r; *v && *v != '\t' && *v != '\n'; ++v); + oq = *u; or = *v; *u = *v = '\0'; + buf.l = 0; kputs(fn, &buf); kputc('/', &buf); kputs(q, &buf); + add_pair(sm, sm2id, buf.s, r); + if ( !first_sm.s ) + kputs(r,&first_sm); + *u = oq; *v = or; + } else break; + p = q > r? q : r; + ++n; + } + if (n == 0) add_pair(sm, sm2id, fn, fn); + // If there is only one RG tag present in the header and reads are not annotated, don't refuse to work but + // use the tag instead. + else if ( n==1 && first_sm.s ) + add_pair(sm,sm2id,fn,first_sm.s); + if ( first_sm.s ) + free(first_sm.s); + +// add_pair(sm, sm2id, fn, fn); + free(buf.s); + return 0; +} + +int bam_smpl_rg2smid(const bam_sample_t *sm, const char *fn, const char *rg, kstring_t *str) +{ + khint_t k; + khash_t(sm) *rg2smid = (khash_t(sm)*)sm->rg2smid; + if (rg) { + str->l = 0; + kputs(fn, str); kputc('/', str); kputs(rg, str); + k = kh_get(sm, rg2smid, str->s); + } else k = kh_get(sm, rg2smid, fn); + return k == kh_end(rg2smid)? -1 : kh_val(rg2smid, k); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/sample.h Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,17 @@ +#ifndef BAM_SAMPLE_H +#define BAM_SAMPLE_H + +#include "kstring.h" + +typedef struct { + int n, m; + char **smpl; + void *rg2smid, *sm2id; +} bam_sample_t; + +bam_sample_t *bam_smpl_init(void); +int bam_smpl_add(bam_sample_t *sm, const char *abs, const char *txt); +int bam_smpl_rg2smid(const bam_sample_t *sm, const char *fn, const char *rg, kstring_t *str); +void bam_smpl_destroy(bam_sample_t *sm); + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/samtools.1 Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,1066 @@ +.TH samtools 1 "15 March 2013" "samtools-0.1.19" "Bioinformatics tools" +.SH NAME +.PP +samtools - Utilities for the Sequence Alignment/Map (SAM) format + +bcftools - Utilities for the Binary Call Format (BCF) and VCF +.SH SYNOPSIS +.PP +samtools view -bt ref_list.txt -o aln.bam aln.sam.gz +.PP +samtools sort aln.bam aln.sorted +.PP +samtools index aln.sorted.bam +.PP +samtools idxstats aln.sorted.bam +.PP +samtools view aln.sorted.bam chr2:20,100,000-20,200,000 +.PP +samtools merge out.bam in1.bam in2.bam in3.bam +.PP +samtools faidx ref.fasta +.PP +samtools pileup -vcf ref.fasta aln.sorted.bam +.PP +samtools mpileup -C50 -gf ref.fasta -r chr3:1,000-2,000 in1.bam in2.bam +.PP +samtools tview aln.sorted.bam ref.fasta +.PP +bcftools index in.bcf +.PP +bcftools view in.bcf chr2:100-200 > out.vcf +.PP +bcftools view -Nvm0.99 in.bcf > out.vcf 2> out.afs + +.SH DESCRIPTION +.PP +Samtools is a set of utilities that manipulate alignments in the BAM +format. It imports from and exports to the SAM (Sequence Alignment/Map) +format, does sorting, merging and indexing, and allows to retrieve reads +in any regions swiftly. + +Samtools is designed to work on a stream. It regards an input file `-' +as the standard input (stdin) and an output file `-' as the standard +output (stdout). Several commands can thus be combined with Unix +pipes. Samtools always output warning and error messages to the standard +error output (stderr). + +Samtools is also able to open a BAM (not SAM) file on a remote FTP or +HTTP server if the BAM file name starts with `ftp://' or `http://'. +Samtools checks the current working directory for the index file and +will download the index upon absence. Samtools does not retrieve the +entire alignment file unless it is asked to do so. 
+ +.SH SAMTOOLS COMMANDS AND OPTIONS + +.TP 10 +.B view +samtools view [-bchuHS] [-t in.refList] [-o output] [-f reqFlag] [-F +skipFlag] [-q minMapQ] [-l library] [-r readGroup] [-R rgFile] <in.bam>|<in.sam> [region1 [...]] + +Extract/print all or sub alignments in SAM or BAM format. If no region +is specified, all the alignments will be printed; otherwise only +alignments overlapping the specified regions will be output. An +alignment may be given multiple times if it is overlapping several +regions. A region can be presented, for example, in the following +format: `chr2' (the whole chr2), `chr2:1000000' (region starting from +1,000,000bp) or `chr2:1,000,000-2,000,000' (region between 1,000,000 and +2,000,000bp including the end points). The coordinate is 1-based. + +.B OPTIONS: +.RS +.TP 10 +.B -b +Output in the BAM format. +.TP +.BI -f \ INT +Only output alignments with all bits in INT present in the FLAG +field. INT can be in hex in the format of /^0x[0-9A-F]+/ [0] +.TP +.BI -F \ INT +Skip alignments with bits present in INT [0] +.TP +.B -h +Include the header in the output. +.TP +.B -H +Output the header only. +.TP +.BI -l \ STR +Only output reads in library STR [null] +.TP +.BI -o \ FILE +Output file [stdout] +.TP +.BI -q \ INT +Skip alignments with MAPQ smaller than INT [0] +.TP +.BI -r \ STR +Only output reads in read group STR [null] +.TP +.BI -R \ FILE +Output reads in read groups listed in +.I FILE +[null] +.TP +.BI -s \ FLOAT +Fraction of templates/pairs to subsample; the integer part is treated as the +seed for the random number generator [-1] +.TP +.B -S +Input is in SAM. If @SQ header lines are absent, the +.B `-t' +option is required. +.TP +.B -c +Instead of printing the alignments, only count them and print the +total number. All filter options, such as +.B `-f', +.B `-F' +and +.B `-q' +, are taken into account. +.TP +.BI -t \ FILE +This file is TAB-delimited. 
Each line must contain the reference name +and the length of the reference, one line for each distinct reference; +additional fields are ignored. This file also defines the order of the +reference sequences in sorting. If you run `samtools faidx <ref.fa>', +the resultant index file +.I <ref.fa>.fai +can be used as this +.I <in.ref_list> +file. +.TP +.B -u +Output uncompressed BAM. This option saves time spent on +compression/decompression and is thus preferred when the output is piped +to another samtools command. +.RE + +.TP +.B tview +samtools tview +.RB [ \-p +.IR chr:pos ] +.RB [ \-s +.IR STR ] +.RB [ \-d +.IR display ] +.RI <in.sorted.bam> +.RI [ref.fasta] + +Text alignment viewer (based on the ncurses library). In the viewer, +press `?' for help and press `g' to check the alignment start from a +region in the format like `chr10:10,000,000' or `=10,000,000' when +viewing the same reference sequence. + +.B Options: +.RS +.TP 14 +.BI -d \ display +Output as (H)tml or (C)urses or (T)ext +.TP +.BI -p \ chr:pos +Go directly to this position +.TP +.BI -s \ STR +Display only reads from this sample or read group +.RE + +.TP +.B mpileup +samtools mpileup +.RB [ \-EBugp ] +.RB [ \-C +.IR capQcoef ] +.RB [ \-r +.IR reg ] +.RB [ \-f +.IR in.fa ] +.RB [ \-l +.IR list ] +.RB [ \-M +.IR capMapQ ] +.RB [ \-Q +.IR minBaseQ ] +.RB [ \-q +.IR minMapQ ] +.I in.bam +.RI [ in2.bam +.RI [ ... ]] + +Generate BCF or pileup for one or multiple BAM files. Alignment records +are grouped by sample identifiers in @RG header lines. If sample +identifiers are absent, each input file is regarded as one sample. + +In the pileup format (without +.BR -u or -g ), +each +line represents a genomic position, consisting of chromosome name, +coordinate, reference base, read bases, read qualities and alignment +mapping qualities. Information on match, mismatch, indel, strand, +mapping quality and start and end of a read are all encoded at the read +base column. 
At this column, a dot stands for a match to the reference +base on the forward strand, a comma for a match on the reverse strand, +a '>' or '<' for a reference skip, `ACGTN' for a mismatch on the forward +strand and `acgtn' for a mismatch on the reverse strand. A pattern +`\\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between this +reference position and the next reference position. The length of the +insertion is given by the integer in the pattern, followed by the +inserted sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+' +represents a deletion from the reference. The deleted bases will be +presented as `*' in the following lines. Also at the read base column, a +symbol `^' marks the start of a read. The ASCII of the character +following `^' minus 33 gives the mapping quality. A symbol `$' marks the +end of a read segment. + +.B Input Options: +.RS +.TP 10 +.B -6 +Assume the quality is in the Illumina 1.3+ encoding. +.TP +.B -A +Do not skip anomalous read pairs in variant calling. +.TP +.B -B +Disable probabilistic realignment for the computation of base alignment +quality (BAQ). BAQ is the Phred-scaled probability of a read base being +misaligned. Applying this option greatly helps to reduce false SNPs +caused by misalignments. +.TP +.BI -b \ FILE +List of input BAM files, one file per line [null] +.TP +.BI -C \ INT +Coefficient for downgrading mapping quality for reads containing +excessive mismatches. Given a read with a phred-scaled probability q of +being generated from the mapped position, the new mapping quality is +about sqrt((INT-q)/INT)*INT. A zero value disables this +functionality; if enabled, the recommended value for BWA is 50. [0] +.TP +.BI -d \ INT +At a position, read maximally +.I INT +reads per input BAM. [250] +.TP +.B -E +Extended BAQ computation. This option helps sensitivity especially for MNPs, but may hurt +specificity a little bit. +.TP +.BI -f \ FILE +The +.BR faidx -indexed +reference file in the FASTA format. 
The file can be optionally compressed by +.BR razip . +[null] +.TP +.BI -l \ FILE +BED or position list file containing a list of regions or sites where pileup or BCF should be generated [null] +.TP +.BI -q \ INT +Minimum mapping quality for an alignment to be used [0] +.TP +.BI -Q \ INT +Minimum base quality for a base to be considered [13] +.TP +.BI -r \ STR +Only generate pileup in region +.I STR +[all sites] +.TP +.B Output Options: + +.TP +.B -D +Output per-sample read depth +.TP +.B -g +Compute genotype likelihoods and output them in the binary call format (BCF). +.TP +.B -S +Output per-sample Phred-scaled strand bias P-value +.TP +.B -u +Similar to +.B -g +except that the output is uncompressed BCF, which is preferred for piping. + +.TP +.B Options for Genotype Likelihood Computation (for -g or -u): + +.TP +.BI -e \ INT +Phred-scaled gap extension sequencing error probability. Reducing +.I INT +leads to longer indels. [20] +.TP +.BI -h \ INT +Coefficient for modeling homopolymer errors. Given an +.IR l -long +homopolymer +run, the sequencing error of an indel of size +.I s +is modeled as +.IR INT * s / l . +[100] +.TP +.B -I +Do not perform INDEL calling +.TP +.BI -L \ INT +Skip INDEL calling if the average per-sample depth is above +.IR INT . +[250] +.TP +.BI -o \ INT +Phred-scaled gap open sequencing error probability. Reducing +.I INT +leads to more indel calls. [40] +.TP +.BI -p +Apply -m and -F thresholds per sample to increase sensitivity of calling. +By default both options are applied to reads pooled from all samples. +.TP +.BI -P \ STR +Comma delimited list of platforms (determined by +.BR @RG-PL ) +from which indel candidates are obtained. It is recommended to collect +indel candidates from sequencing technologies that have low indel error +rate such as ILLUMINA. [all] +.RE + +.TP +.B reheader +samtools reheader <in.header.sam> <in.bam> + +Replace the header in +.I in.bam +with the header in +.I in.header.sam. 
+This command is much faster than replacing the header with a +BAM->SAM->BAM conversion. + +.TP +.B cat +samtools cat [-h header.sam] [-o out.bam] <in1.bam> <in2.bam> [ ... ] + +Concatenate BAMs. The sequence dictionary of each input BAM must be identical, +although this command does not check this. This command uses a similar trick +to +.B reheader +which enables fast BAM concatenation. + +.TP +.B sort +samtools sort [-nof] [-m maxMem] <in.bam> <out.prefix> + +Sort alignments by leftmost coordinates. File +.I <out.prefix>.bam +will be created. This command may also create temporary files +.I <out.prefix>.%d.bam +when the whole alignment cannot be fitted into memory (controlled by +option -m). + +.B OPTIONS: +.RS +.TP 8 +.B -o +Output the final alignment to the standard output. +.TP +.B -n +Sort by read names rather than by chromosomal coordinates +.TP +.B -f +Use +.I <out.prefix> +as the full output path and do not append +.I .bam +suffix. +.TP +.BI -m \ INT +Approximately the maximum required memory. [500000000] +.RE + +.TP +.B merge +samtools merge [-nur1f] [-h inh.sam] [-R reg] <out.bam> <in1.bam> <in2.bam> [...] + +Merge multiple sorted alignments. +The header reference lists of all the input BAM files, and the @SQ headers of +.IR inh.sam , +if any, must all refer to the same set of reference sequences. +The header reference list and (unless overridden by +.BR -h ) +`@' headers of +.I in1.bam +will be copied to +.IR out.bam , +and the headers of other files will be ignored. + +.B OPTIONS: +.RS +.TP 8 +.B -1 +Use zlib compression level 1 to compress the output +.TP +.B -f +Force to overwrite the output file if present. +.TP 8 +.BI -h \ FILE +Use the lines of +.I FILE +as `@' headers to be copied to +.IR out.bam , +replacing any header lines that would otherwise be copied from +.IR in1.bam . +.RI ( FILE +is actually in SAM format, though any alignment records it may contain +are ignored.) 
+.TP +.B -n +The input alignments are sorted by read names rather than by chromosomal +coordinates +.TP +.BI -R \ STR +Merge files in the specified region indicated by +.I STR +[null] +.TP +.B -r +Attach an RG tag to each alignment. The tag value is inferred from file names. +.TP +.B -u +Uncompressed BAM output +.RE + +.TP +.B index +samtools index <aln.bam> + +Index sorted alignment for fast random access. Index file +.I <aln.bam>.bai +will be created. + +.TP +.B idxstats +samtools idxstats <aln.bam> + +Retrieve and print stats in the index file. The output is TAB delimited +with each line consisting of reference sequence name, sequence length, # +mapped reads and # unmapped reads. + +.TP +.B faidx +samtools faidx <ref.fasta> [region1 [...]] + +Index reference sequence in the FASTA format or extract subsequence from +indexed reference sequence. If no region is specified, +.B faidx +will index the file and create +.I <ref.fasta>.fai +on the disk. If regions are specified, the subsequences will be +retrieved and printed to stdout in the FASTA format. The input file can +be compressed in the +.B RAZF +format. + +.TP +.B fixmate +samtools fixmate <in.nameSrt.bam> <out.bam> + +Fill in mate coordinates, ISIZE and mate related flags from a +name-sorted alignment. + +.TP +.B rmdup +samtools rmdup [-sS] <input.srt.bam> <out.bam> + +Remove potential PCR duplicates: if multiple read pairs have identical +external coordinates, only retain the pair with highest mapping quality. +In the paired-end mode, this command +.B ONLY +works with FR orientation and requires ISIZE is correctly set. It does +not work for unpaired reads (e.g. two ends mapped to different +chromosomes or orphan reads). + +.B OPTIONS: +.RS +.TP 8 +.B -s +Remove duplicate for single-end reads. By default, the command works for +paired-end reads only. +.TP 8 +.B -S +Treat paired-end reads and single-end reads. 
+.RE + +.TP +.B calmd +samtools calmd [-EeubSr] [-C capQcoef] <aln.bam> <ref.fasta> + +Generate the MD tag. If the MD tag is already present, this command will +give a warning if the MD tag generated is different from the existing +tag. Output SAM by default. + +.B OPTIONS: +.RS +.TP 8 +.B -A +When used jointly with +.B -r +this option overwrites the original base quality. +.TP 8 +.B -e +Convert the read base to = if it is identical to the aligned reference +base. Indel caller does not support the = bases at the moment. +.TP +.B -u +Output uncompressed BAM +.TP +.B -b +Output compressed BAM +.TP +.B -S +The input is SAM with header lines +.TP +.BI -C \ INT +Coefficient to cap mapping quality of poorly mapped reads. See the +.B pileup +command for details. [0] +.TP +.B -r +Compute the BQ tag (without -A) or cap base quality by BAQ (with -A). +.TP +.B -E +Extended BAQ calculation. This option trades specificity for sensitivity, though the +effect is minor. +.RE + +.TP +.B targetcut +samtools targetcut [-Q minBaseQ] [-i inPenalty] [-0 em0] [-1 em1] [-2 em2] [-f ref] <in.bam> + +This command identifies target regions by examining the continuity of read depth, computes +haploid consensus sequences of targets and outputs a SAM with each sequence corresponding +to a target. When option +.B -f +is in use, BAQ will be applied. This command is +.B only +designed for cutting fosmid clones from fosmid pool sequencing [Ref. Kitzman et al. (2010)]. +.RE + +.TP +.B phase +samtools phase [-AF] [-k len] [-b prefix] [-q minLOD] [-Q minBaseQ] <in.bam> + +Call and phase heterozygous SNPs. +.B OPTIONS: +.RS +.TP 8 +.B -A +Drop reads with ambiguous phase. +.TP 8 +.BI -b \ STR +Prefix of BAM output. When this option is in use, phase-0 reads will be saved in file +.BR STR .0.bam +and phase-1 reads in +.BR STR .1.bam. +Phase unknown reads will be randomly allocated to one of the two files. Chimeric reads +with switch errors will be saved in +.BR STR .chimeric.bam. 
+[null] +.TP +.B -F +Do not attempt to fix chimeric reads. +.TP +.BI -k \ INT +Maximum length for local phasing. [13] +.TP +.BI -q \ INT +Minimum Phred-scaled LOD to call a heterozygote. [40] +.TP +.BI -Q \ INT +Minimum base quality to be used in het calling. [13] +.RE + +.SH BCFTOOLS COMMANDS AND OPTIONS + +.TP 10 +.B view +.B bcftools view +.RB [ \-AbFGNQSucgv ] +.RB [ \-D +.IR seqDict ] +.RB [ \-l +.IR listLoci ] +.RB [ \-s +.IR listSample ] +.RB [ \-i +.IR gapSNPratio ] +.RB [ \-t +.IR mutRate ] +.RB [ \-p +.IR varThres ] +.RB [ \-m +.IR varThres ] +.RB [ \-P +.IR prior ] +.RB [ \-1 +.IR nGroup1 ] +.RB [ \-d +.IR minFrac ] +.RB [ \-U +.IR nPerm ] +.RB [ \-X +.IR permThres ] +.RB [ \-T +.IR trioType ] +.I in.bcf +.RI [ region ] + +Convert between BCF and VCF, call variant candidates and estimate allele +frequencies. + +.RS +.TP +.B Input/Output Options: +.TP 10 +.B -A +Retain all possible alternate alleles at variant sites. By default, the view +command discards unlikely alleles. +.TP 10 +.B -b +Output in the BCF format. The default is VCF. +.TP +.BI -D \ FILE +Sequence dictionary (list of chromosome names) for VCF->BCF conversion [null] +.TP +.B -F +Indicate PL is generated by r921 or before (ordering is different). +.TP +.B -G +Suppress all individual genotype information. +.TP +.BI -l \ FILE +List of sites at which information are outputted [all sites] +.TP +.B -N +Skip sites where the REF field is not A/C/G/T +.TP +.B -Q +Output the QCALL likelihood format +.TP +.BI -s \ FILE +List of samples to use. The first column in the input gives the sample names +and the second gives the ploidy, which can only be 1 or 2. When the 2nd column +is absent, the sample ploidy is assumed to be 2. In the output, the ordering of +samples will be identical to the one in +.IR FILE . +[null] +.TP +.B -S +The input is VCF instead of BCF. +.TP +.B -u +Uncompressed BCF output (force -b). 
+.TP +.B Consensus/Variant Calling Options: +.TP 10 +.B -c +Call variants using Bayesian inference. This option automatically invokes option +.BR -e . +.TP +.BI -d \ FLOAT +When +.B -v +is in use, skip loci where the fraction of samples covered by reads is below FLOAT. [0] +.TP +.B -e +Perform max-likelihood inference only, including estimating the site allele frequency, +testing Hardy-Weinberg equilibrium and testing associations with LRT. +.TP +.B -g +Call per-sample genotypes at variant sites (force -c) +.TP +.BI -i \ FLOAT +Ratio of INDEL-to-SNP mutation rate [0.15] +.TP +.BI -m \ FLOAT +New model for improved multiallelic and rare-variant calling. Another +ALT allele is accepted if P(chi^2) of LRT exceeds the FLOAT threshold. The +parameter seems robust and the actual value usually does not affect the results +much; a good value to use is 0.99. This is the recommended calling method. [0] +.TP +.BI -p \ FLOAT +A site is considered to be a variant if P(ref|D)<FLOAT [0.5] +.TP +.BI -P \ STR +Prior or initial allele frequency spectrum. STR can be +.IR full , +.IR cond2 , +.I flat +or the file consisting of error output from a previous variant calling +run. +.TP +.BI -t \ FLOAT +Scaled mutation rate for variant calling [0.001] +.TP +.BI -T \ STR +Enable pair/trio calling. For trio calling, option +.B -s +is usually needed to be applied to configure the trio members and their ordering. +In the file supplied to the option +.BR -s , +the first sample must be the child, the second the father and the third the mother. +The valid values of +.I STR +are `pair', `trioauto', `trioxd' and `trioxs', where `pair' calls differences between two input samples, and `trioxd' (`trioxs') specifies that the input +is from the X chromosome non-PAR regions and the child is a female (male). [null] +.TP +.B -v +Output variant sites only (force -c) +.TP +.B Contrast Calling and Association Test Options: +.TP +.BI -1 \ INT +Number of group-1 samples. 
This option is used for dividing the samples into +two groups for contrast SNP calling or association test. +When this option is in use, the following VCF INFO will be outputted: +PC2, PCHI2 and QCHI2. [0] +.TP +.BI -U \ INT +Number of permutations for association test (effective only with +.BR -1 ) +[0] +.TP +.BI -X \ FLOAT +Only perform permutations for P(chi^2)<FLOAT (effective only with +.BR -U ) +[0.01] +.RE + +.TP +.B index +.B bcftools index +.I in.bcf + +Index sorted BCF for random access. +.RE + +.TP +.B cat +.B bcftools cat +.I in1.bcf +.RI [ "in2.bcf " [ ... "]]]" + +Concatenate BCF files. The input files are required to be sorted and +have identical samples appearing in the same order. +.RE +.SH SAM FORMAT + +Sequence Alignment/Map (SAM) format is TAB-delimited. Apart from the header lines, which are started +with the `@' symbol, each alignment line consists of: + +.TS +center box; +cb | cb | cb +n | l | l . +Col Field Description +_ +1 QNAME Query template/pair NAME +2 FLAG bitwise FLAG +3 RNAME Reference sequence NAME +4 POS 1-based leftmost POSition/coordinate of clipped sequence +5 MAPQ MAPping Quality (Phred-scaled) +6 CIGAR extended CIGAR string +7 MRNM Mate Reference sequence NaMe (`=' if same as RNAME) +8 MPOS 1-based Mate POSition +9 TLEN inferred Template LENgth (insert size) +10 SEQ query SEQuence on the same strand as the reference +11 QUAL query QUALity (ASCII-33 gives the Phred base quality) +12+ OPT variable OPTional fields in the format TAG:VTYPE:VALUE +.TE + +.PP +Each bit in the FLAG field is defined as: + +.TS +center box; +cb | cb | cb +l | c | l . 
+Flag Chr Description +_ +0x0001 p the read is paired in sequencing +0x0002 P the read is mapped in a proper pair +0x0004 u the query sequence itself is unmapped +0x0008 U the mate is unmapped +0x0010 r strand of the query (1 for reverse) +0x0020 R strand of the mate +0x0040 1 the read is the first read in a pair +0x0080 2 the read is the second read in a pair +0x0100 s the alignment is not primary +0x0200 f the read fails platform/vendor quality checks +0x0400 d the read is either a PCR or an optical duplicate +.TE + +where the second column gives the string representation of the FLAG field. + +.SH VCF FORMAT + +The Variant Call Format (VCF) is a TAB-delimited format in which each data line consists of the following fields: +.TS +center box; +cb | cb | cb +n | l | l . +Col Field Description +_ +1 CHROM CHROMosome name +2 POS the left-most POSition of the variant +3 ID unique variant IDentifier +4 REF the REFerence allele +5 ALT the ALTernate allele(s), separated by comma +6 QUAL variant/reference QUALity +7 FILTER FILTers applied +8 INFO INFOrmation related to the variant, separated by semi-colon +9 FORMAT FORMAT of the genotype fields, separated by colon (optional) +10+ SAMPLE SAMPLE genotypes and per-sample information (optional) +.TE + +.PP +The following table gives the +.B INFO +tags used by samtools and bcftools. + +.TS +center box; +cb | cb | cb +l | l | l . +Tag Format Description +_ +AF1 double Max-likelihood estimate of the site allele frequency (AF) of the first ALT allele +DP int Raw read depth (without quality filtering) +DP4 int[4] # high-quality reference forward bases, ref reverse, alternate for and alt rev bases +FQ int Consensus quality.
Positive: sample genotypes different; negative: otherwise +MQ int Root-Mean-Square mapping quality of covering reads +PC2 int[2] Phred probability of AF in group1 samples being larger (,smaller) than in group2 +PCHI2 double Posterior weighted chi^2 P-value between group1 and group2 samples +PV4 double[4] P-value for strand bias, baseQ bias, mapQ bias and tail distance bias +QCHI2 int Phred-scaled PCHI2 +RP int # permutations yielding a smaller PCHI2 +CLR int Phred log ratio of genotype likelihoods with and without the trio/pair constraint +UGT string Most probable genotype configuration without the trio constraint +CGT string Most probable configuration with the trio constraint +VDB float Tests variant positions within reads. Intended for filtering RNA-seq artifacts around splice sites +RPB float Mann-Whitney rank-sum test for tail distance bias +HWE float Hardy-Weinberg equilibrium test, Wigginton et al., PMID: 15789306 +.TE + +.SH EXAMPLES +.IP o 2 +Import SAM to BAM when +.B @SQ +lines are present in the header: + + samtools view -bS aln.sam > aln.bam + +If +.B @SQ +lines are absent: + + samtools faidx ref.fa + samtools view -bt ref.fa.fai aln.sam > aln.bam + +where +.I ref.fa.fai +is generated automatically by the +.B faidx +command. + +.IP o 2 +Attach the +.B RG +tag while merging sorted alignments: + + perl -e 'print "@RG\\tID:ga\\tSM:hs\\tLB:ga\\tPL:Illumina\\n@RG\\tID:454\\tSM:hs\\tLB:454\\tPL:454\\n"' > rg.txt + samtools merge -rh rg.txt merged.bam ga.bam 454.bam + +The value in a +.B RG +tag is determined by the file name the read is coming from. In this +example, in the +.IR merged.bam , +reads from +.I ga.bam +will be attached +.IR RG:Z:ga , +while reads from +.I 454.bam +will be attached +.IR RG:Z:454 . 
+ +.IP o 2 +Call SNPs and short INDELs for one diploid individual: + + samtools mpileup -ugf ref.fa aln.bam | bcftools view -bvcg - > var.raw.bcf + bcftools view var.raw.bcf | vcfutils.pl varFilter -D 100 > var.flt.vcf + +The +.B -D +option of varFilter controls the maximum read depth, which should be +adjusted to about twice the average read depth. One may consider adding +.B -C50 +to +.B mpileup +if mapping quality is overestimated for reads containing excessive +mismatches. Applying this option usually helps +.B BWA-short +but may not help other mappers. + +.IP o 2 +Generate the consensus sequence for one diploid individual: + + samtools mpileup -uf ref.fa aln.bam | bcftools view -cg - | vcfutils.pl vcf2fq > cns.fq + +.IP o 2 +Call somatic mutations from a pair of samples: + + samtools mpileup -DSuf ref.fa aln.bam | bcftools view -bvcgT pair - > var.bcf + +In the output INFO field, +.I CLR +gives the Phred-log ratio between the likelihood by treating the +two samples independently, and the likelihood by requiring the genotype to be identical. +This +.I CLR +is effectively a score measuring the confidence of somatic calls. The higher the better. + +.IP o 2 +Call de novo and somatic mutations from a family trio: + + samtools mpileup -DSuf ref.fa aln.bam | bcftools view -bvcgT trioauto -s samples.txt - > var.bcf + +File +.I samples.txt +should consist of three lines specifying the member and order of samples (in the order of child-father-mother). +Similarly, +.I CLR +gives the Phred-log likelihood ratio with and without the trio constraint. +.I UGT +shows the most likely genotype configuration without the trio constraint, and +.I CGT +gives the most likely genotype configuration satisfying the trio constraint. + +.IP o 2 +Phase one individual: + + samtools calmd -AEur aln.bam ref.fa | samtools phase -b prefix - > phase.out + +The +.B calmd +command is used to reduce false heterozygotes around INDELs.
+ +.IP o 2 +Call SNPs and short indels for multiple diploid individuals: + + samtools mpileup -P ILLUMINA -ugf ref.fa *.bam | bcftools view -bcvg - > var.raw.bcf + bcftools view var.raw.bcf | vcfutils.pl varFilter -D 2000 > var.flt.vcf + +Individuals are identified from the +.B SM +tags in the +.B @RG +header lines. Individuals can be pooled in one alignment file; one +individual can also be separated into multiple files. The +.B -P +option specifies that indel candidates should be collected only from +read groups with the +.B @RG-PL +tag set to +.IR ILLUMINA . +Collecting indel candidates from reads sequenced by an indel-prone +technology may affect the performance of indel calling. + +Note that there is a new calling model which can be invoked by + + bcftools view -m0.99 ... + +which fixes some severe limitations of the default method. + +For filtering, best results seem to be achieved by first applying the +.IR SnpGap +filter and then applying some machine learning approach + + vcf-annotate -f SnpGap=n + vcf filter ... + +Both can be found in the +.B vcftools +and +.B htslib +package (links below). + +.IP o 2 +Derive the allele frequency spectrum (AFS) on a list of sites from multiple individuals: + + samtools mpileup -Igf ref.fa *.bam > all.bcf + bcftools view -bl sites.list all.bcf > sites.bcf + bcftools view -cGP cond2 sites.bcf > /dev/null 2> sites.1.afs + bcftools view -cGP sites.1.afs sites.bcf > /dev/null 2> sites.2.afs + bcftools view -cGP sites.2.afs sites.bcf > /dev/null 2> sites.3.afs + ...... + +where +.I sites.list +contains the list of sites with each line consisting of the reference +sequence name and position. The following +.B bcftools +commands estimate AFS by EM. + +.IP o 2 +Dump BAQ applied alignment for other SNP callers: + + samtools calmd -bAr aln.bam > aln.baq.bam + +It adds and corrects the +.B NM +and +.B MD +tags at the same time. 
The +.B calmd +command also comes with the +.B -C +option, the same as the one in +.B pileup +and +.BR mpileup . +Apply if it helps. + +.SH LIMITATIONS +.PP +.IP o 2 +Unaligned words are used in bam_import.c, bam_endian.h, bam.c and bam_aux.c. +.IP o 2 +Samtools paired-end rmdup does not work for unpaired reads (e.g. orphan +reads or ends mapped to different chromosomes). If this is a concern, +please use Picard's MarkDuplicates which correctly handles these cases, +although a little slower. + +.SH AUTHOR +.PP +Heng Li from the Sanger Institute wrote the C version of samtools. Bob +Handsaker from the Broad Institute implemented the BGZF library and Jue +Ruan from Beijing Genomics Institute wrote the RAZF library. John +Marshall and Petr Danecek contributed to the source code and various +people from the 1000 Genomes Project have contributed to the SAM format +specification. + +.SH SEE ALSO +.PP +Samtools website: <http://samtools.sourceforge.net> +.br +Samtools latest source: <https://github.com/samtools/samtools> +.br +VCFtools website with stable link to VCF specification: <http://vcftools.sourceforge.net> +.br +HTSlib website: <https://github.com/samtools/htslib>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/win32/xcurses.h Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,1377 @@ +/* Public Domain Curses */ + +/* $Id: curses.h,v 1.295 2008/07/15 17:13:25 wmcbrine Exp $ */ + +/*----------------------------------------------------------------------* + * PDCurses * + *----------------------------------------------------------------------*/ + +#ifndef __PDCURSES__ +#define __PDCURSES__ 1 + +/*man-start************************************************************** + +PDCurses definitions list: (Only define those needed) + + XCURSES True if compiling for X11. + PDC_RGB True if you want to use RGB color definitions + (Red = 1, Green = 2, Blue = 4) instead of BGR. + PDC_WIDE True if building wide-character support. + PDC_DLL_BUILD True if building a Win32 DLL. + NCURSES_MOUSE_VERSION Use the ncurses mouse API instead + of PDCurses' traditional mouse API. + +PDCurses portable platform definitions list: + + PDC_BUILD Defines API build version. + PDCURSES Enables access to PDCurses-only routines. + XOPEN Always true. + SYSVcurses True if you are compiling for SYSV portability. + BSDcurses True if you are compiling for BSD portability. 
+ +**man-end****************************************************************/ + +#define PDC_BUILD 3401 +#define PDCURSES 1 /* PDCurses-only routines */ +#define XOPEN 1 /* X/Open Curses routines */ +#define SYSVcurses 1 /* System V Curses routines */ +#define BSDcurses 1 /* BSD Curses routines */ +#define CHTYPE_LONG 1 /* size of chtype; long */ + +/*----------------------------------------------------------------------*/ + +#include <stdarg.h> +#include <stddef.h> +#include <stdio.h> /* Required by X/Open usage below */ + +#ifdef PDC_WIDE +# include <wchar.h> +#endif + +#if defined(__cplusplus) || defined(__cplusplus__) || defined(__CPLUSPLUS) +extern "C" +{ +# define bool _bool +#endif + +/*---------------------------------------------------------------------- + * + * PDCurses Manifest Constants + * + */ + +#ifndef FALSE +# define FALSE 0 +#endif +#ifndef TRUE +# define TRUE 1 +#endif +#ifndef NULL +# define NULL (void *)0 +#endif +#ifndef ERR +# define ERR (-1) +#endif +#ifndef OK +# define OK 0 +#endif + +/*---------------------------------------------------------------------- + * + * PDCurses Type Declarations + * + */ + +typedef unsigned char bool; /* PDCurses Boolean type */ + +#ifdef CHTYPE_LONG +# if _LP64 +typedef unsigned int chtype; +# else +typedef unsigned long chtype; /* 16-bit attr + 16-bit char */ +# endif +#else +typedef unsigned short chtype; /* 8-bit attr + 8-bit char */ +#endif + +#ifdef PDC_WIDE +typedef chtype cchar_t; +#endif + +typedef chtype attr_t; + +/*---------------------------------------------------------------------- + * + * PDCurses Mouse Interface -- SYSVR4, with extensions + * + */ + +typedef struct +{ + int x; /* absolute column, 0 based, measured in characters */ + int y; /* absolute row, 0 based, measured in characters */ + short button[3]; /* state of each button */ + int changes; /* flags indicating what has changed with the mouse */ +} MOUSE_STATUS; + +#define BUTTON_RELEASED 0x0000 +#define BUTTON_PRESSED 0x0001 +#define 
BUTTON_CLICKED 0x0002 +#define BUTTON_DOUBLE_CLICKED 0x0003 +#define BUTTON_TRIPLE_CLICKED 0x0004 +#define BUTTON_MOVED 0x0005 /* PDCurses */ +#define WHEEL_SCROLLED 0x0006 /* PDCurses */ +#define BUTTON_ACTION_MASK 0x0007 /* PDCurses */ + +#define PDC_BUTTON_SHIFT 0x0008 /* PDCurses */ +#define PDC_BUTTON_CONTROL 0x0010 /* PDCurses */ +#define PDC_BUTTON_ALT 0x0020 /* PDCurses */ +#define BUTTON_MODIFIER_MASK 0x0038 /* PDCurses */ + +#define MOUSE_X_POS (Mouse_status.x) +#define MOUSE_Y_POS (Mouse_status.y) + +/* + * Bits associated with the .changes field: + * 3 2 1 0 + * 210987654321098765432109876543210 + * 1 <- button 1 has changed + * 10 <- button 2 has changed + * 100 <- button 3 has changed + * 1000 <- mouse has moved + * 10000 <- mouse position report + * 100000 <- mouse wheel up + * 1000000 <- mouse wheel down + */ + +#define PDC_MOUSE_MOVED 0x0008 +#define PDC_MOUSE_POSITION 0x0010 +#define PDC_MOUSE_WHEEL_UP 0x0020 +#define PDC_MOUSE_WHEEL_DOWN 0x0040 + +#define A_BUTTON_CHANGED (Mouse_status.changes & 7) +#define MOUSE_MOVED (Mouse_status.changes & PDC_MOUSE_MOVED) +#define MOUSE_POS_REPORT (Mouse_status.changes & PDC_MOUSE_POSITION) +#define BUTTON_CHANGED(x) (Mouse_status.changes & (1 << ((x) - 1))) +#define BUTTON_STATUS(x) (Mouse_status.button[(x) - 1]) +#define MOUSE_WHEEL_UP (Mouse_status.changes & PDC_MOUSE_WHEEL_UP) +#define MOUSE_WHEEL_DOWN (Mouse_status.changes & PDC_MOUSE_WHEEL_DOWN) + +/* mouse bit-masks */ + +#define BUTTON1_RELEASED 0x00000001L +#define BUTTON1_PRESSED 0x00000002L +#define BUTTON1_CLICKED 0x00000004L +#define BUTTON1_DOUBLE_CLICKED 0x00000008L +#define BUTTON1_TRIPLE_CLICKED 0x00000010L +#define BUTTON1_MOVED 0x00000010L /* PDCurses */ + +#define BUTTON2_RELEASED 0x00000020L +#define BUTTON2_PRESSED 0x00000040L +#define BUTTON2_CLICKED 0x00000080L +#define BUTTON2_DOUBLE_CLICKED 0x00000100L +#define BUTTON2_TRIPLE_CLICKED 0x00000200L +#define BUTTON2_MOVED 0x00000200L /* PDCurses */ + +#define BUTTON3_RELEASED 0x00000400L 
+#define BUTTON3_PRESSED 0x00000800L +#define BUTTON3_CLICKED 0x00001000L +#define BUTTON3_DOUBLE_CLICKED 0x00002000L +#define BUTTON3_TRIPLE_CLICKED 0x00004000L +#define BUTTON3_MOVED 0x00004000L /* PDCurses */ + +/* For the ncurses-compatible functions only, BUTTON4_PRESSED and + BUTTON5_PRESSED are returned for mouse scroll wheel up and down; + otherwise PDCurses doesn't support buttons 4 and 5 */ + +#define BUTTON4_RELEASED 0x00008000L +#define BUTTON4_PRESSED 0x00010000L +#define BUTTON4_CLICKED 0x00020000L +#define BUTTON4_DOUBLE_CLICKED 0x00040000L +#define BUTTON4_TRIPLE_CLICKED 0x00080000L + +#define BUTTON5_RELEASED 0x00100000L +#define BUTTON5_PRESSED 0x00200000L +#define BUTTON5_CLICKED 0x00400000L +#define BUTTON5_DOUBLE_CLICKED 0x00800000L +#define BUTTON5_TRIPLE_CLICKED 0x01000000L + +#define MOUSE_WHEEL_SCROLL 0x02000000L /* PDCurses */ +#define BUTTON_MODIFIER_SHIFT 0x04000000L /* PDCurses */ +#define BUTTON_MODIFIER_CONTROL 0x08000000L /* PDCurses */ +#define BUTTON_MODIFIER_ALT 0x10000000L /* PDCurses */ + +#define ALL_MOUSE_EVENTS 0x1fffffffL +#define REPORT_MOUSE_POSITION 0x20000000L + +/* ncurses mouse interface */ + +typedef unsigned long mmask_t; + +typedef struct +{ + short id; /* unused, always 0 */ + int x, y, z; /* x, y same as MOUSE_STATUS; z unused */ + mmask_t bstate; /* equivalent to changes + button[], but + in the same format as used for mousemask() */ +} MEVENT; + +#ifdef NCURSES_MOUSE_VERSION +# define BUTTON_SHIFT BUTTON_MODIFIER_SHIFT +# define BUTTON_CONTROL BUTTON_MODIFIER_CONTROL +# define BUTTON_CTRL BUTTON_MODIFIER_CONTROL +# define BUTTON_ALT BUTTON_MODIFIER_ALT +#else +# define BUTTON_SHIFT PDC_BUTTON_SHIFT +# define BUTTON_CONTROL PDC_BUTTON_CONTROL +# define BUTTON_ALT PDC_BUTTON_ALT +#endif + +/*---------------------------------------------------------------------- + * + * PDCurses Structure Definitions + * + */ + +typedef struct _win /* definition of a window */ +{ + int _cury; /* current pseudo-cursor */ + int 
_curx; + int _maxy; /* max window coordinates */ + int _maxx; + int _begy; /* origin on screen */ + int _begx; + int _flags; /* window properties */ + chtype _attrs; /* standard attributes and colors */ + chtype _bkgd; /* background, normally blank */ + bool _clear; /* causes clear at next refresh */ + bool _leaveit; /* leaves cursor where it is */ + bool _scroll; /* allows window scrolling */ + bool _nodelay; /* input character wait flag */ + bool _immed; /* immediate update flag */ + bool _sync; /* synchronise window ancestors */ + bool _use_keypad; /* flags keypad key mode active */ + chtype **_y; /* pointer to line pointer array */ + int *_firstch; /* first changed character in line */ + int *_lastch; /* last changed character in line */ + int _tmarg; /* top of scrolling region */ + int _bmarg; /* bottom of scrolling region */ + int _delayms; /* milliseconds of delay for getch() */ + int _parx, _pary; /* coords relative to parent (0,0) */ + struct _win *_parent; /* subwin's pointer to parent win */ +} WINDOW; + +/* Avoid using the SCREEN struct directly -- use the corresponding + functions if possible. This struct may eventually be made private. */ + +typedef struct +{ + bool alive; /* if initscr() called, and not endwin() */ + bool autocr; /* if cr -> lf */ + bool cbreak; /* if terminal unbuffered */ + bool echo; /* if terminal echo */ + bool raw_inp; /* raw input mode (v. cooked input) */ + bool raw_out; /* raw output mode (7 v. 
8 bits) */ + bool audible; /* FALSE if the bell is visual */ + bool mono; /* TRUE if current screen is mono */ + bool resized; /* TRUE if TERM has been resized */ + bool orig_attr; /* TRUE if we have the original colors */ + short orig_fore; /* original screen foreground color */ + short orig_back; /* original screen background color */ + int cursrow; /* position of physical cursor */ + int curscol; /* position of physical cursor */ + int visibility; /* visibility of cursor */ + int orig_cursor; /* original cursor size */ + int lines; /* new value for LINES */ + int cols; /* new value for COLS */ + unsigned long _trap_mbe; /* trap these mouse button events */ + unsigned long _map_mbe_to_key; /* map mouse buttons to slk */ + int mouse_wait; /* time to wait (in ms) for a + button release after a press, in + order to count it as a click */ + int slklines; /* lines in use by slk_init() */ + WINDOW *slk_winptr; /* window for slk */ + int linesrippedoff; /* lines ripped off via ripoffline() */ + int linesrippedoffontop; /* lines ripped off on + top via ripoffline() */ + int delaytenths; /* 1/10ths second to wait block + getch() for */ + bool _preserve; /* TRUE if screen background + to be preserved */ + int _restore; /* specifies if screen background + to be restored, and how */ + bool save_key_modifiers; /* TRUE if each key modifiers saved + with each key press */ + bool return_key_modifiers; /* TRUE if modifier keys are + returned as "real" keys */ + bool key_code; /* TRUE if last key is a special key; + used internally by get_wch() */ +#ifdef XCURSES + int XcurscrSize; /* size of Xcurscr shared memory block */ + bool sb_on; + int sb_viewport_y; + int sb_viewport_x; + int sb_total_y; + int sb_total_x; + int sb_cur_y; + int sb_cur_x; +#endif + short line_color; /* color of line attributes - default -1 */ +} SCREEN; + +/*---------------------------------------------------------------------- + * + * PDCurses External Variables + * + */ + +#ifdef PDC_DLL_BUILD +# ifdef
CURSES_LIBRARY +# define PDCEX __declspec(dllexport) extern +# else +# define PDCEX __declspec(dllimport) +# endif +#else +# define PDCEX extern +#endif + +PDCEX int LINES; /* terminal height */ +PDCEX int COLS; /* terminal width */ +PDCEX WINDOW *stdscr; /* the default screen window */ +PDCEX WINDOW *curscr; /* the current screen image */ +PDCEX SCREEN *SP; /* curses variables */ +PDCEX MOUSE_STATUS Mouse_status; +PDCEX int COLORS; +PDCEX int COLOR_PAIRS; +PDCEX int TABSIZE; +PDCEX chtype acs_map[]; /* alternate character set map */ +PDCEX char ttytype[]; /* terminal name/description */ + +/*man-start************************************************************** + +PDCurses Text Attributes +======================== + +Originally, PDCurses used a short (16 bits) for its chtype. To include +color, a number of things had to be sacrificed from the strict Unix and +System V support. The main problem was fitting all character attributes +and color into an unsigned char (all 8 bits!). + +Today, PDCurses by default uses a long (32 bits) for its chtype, as in +System V. The short chtype is still available, by undefining CHTYPE_LONG +and rebuilding the library. + +The following is the structure of a win->_attrs chtype: + +short form: + +------------------------------------------------- +|15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0| +------------------------------------------------- + color number | attrs | character eg 'a' + +The available non-color attributes are bold, reverse and blink. Others +have no effect. The high order char is an index into an array of +physical colors (defined in color.c) -- 32 foreground/background color +pairs (5 bits) plus 3 bits for other attributes. 
+ +long form: + +---------------------------------------------------------------------------- +|31|30|29|28|27|26|25|24|23|22|21|20|19|18|17|16|15|14|13|12|..| 3| 2| 1| 0| +---------------------------------------------------------------------------- + color number | modifiers | character eg 'a' + +The available non-color attributes are bold, underline, invisible, +right-line, left-line, protect, reverse and blink. 256 color pairs (8 +bits), 8 bits for other attributes, and 16 bits for character data. + +**man-end****************************************************************/ + +/*** Video attribute macros ***/ + +#define A_NORMAL (chtype)0 + +#ifdef CHTYPE_LONG +# define A_ALTCHARSET (chtype)0x00010000 +# define A_RIGHTLINE (chtype)0x00020000 +# define A_LEFTLINE (chtype)0x00040000 +# define A_INVIS (chtype)0x00080000 +# define A_UNDERLINE (chtype)0x00100000 +# define A_REVERSE (chtype)0x00200000 +# define A_BLINK (chtype)0x00400000 +# define A_BOLD (chtype)0x00800000 + +# define A_ATTRIBUTES (chtype)0xffff0000 +# define A_CHARTEXT (chtype)0x0000ffff +# define A_COLOR (chtype)0xff000000 + +# define A_ITALIC A_INVIS +# define A_PROTECT (A_UNDERLINE | A_LEFTLINE | A_RIGHTLINE) + +# define PDC_ATTR_SHIFT 19 +# define PDC_COLOR_SHIFT 24 +#else +# define A_BOLD (chtype)0x0100 /* X/Open */ +# define A_REVERSE (chtype)0x0200 /* X/Open */ +# define A_BLINK (chtype)0x0400 /* X/Open */ + +# define A_ATTRIBUTES (chtype)0xff00 /* X/Open */ +# define A_CHARTEXT (chtype)0x00ff /* X/Open */ +# define A_COLOR (chtype)0xf800 /* System V */ + +# define A_ALTCHARSET A_NORMAL /* X/Open */ +# define A_PROTECT A_NORMAL /* X/Open */ +# define A_UNDERLINE A_NORMAL /* X/Open */ + +# define A_LEFTLINE A_NORMAL +# define A_RIGHTLINE A_NORMAL +# define A_ITALIC A_NORMAL +# define A_INVIS A_NORMAL + +# define PDC_ATTR_SHIFT 8 +# define PDC_COLOR_SHIFT 11 +#endif + +#define A_STANDOUT (A_REVERSE | A_BOLD) /* X/Open */ +#define A_DIM A_NORMAL + +#define CHR_MSK A_CHARTEXT /* Obsolete */ 
+#define ATR_MSK A_ATTRIBUTES /* Obsolete */ +#define ATR_NRM A_NORMAL /* Obsolete */ + +/* For use with attr_t -- X/Open says, "these shall be distinct", so + this is a non-conforming implementation. */ + +#define WA_ALTCHARSET A_ALTCHARSET +#define WA_BLINK A_BLINK +#define WA_BOLD A_BOLD +#define WA_DIM A_DIM +#define WA_INVIS A_INVIS +#define WA_LEFT A_LEFTLINE +#define WA_PROTECT A_PROTECT +#define WA_REVERSE A_REVERSE +#define WA_RIGHT A_RIGHTLINE +#define WA_STANDOUT A_STANDOUT +#define WA_UNDERLINE A_UNDERLINE + +#define WA_HORIZONTAL A_NORMAL +#define WA_LOW A_NORMAL +#define WA_TOP A_NORMAL +#define WA_VERTICAL A_NORMAL + +/*** Alternate character set macros ***/ + +/* 'w' = 32-bit chtype; acs_map[] index | A_ALTCHARSET + 'n' = 16-bit chtype; it gets the fallback set because no bit is + available for A_ALTCHARSET */ + +#ifdef CHTYPE_LONG +# define ACS_PICK(w, n) ((chtype)w | A_ALTCHARSET) +#else +# define ACS_PICK(w, n) ((chtype)n) +#endif + +/* VT100-compatible symbols -- box chars */ + +#define ACS_ULCORNER ACS_PICK('l', '+') +#define ACS_LLCORNER ACS_PICK('m', '+') +#define ACS_URCORNER ACS_PICK('k', '+') +#define ACS_LRCORNER ACS_PICK('j', '+') +#define ACS_RTEE ACS_PICK('u', '+') +#define ACS_LTEE ACS_PICK('t', '+') +#define ACS_BTEE ACS_PICK('v', '+') +#define ACS_TTEE ACS_PICK('w', '+') +#define ACS_HLINE ACS_PICK('q', '-') +#define ACS_VLINE ACS_PICK('x', '|') +#define ACS_PLUS ACS_PICK('n', '+') + +/* VT100-compatible symbols -- other */ + +#define ACS_S1 ACS_PICK('o', '-') +#define ACS_S9 ACS_PICK('s', '_') +#define ACS_DIAMOND ACS_PICK('`', '+') +#define ACS_CKBOARD ACS_PICK('a', ':') +#define ACS_DEGREE ACS_PICK('f', '\'') +#define ACS_PLMINUS ACS_PICK('g', '#') +#define ACS_BULLET ACS_PICK('~', 'o') + +/* Teletype 5410v1 symbols -- these are defined in SysV curses, but + are not well-supported by most terminals. Stick to VT100 characters + for optimum portability. 
*/ + +#define ACS_LARROW ACS_PICK(',', '<') +#define ACS_RARROW ACS_PICK('+', '>') +#define ACS_DARROW ACS_PICK('.', 'v') +#define ACS_UARROW ACS_PICK('-', '^') +#define ACS_BOARD ACS_PICK('h', '#') +#define ACS_LANTERN ACS_PICK('i', '*') +#define ACS_BLOCK ACS_PICK('0', '#') + +/* That goes double for these -- undocumented SysV symbols. Don't use + them. */ + +#define ACS_S3 ACS_PICK('p', '-') +#define ACS_S7 ACS_PICK('r', '-') +#define ACS_LEQUAL ACS_PICK('y', '<') +#define ACS_GEQUAL ACS_PICK('z', '>') +#define ACS_PI ACS_PICK('{', 'n') +#define ACS_NEQUAL ACS_PICK('|', '+') +#define ACS_STERLING ACS_PICK('}', 'L') + +/* Box char aliases */ + +#define ACS_BSSB ACS_ULCORNER +#define ACS_SSBB ACS_LLCORNER +#define ACS_BBSS ACS_URCORNER +#define ACS_SBBS ACS_LRCORNER +#define ACS_SBSS ACS_RTEE +#define ACS_SSSB ACS_LTEE +#define ACS_SSBS ACS_BTEE +#define ACS_BSSS ACS_TTEE +#define ACS_BSBS ACS_HLINE +#define ACS_SBSB ACS_VLINE +#define ACS_SSSS ACS_PLUS + +/* cchar_t aliases */ + +#ifdef PDC_WIDE +# define WACS_ULCORNER (&(acs_map['l'])) +# define WACS_LLCORNER (&(acs_map['m'])) +# define WACS_URCORNER (&(acs_map['k'])) +# define WACS_LRCORNER (&(acs_map['j'])) +# define WACS_RTEE (&(acs_map['u'])) +# define WACS_LTEE (&(acs_map['t'])) +# define WACS_BTEE (&(acs_map['v'])) +# define WACS_TTEE (&(acs_map['w'])) +# define WACS_HLINE (&(acs_map['q'])) +# define WACS_VLINE (&(acs_map['x'])) +# define WACS_PLUS (&(acs_map['n'])) + +# define WACS_S1 (&(acs_map['o'])) +# define WACS_S9 (&(acs_map['s'])) +# define WACS_DIAMOND (&(acs_map['`'])) +# define WACS_CKBOARD (&(acs_map['a'])) +# define WACS_DEGREE (&(acs_map['f'])) +# define WACS_PLMINUS (&(acs_map['g'])) +# define WACS_BULLET (&(acs_map['~'])) + +# define WACS_LARROW (&(acs_map[','])) +# define WACS_RARROW (&(acs_map['+'])) +# define WACS_DARROW (&(acs_map['.'])) +# define WACS_UARROW (&(acs_map['-'])) +# define WACS_BOARD (&(acs_map['h'])) +# define WACS_LANTERN (&(acs_map['i'])) +# define WACS_BLOCK 
(&(acs_map['0'])) + +# define WACS_S3 (&(acs_map['p'])) +# define WACS_S7 (&(acs_map['r'])) +# define WACS_LEQUAL (&(acs_map['y'])) +# define WACS_GEQUAL (&(acs_map['z'])) +# define WACS_PI (&(acs_map['{'])) +# define WACS_NEQUAL (&(acs_map['|'])) +# define WACS_STERLING (&(acs_map['}'])) + +# define WACS_BSSB WACS_ULCORNER +# define WACS_SSBB WACS_LLCORNER +# define WACS_BBSS WACS_URCORNER +# define WACS_SBBS WACS_LRCORNER +# define WACS_SBSS WACS_RTEE +# define WACS_SSSB WACS_LTEE +# define WACS_SSBS WACS_BTEE +# define WACS_BSSS WACS_TTEE +# define WACS_BSBS WACS_HLINE +# define WACS_SBSB WACS_VLINE +# define WACS_SSSS WACS_PLUS +#endif + +/*** Color macros ***/ + +#define COLOR_BLACK 0 + +#ifdef PDC_RGB /* RGB */ +# define COLOR_RED 1 +# define COLOR_GREEN 2 +# define COLOR_BLUE 4 +#else /* BGR */ +# define COLOR_BLUE 1 +# define COLOR_GREEN 2 +# define COLOR_RED 4 +#endif + +#define COLOR_CYAN (COLOR_BLUE | COLOR_GREEN) +#define COLOR_MAGENTA (COLOR_RED | COLOR_BLUE) +#define COLOR_YELLOW (COLOR_RED | COLOR_GREEN) + +#define COLOR_WHITE 7 + +/*---------------------------------------------------------------------- + * + * Function and Keypad Key Definitions. + * Many are just for compatibility. 
+ * + */ + +#define KEY_CODE_YES 0x100 /* If get_wch() gives a key code */ + +#define KEY_BREAK 0x101 /* Not on PC KBD */ +#define KEY_DOWN 0x102 /* Down arrow key */ +#define KEY_UP 0x103 /* Up arrow key */ +#define KEY_LEFT 0x104 /* Left arrow key */ +#define KEY_RIGHT 0x105 /* Right arrow key */ +#define KEY_HOME 0x106 /* home key */ +#define KEY_BACKSPACE 0x107 /* not on pc */ +#define KEY_F0 0x108 /* function keys; 64 reserved */ + +#define KEY_DL 0x148 /* delete line */ +#define KEY_IL 0x149 /* insert line */ +#define KEY_DC 0x14a /* delete character */ +#define KEY_IC 0x14b /* insert char or enter ins mode */ +#define KEY_EIC 0x14c /* exit insert char mode */ +#define KEY_CLEAR 0x14d /* clear screen */ +#define KEY_EOS 0x14e /* clear to end of screen */ +#define KEY_EOL 0x14f /* clear to end of line */ +#define KEY_SF 0x150 /* scroll 1 line forward */ +#define KEY_SR 0x151 /* scroll 1 line back (reverse) */ +#define KEY_NPAGE 0x152 /* next page */ +#define KEY_PPAGE 0x153 /* previous page */ +#define KEY_STAB 0x154 /* set tab */ +#define KEY_CTAB 0x155 /* clear tab */ +#define KEY_CATAB 0x156 /* clear all tabs */ +#define KEY_ENTER 0x157 /* enter or send (unreliable) */ +#define KEY_SRESET 0x158 /* soft/reset (partial/unreliable) */ +#define KEY_RESET 0x159 /* reset/hard reset (unreliable) */ +#define KEY_PRINT 0x15a /* print/copy */ +#define KEY_LL 0x15b /* home down/bottom (lower left) */ +#define KEY_ABORT 0x15c /* abort/terminate key (any) */ +#define KEY_SHELP 0x15d /* short help */ +#define KEY_LHELP 0x15e /* long help */ +#define KEY_BTAB 0x15f /* Back tab key */ +#define KEY_BEG 0x160 /* beg(inning) key */ +#define KEY_CANCEL 0x161 /* cancel key */ +#define KEY_CLOSE 0x162 /* close key */ +#define KEY_COMMAND 0x163 /* cmd (command) key */ +#define KEY_COPY 0x164 /* copy key */ +#define KEY_CREATE 0x165 /* create key */ +#define KEY_END 0x166 /* end key */ +#define KEY_EXIT 0x167 /* exit key */ +#define KEY_FIND 0x168 /* find key */ +#define KEY_HELP 
0x169 /* help key */ +#define KEY_MARK 0x16a /* mark key */ +#define KEY_MESSAGE 0x16b /* message key */ +#define KEY_MOVE 0x16c /* move key */ +#define KEY_NEXT 0x16d /* next object key */ +#define KEY_OPEN 0x16e /* open key */ +#define KEY_OPTIONS 0x16f /* options key */ +#define KEY_PREVIOUS 0x170 /* previous object key */ +#define KEY_REDO 0x171 /* redo key */ +#define KEY_REFERENCE 0x172 /* ref(erence) key */ +#define KEY_REFRESH 0x173 /* refresh key */ +#define KEY_REPLACE 0x174 /* replace key */ +#define KEY_RESTART 0x175 /* restart key */ +#define KEY_RESUME 0x176 /* resume key */ +#define KEY_SAVE 0x177 /* save key */ +#define KEY_SBEG 0x178 /* shifted beginning key */ +#define KEY_SCANCEL 0x179 /* shifted cancel key */ +#define KEY_SCOMMAND 0x17a /* shifted command key */ +#define KEY_SCOPY 0x17b /* shifted copy key */ +#define KEY_SCREATE 0x17c /* shifted create key */ +#define KEY_SDC 0x17d /* shifted delete char key */ +#define KEY_SDL 0x17e /* shifted delete line key */ +#define KEY_SELECT 0x17f /* select key */ +#define KEY_SEND 0x180 /* shifted end key */ +#define KEY_SEOL 0x181 /* shifted clear line key */ +#define KEY_SEXIT 0x182 /* shifted exit key */ +#define KEY_SFIND 0x183 /* shifted find key */ +#define KEY_SHOME 0x184 /* shifted home key */ +#define KEY_SIC 0x185 /* shifted input key */ + +#define KEY_SLEFT 0x187 /* shifted left arrow key */ +#define KEY_SMESSAGE 0x188 /* shifted message key */ +#define KEY_SMOVE 0x189 /* shifted move key */ +#define KEY_SNEXT 0x18a /* shifted next key */ +#define KEY_SOPTIONS 0x18b /* shifted options key */ +#define KEY_SPREVIOUS 0x18c /* shifted prev key */ +#define KEY_SPRINT 0x18d /* shifted print key */ +#define KEY_SREDO 0x18e /* shifted redo key */ +#define KEY_SREPLACE 0x18f /* shifted replace key */ +#define KEY_SRIGHT 0x190 /* shifted right arrow */ +#define KEY_SRSUME 0x191 /* shifted resume key */ +#define KEY_SSAVE 0x192 /* shifted save key */ +#define KEY_SSUSPEND 0x193 /* shifted suspend key 
*/ +#define KEY_SUNDO 0x194 /* shifted undo key */ +#define KEY_SUSPEND 0x195 /* suspend key */ +#define KEY_UNDO 0x196 /* undo key */ + +/* PDCurses-specific key definitions -- PC only */ + +#define ALT_0 0x197 +#define ALT_1 0x198 +#define ALT_2 0x199 +#define ALT_3 0x19a +#define ALT_4 0x19b +#define ALT_5 0x19c +#define ALT_6 0x19d +#define ALT_7 0x19e +#define ALT_8 0x19f +#define ALT_9 0x1a0 +#define ALT_A 0x1a1 +#define ALT_B 0x1a2 +#define ALT_C 0x1a3 +#define ALT_D 0x1a4 +#define ALT_E 0x1a5 +#define ALT_F 0x1a6 +#define ALT_G 0x1a7 +#define ALT_H 0x1a8 +#define ALT_I 0x1a9 +#define ALT_J 0x1aa +#define ALT_K 0x1ab +#define ALT_L 0x1ac +#define ALT_M 0x1ad +#define ALT_N 0x1ae +#define ALT_O 0x1af +#define ALT_P 0x1b0 +#define ALT_Q 0x1b1 +#define ALT_R 0x1b2 +#define ALT_S 0x1b3 +#define ALT_T 0x1b4 +#define ALT_U 0x1b5 +#define ALT_V 0x1b6 +#define ALT_W 0x1b7 +#define ALT_X 0x1b8 +#define ALT_Y 0x1b9 +#define ALT_Z 0x1ba + +#define CTL_LEFT 0x1bb /* Control-Left-Arrow */ +#define CTL_RIGHT 0x1bc +#define CTL_PGUP 0x1bd +#define CTL_PGDN 0x1be +#define CTL_HOME 0x1bf +#define CTL_END 0x1c0 + +#define KEY_A1 0x1c1 /* upper left on Virtual keypad */ +#define KEY_A2 0x1c2 /* upper middle on Virt. keypad */ +#define KEY_A3 0x1c3 /* upper right on Vir. keypad */ +#define KEY_B1 0x1c4 /* middle left on Virt. keypad */ +#define KEY_B2 0x1c5 /* center on Virt. keypad */ +#define KEY_B3 0x1c6 /* middle right on Vir. keypad */ +#define KEY_C1 0x1c7 /* lower left on Virt. keypad */ +#define KEY_C2 0x1c8 /* lower middle on Virt. keypad */ +#define KEY_C3 0x1c9 /* lower right on Vir. 
keypad */ + +#define PADSLASH 0x1ca /* slash on keypad */ +#define PADENTER 0x1cb /* enter on keypad */ +#define CTL_PADENTER 0x1cc /* ctl-enter on keypad */ +#define ALT_PADENTER 0x1cd /* alt-enter on keypad */ +#define PADSTOP 0x1ce /* stop on keypad */ +#define PADSTAR 0x1cf /* star on keypad */ +#define PADMINUS 0x1d0 /* minus on keypad */ +#define PADPLUS 0x1d1 /* plus on keypad */ +#define CTL_PADSTOP 0x1d2 /* ctl-stop on keypad */ +#define CTL_PADCENTER 0x1d3 /* ctl-center on keypad */ +#define CTL_PADPLUS 0x1d4 /* ctl-plus on keypad */ +#define CTL_PADMINUS 0x1d5 /* ctl-minus on keypad */ +#define CTL_PADSLASH 0x1d6 /* ctl-slash on keypad */ +#define CTL_PADSTAR 0x1d7 /* ctl-star on keypad */ +#define ALT_PADPLUS 0x1d8 /* alt-plus on keypad */ +#define ALT_PADMINUS 0x1d9 /* alt-minus on keypad */ +#define ALT_PADSLASH 0x1da /* alt-slash on keypad */ +#define ALT_PADSTAR 0x1db /* alt-star on keypad */ +#define ALT_PADSTOP 0x1dc /* alt-stop on keypad */ +#define CTL_INS 0x1dd /* ctl-insert */ +#define ALT_DEL 0x1de /* alt-delete */ +#define ALT_INS 0x1df /* alt-insert */ +#define CTL_UP 0x1e0 /* ctl-up arrow */ +#define CTL_DOWN 0x1e1 /* ctl-down arrow */ +#define CTL_TAB 0x1e2 /* ctl-tab */ +#define ALT_TAB 0x1e3 +#define ALT_MINUS 0x1e4 +#define ALT_EQUAL 0x1e5 +#define ALT_HOME 0x1e6 +#define ALT_PGUP 0x1e7 +#define ALT_PGDN 0x1e8 +#define ALT_END 0x1e9 +#define ALT_UP 0x1ea /* alt-up arrow */ +#define ALT_DOWN 0x1eb /* alt-down arrow */ +#define ALT_RIGHT 0x1ec /* alt-right arrow */ +#define ALT_LEFT 0x1ed /* alt-left arrow */ +#define ALT_ENTER 0x1ee /* alt-enter */ +#define ALT_ESC 0x1ef /* alt-escape */ +#define ALT_BQUOTE 0x1f0 /* alt-back quote */ +#define ALT_LBRACKET 0x1f1 /* alt-left bracket */ +#define ALT_RBRACKET 0x1f2 /* alt-right bracket */ +#define ALT_SEMICOLON 0x1f3 /* alt-semi-colon */ +#define ALT_FQUOTE 0x1f4 /* alt-forward quote */ +#define ALT_COMMA 0x1f5 /* alt-comma */ +#define ALT_STOP 0x1f6 /* alt-stop */ +#define ALT_FSLASH 0x1f7 
/* alt-forward slash */ +#define ALT_BKSP 0x1f8 /* alt-backspace */ +#define CTL_BKSP 0x1f9 /* ctl-backspace */ +#define PAD0 0x1fa /* keypad 0 */ + +#define CTL_PAD0 0x1fb /* ctl-keypad 0 */ +#define CTL_PAD1 0x1fc +#define CTL_PAD2 0x1fd +#define CTL_PAD3 0x1fe +#define CTL_PAD4 0x1ff +#define CTL_PAD5 0x200 +#define CTL_PAD6 0x201 +#define CTL_PAD7 0x202 +#define CTL_PAD8 0x203 +#define CTL_PAD9 0x204 + +#define ALT_PAD0 0x205 /* alt-keypad 0 */ +#define ALT_PAD1 0x206 +#define ALT_PAD2 0x207 +#define ALT_PAD3 0x208 +#define ALT_PAD4 0x209 +#define ALT_PAD5 0x20a +#define ALT_PAD6 0x20b +#define ALT_PAD7 0x20c +#define ALT_PAD8 0x20d +#define ALT_PAD9 0x20e + +#define CTL_DEL 0x20f /* ctl-delete */ +#define ALT_BSLASH 0x210 /* alt-back slash */ +#define CTL_ENTER 0x211 /* ctl-enter */ + +#define SHF_PADENTER 0x212 /* shift-enter on keypad */ +#define SHF_PADSLASH 0x213 /* shift-slash on keypad */ +#define SHF_PADSTAR 0x214 /* shift-star on keypad */ +#define SHF_PADPLUS 0x215 /* shift-plus on keypad */ +#define SHF_PADMINUS 0x216 /* shift-minus on keypad */ +#define SHF_UP 0x217 /* shift-up on keypad */ +#define SHF_DOWN 0x218 /* shift-down on keypad */ +#define SHF_IC 0x219 /* shift-insert on keypad */ +#define SHF_DC 0x21a /* shift-delete on keypad */ + +#define KEY_MOUSE 0x21b /* "mouse" key */ +#define KEY_SHIFT_L 0x21c /* Left-shift */ +#define KEY_SHIFT_R 0x21d /* Right-shift */ +#define KEY_CONTROL_L 0x21e /* Left-control */ +#define KEY_CONTROL_R 0x21f /* Right-control */ +#define KEY_ALT_L 0x220 /* Left-alt */ +#define KEY_ALT_R 0x221 /* Right-alt */ +#define KEY_RESIZE 0x222 /* Window resize */ +#define KEY_SUP 0x223 /* Shifted up arrow */ +#define KEY_SDOWN 0x224 /* Shifted down arrow */ + +#define KEY_MIN KEY_BREAK /* Minimum curses key value */ +#define KEY_MAX KEY_SDOWN /* Maximum curses key */ + +#define KEY_F(n) (KEY_F0 + (n)) + +/*---------------------------------------------------------------------- + * + * PDCurses Function Declarations + * + 
*/ + +/* Standard */ + +int addch(const chtype); +int addchnstr(const chtype *, int); +int addchstr(const chtype *); +int addnstr(const char *, int); +int addstr(const char *); +int attroff(chtype); +int attron(chtype); +int attrset(chtype); +int attr_get(attr_t *, short *, void *); +int attr_off(attr_t, void *); +int attr_on(attr_t, void *); +int attr_set(attr_t, short, void *); +int baudrate(void); +int beep(void); +int bkgd(chtype); +void bkgdset(chtype); +int border(chtype, chtype, chtype, chtype, chtype, chtype, chtype, chtype); +int box(WINDOW *, chtype, chtype); +bool can_change_color(void); +int cbreak(void); +int chgat(int, attr_t, short, const void *); +int clearok(WINDOW *, bool); +int clear(void); +int clrtobot(void); +int clrtoeol(void); +int color_content(short, short *, short *, short *); +int color_set(short, void *); +int copywin(const WINDOW *, WINDOW *, int, int, int, int, int, int, int); +int curs_set(int); +int def_prog_mode(void); +int def_shell_mode(void); +int delay_output(int); +int delch(void); +int deleteln(void); +void delscreen(SCREEN *); +int delwin(WINDOW *); +WINDOW *derwin(WINDOW *, int, int, int, int); +int doupdate(void); +WINDOW *dupwin(WINDOW *); +int echochar(const chtype); +int echo(void); +int endwin(void); +char erasechar(void); +int erase(void); +void filter(void); +int flash(void); +int flushinp(void); +chtype getbkgd(WINDOW *); +int getnstr(char *, int); +int getstr(char *); +WINDOW *getwin(FILE *); +int halfdelay(int); +bool has_colors(void); +bool has_ic(void); +bool has_il(void); +int hline(chtype, int); +void idcok(WINDOW *, bool); +int idlok(WINDOW *, bool); +void immedok(WINDOW *, bool); +int inchnstr(chtype *, int); +int inchstr(chtype *); +chtype inch(void); +int init_color(short, short, short, short); +int init_pair(short, short, short); +WINDOW *initscr(void); +int innstr(char *, int); +int insch(chtype); +int insdelln(int); +int insertln(void); +int insnstr(const char *, int); +int insstr(const char *); +int 
instr(char *); +int intrflush(WINDOW *, bool); +bool isendwin(void); +bool is_linetouched(WINDOW *, int); +bool is_wintouched(WINDOW *); +char *keyname(int); +int keypad(WINDOW *, bool); +char killchar(void); +int leaveok(WINDOW *, bool); +char *longname(void); +int meta(WINDOW *, bool); +int move(int, int); +int mvaddch(int, int, const chtype); +int mvaddchnstr(int, int, const chtype *, int); +int mvaddchstr(int, int, const chtype *); +int mvaddnstr(int, int, const char *, int); +int mvaddstr(int, int, const char *); +int mvchgat(int, int, int, attr_t, short, const void *); +int mvcur(int, int, int, int); +int mvdelch(int, int); +int mvderwin(WINDOW *, int, int); +int mvgetch(int, int); +int mvgetnstr(int, int, char *, int); +int mvgetstr(int, int, char *); +int mvhline(int, int, chtype, int); +chtype mvinch(int, int); +int mvinchnstr(int, int, chtype *, int); +int mvinchstr(int, int, chtype *); +int mvinnstr(int, int, char *, int); +int mvinsch(int, int, chtype); +int mvinsnstr(int, int, const char *, int); +int mvinsstr(int, int, const char *); +int mvinstr(int, int, char *); +int mvprintw(int, int, const char *, ...); +int mvscanw(int, int, const char *, ...); +int mvvline(int, int, chtype, int); +int mvwaddchnstr(WINDOW *, int, int, const chtype *, int); +int mvwaddchstr(WINDOW *, int, int, const chtype *); +int mvwaddch(WINDOW *, int, int, const chtype); +int mvwaddnstr(WINDOW *, int, int, const char *, int); +int mvwaddstr(WINDOW *, int, int, const char *); +int mvwchgat(WINDOW *, int, int, int, attr_t, short, const void *); +int mvwdelch(WINDOW *, int, int); +int mvwgetch(WINDOW *, int, int); +int mvwgetnstr(WINDOW *, int, int, char *, int); +int mvwgetstr(WINDOW *, int, int, char *); +int mvwhline(WINDOW *, int, int, chtype, int); +int mvwinchnstr(WINDOW *, int, int, chtype *, int); +int mvwinchstr(WINDOW *, int, int, chtype *); +chtype mvwinch(WINDOW *, int, int); +int mvwinnstr(WINDOW *, int, int, char *, int); +int mvwinsch(WINDOW *, int, int, chtype); 
+int mvwinsnstr(WINDOW *, int, int, const char *, int); +int mvwinsstr(WINDOW *, int, int, const char *); +int mvwinstr(WINDOW *, int, int, char *); +int mvwin(WINDOW *, int, int); +int mvwprintw(WINDOW *, int, int, const char *, ...); +int mvwscanw(WINDOW *, int, int, const char *, ...); +int mvwvline(WINDOW *, int, int, chtype, int); +int napms(int); +WINDOW *newpad(int, int); +SCREEN *newterm(const char *, FILE *, FILE *); +WINDOW *newwin(int, int, int, int); +int nl(void); +int nocbreak(void); +int nodelay(WINDOW *, bool); +int noecho(void); +int nonl(void); +void noqiflush(void); +int noraw(void); +int notimeout(WINDOW *, bool); +int overlay(const WINDOW *, WINDOW *); +int overwrite(const WINDOW *, WINDOW *); +int pair_content(short, short *, short *); +int pechochar(WINDOW *, chtype); +int pnoutrefresh(WINDOW *, int, int, int, int, int, int); +int prefresh(WINDOW *, int, int, int, int, int, int); +int printw(const char *, ...); +int putwin(WINDOW *, FILE *); +void qiflush(void); +int raw(void); +int redrawwin(WINDOW *); +int refresh(void); +int reset_prog_mode(void); +int reset_shell_mode(void); +int resetty(void); +int ripoffline(int, int (*)(WINDOW *, int)); +int savetty(void); +int scanw(const char *, ...); +int scr_dump(const char *); +int scr_init(const char *); +int scr_restore(const char *); +int scr_set(const char *); +int scrl(int); +int scroll(WINDOW *); +int scrollok(WINDOW *, bool); +SCREEN *set_term(SCREEN *); +int setscrreg(int, int); +int slk_attroff(const chtype); +int slk_attr_off(const attr_t, void *); +int slk_attron(const chtype); +int slk_attr_on(const attr_t, void *); +int slk_attrset(const chtype); +int slk_attr_set(const attr_t, short, void *); +int slk_clear(void); +int slk_color(short); +int slk_init(int); +char *slk_label(int); +int slk_noutrefresh(void); +int slk_refresh(void); +int slk_restore(void); +int slk_set(int, const char *, int); +int slk_touch(void); +int standend(void); +int standout(void); +int start_color(void); 
+WINDOW *subpad(WINDOW *, int, int, int, int); +WINDOW *subwin(WINDOW *, int, int, int, int); +int syncok(WINDOW *, bool); +chtype termattrs(void); +attr_t term_attrs(void); +char *termname(void); +void timeout(int); +int touchline(WINDOW *, int, int); +int touchwin(WINDOW *); +int typeahead(int); +int untouchwin(WINDOW *); +void use_env(bool); +int vidattr(chtype); +int vid_attr(attr_t, short, void *); +int vidputs(chtype, int (*)(int)); +int vid_puts(attr_t, short, void *, int (*)(int)); +int vline(chtype, int); +int vw_printw(WINDOW *, const char *, va_list); +int vwprintw(WINDOW *, const char *, va_list); +int vw_scanw(WINDOW *, const char *, va_list); +int vwscanw(WINDOW *, const char *, va_list); +int waddchnstr(WINDOW *, const chtype *, int); +int waddchstr(WINDOW *, const chtype *); +int waddch(WINDOW *, const chtype); +int waddnstr(WINDOW *, const char *, int); +int waddstr(WINDOW *, const char *); +int wattroff(WINDOW *, chtype); +int wattron(WINDOW *, chtype); +int wattrset(WINDOW *, chtype); +int wattr_get(WINDOW *, attr_t *, short *, void *); +int wattr_off(WINDOW *, attr_t, void *); +int wattr_on(WINDOW *, attr_t, void *); +int wattr_set(WINDOW *, attr_t, short, void *); +void wbkgdset(WINDOW *, chtype); +int wbkgd(WINDOW *, chtype); +int wborder(WINDOW *, chtype, chtype, chtype, chtype, + chtype, chtype, chtype, chtype); +int wchgat(WINDOW *, int, attr_t, short, const void *); +int wclear(WINDOW *); +int wclrtobot(WINDOW *); +int wclrtoeol(WINDOW *); +int wcolor_set(WINDOW *, short, void *); +void wcursyncup(WINDOW *); +int wdelch(WINDOW *); +int wdeleteln(WINDOW *); +int wechochar(WINDOW *, const chtype); +int werase(WINDOW *); +int wgetch(WINDOW *); +int wgetnstr(WINDOW *, char *, int); +int wgetstr(WINDOW *, char *); +int whline(WINDOW *, chtype, int); +int winchnstr(WINDOW *, chtype *, int); +int winchstr(WINDOW *, chtype *); +chtype winch(WINDOW *); +int winnstr(WINDOW *, char *, int); +int winsch(WINDOW *, chtype); +int winsdelln(WINDOW *, 
int); +int winsertln(WINDOW *); +int winsnstr(WINDOW *, const char *, int); +int winsstr(WINDOW *, const char *); +int winstr(WINDOW *, char *); +int wmove(WINDOW *, int, int); +int wnoutrefresh(WINDOW *); +int wprintw(WINDOW *, const char *, ...); +int wredrawln(WINDOW *, int, int); +int wrefresh(WINDOW *); +int wscanw(WINDOW *, const char *, ...); +int wscrl(WINDOW *, int); +int wsetscrreg(WINDOW *, int, int); +int wstandend(WINDOW *); +int wstandout(WINDOW *); +void wsyncdown(WINDOW *); +void wsyncup(WINDOW *); +void wtimeout(WINDOW *, int); +int wtouchln(WINDOW *, int, int, int); +int wvline(WINDOW *, chtype, int); + +/* Wide-character functions */ + +#ifdef PDC_WIDE +int addnwstr(const wchar_t *, int); +int addwstr(const wchar_t *); +int add_wch(const cchar_t *); +int add_wchnstr(const cchar_t *, int); +int add_wchstr(const cchar_t *); +int border_set(const cchar_t *, const cchar_t *, const cchar_t *, + const cchar_t *, const cchar_t *, const cchar_t *, + const cchar_t *, const cchar_t *); +int box_set(WINDOW *, const cchar_t *, const cchar_t *); +int echo_wchar(const cchar_t *); +int erasewchar(wchar_t *); +int getbkgrnd(cchar_t *); +int getcchar(const cchar_t *, wchar_t *, attr_t *, short *, void *); +int getn_wstr(wint_t *, int); +int get_wch(wint_t *); +int get_wstr(wint_t *); +int hline_set(const cchar_t *, int); +int innwstr(wchar_t *, int); +int ins_nwstr(const wchar_t *, int); +int ins_wch(const cchar_t *); +int ins_wstr(const wchar_t *); +int inwstr(wchar_t *); +int in_wch(cchar_t *); +int in_wchnstr(cchar_t *, int); +int in_wchstr(cchar_t *); +char *key_name(wchar_t); +int killwchar(wchar_t *); +int mvaddnwstr(int, int, const wchar_t *, int); +int mvaddwstr(int, int, const wchar_t *); +int mvadd_wch(int, int, const cchar_t *); +int mvadd_wchnstr(int, int, const cchar_t *, int); +int mvadd_wchstr(int, int, const cchar_t *); +int mvgetn_wstr(int, int, wint_t *, int); +int mvget_wch(int, int, wint_t *); +int mvget_wstr(int, int, wint_t *); +int 
mvhline_set(int, int, const cchar_t *, int); +int mvinnwstr(int, int, wchar_t *, int); +int mvins_nwstr(int, int, const wchar_t *, int); +int mvins_wch(int, int, const cchar_t *); +int mvins_wstr(int, int, const wchar_t *); +int mvinwstr(int, int, wchar_t *); +int mvin_wch(int, int, cchar_t *); +int mvin_wchnstr(int, int, cchar_t *, int); +int mvin_wchstr(int, int, cchar_t *); +int mvvline_set(int, int, const cchar_t *, int); +int mvwaddnwstr(WINDOW *, int, int, const wchar_t *, int); +int mvwaddwstr(WINDOW *, int, int, const wchar_t *); +int mvwadd_wch(WINDOW *, int, int, const cchar_t *); +int mvwadd_wchnstr(WINDOW *, int, int, const cchar_t *, int); +int mvwadd_wchstr(WINDOW *, int, int, const cchar_t *); +int mvwgetn_wstr(WINDOW *, int, int, wint_t *, int); +int mvwget_wch(WINDOW *, int, int, wint_t *); +int mvwget_wstr(WINDOW *, int, int, wint_t *); +int mvwhline_set(WINDOW *, int, int, const cchar_t *, int); +int mvwinnwstr(WINDOW *, int, int, wchar_t *, int); +int mvwins_nwstr(WINDOW *, int, int, const wchar_t *, int); +int mvwins_wch(WINDOW *, int, int, const cchar_t *); +int mvwins_wstr(WINDOW *, int, int, const wchar_t *); +int mvwin_wch(WINDOW *, int, int, cchar_t *); +int mvwin_wchnstr(WINDOW *, int, int, cchar_t *, int); +int mvwin_wchstr(WINDOW *, int, int, cchar_t *); +int mvwinwstr(WINDOW *, int, int, wchar_t *); +int mvwvline_set(WINDOW *, int, int, const cchar_t *, int); +int pecho_wchar(WINDOW *, const cchar_t*); +int setcchar(cchar_t*, const wchar_t*, const attr_t, short, const void*); +int slk_wset(int, const wchar_t *, int); +int unget_wch(const wchar_t); +int vline_set(const cchar_t *, int); +int waddnwstr(WINDOW *, const wchar_t *, int); +int waddwstr(WINDOW *, const wchar_t *); +int wadd_wch(WINDOW *, const cchar_t *); +int wadd_wchnstr(WINDOW *, const cchar_t *, int); +int wadd_wchstr(WINDOW *, const cchar_t *); +int wbkgrnd(WINDOW *, const cchar_t *); +void wbkgrndset(WINDOW *, const cchar_t *); +int wborder_set(WINDOW *, const cchar_t *, 
const cchar_t *, + const cchar_t *, const cchar_t *, const cchar_t *, + const cchar_t *, const cchar_t *, const cchar_t *); +int wecho_wchar(WINDOW *, const cchar_t *); +int wgetbkgrnd(WINDOW *, cchar_t *); +int wgetn_wstr(WINDOW *, wint_t *, int); +int wget_wch(WINDOW *, wint_t *); +int wget_wstr(WINDOW *, wint_t *); +int whline_set(WINDOW *, const cchar_t *, int); +int winnwstr(WINDOW *, wchar_t *, int); +int wins_nwstr(WINDOW *, const wchar_t *, int); +int wins_wch(WINDOW *, const cchar_t *); +int wins_wstr(WINDOW *, const wchar_t *); +int winwstr(WINDOW *, wchar_t *); +int win_wch(WINDOW *, cchar_t *); +int win_wchnstr(WINDOW *, cchar_t *, int); +int win_wchstr(WINDOW *, cchar_t *); +wchar_t *wunctrl(cchar_t *); +int wvline_set(WINDOW *, const cchar_t *, int); +#endif + +/* Quasi-standard */ + +chtype getattrs(WINDOW *); +int getbegx(WINDOW *); +int getbegy(WINDOW *); +int getmaxx(WINDOW *); +int getmaxy(WINDOW *); +int getparx(WINDOW *); +int getpary(WINDOW *); +int getcurx(WINDOW *); +int getcury(WINDOW *); +void traceoff(void); +void traceon(void); +char *unctrl(chtype); + +int crmode(void); +int nocrmode(void); +int draino(int); +int resetterm(void); +int fixterm(void); +int saveterm(void); +int setsyx(int, int); + +int mouse_set(unsigned long); +int mouse_on(unsigned long); +int mouse_off(unsigned long); +int request_mouse_pos(void); +int map_button(unsigned long); +void wmouse_position(WINDOW *, int *, int *); +unsigned long getmouse(void); +unsigned long getbmap(void); + +/* ncurses */ + +int assume_default_colors(int, int); +const char *curses_version(void); +bool has_key(int); +int use_default_colors(void); +int wresize(WINDOW *, int, int); + +int mouseinterval(int); +mmask_t mousemask(mmask_t, mmask_t *); +bool mouse_trafo(int *, int *, bool); +int nc_getmouse(MEVENT *); +int ungetmouse(MEVENT *); +bool wenclose(const WINDOW *, int, int); +bool wmouse_trafo(const WINDOW *, int *, int *, bool); + +/* PDCurses */ + +int addrawch(chtype); +int 
insrawch(chtype); +bool is_termresized(void); +int mvaddrawch(int, int, chtype); +int mvdeleteln(int, int); +int mvinsertln(int, int); +int mvinsrawch(int, int, chtype); +int mvwaddrawch(WINDOW *, int, int, chtype); +int mvwdeleteln(WINDOW *, int, int); +int mvwinsertln(WINDOW *, int, int); +int mvwinsrawch(WINDOW *, int, int, chtype); +int raw_output(bool); +int resize_term(int, int); +WINDOW *resize_window(WINDOW *, int, int); +int waddrawch(WINDOW *, chtype); +int winsrawch(WINDOW *, chtype); +char wordchar(void); + +#ifdef PDC_WIDE +wchar_t *slk_wlabel(int); +#endif + +void PDC_debug(const char *, ...); +int PDC_ungetch(int); +int PDC_set_blink(bool); +int PDC_set_line_color(short); +void PDC_set_title(const char *); + +int PDC_clearclipboard(void); +int PDC_freeclipboard(char *); +int PDC_getclipboard(char **, long *); +int PDC_setclipboard(const char *, long); + +unsigned long PDC_get_input_fd(void); +unsigned long PDC_get_key_modifiers(void); +int PDC_return_key_modifiers(bool); +int PDC_save_key_modifiers(bool); + +#ifdef XCURSES +WINDOW *Xinitscr(int, char **); +void XCursesExit(void); +int sb_init(void); +int sb_set_horz(int, int, int); +int sb_set_vert(int, int, int); +int sb_get_horz(int *, int *, int *); +int sb_get_vert(int *, int *, int *); +int sb_refresh(void); +#endif + +/*** Functions defined as macros ***/ + +/* getch() and ungetch() conflict with some DOS libraries */ + +#define getch() wgetch(stdscr) +#define ungetch(ch) PDC_ungetch(ch) + +#define COLOR_PAIR(n) (((chtype)(n) << PDC_COLOR_SHIFT) & A_COLOR) +#define PAIR_NUMBER(n) (((n) & A_COLOR) >> PDC_COLOR_SHIFT) + +/* These will _only_ work as macros */ + +#define getbegyx(w, y, x) (y = getbegy(w), x = getbegx(w)) +#define getmaxyx(w, y, x) (y = getmaxy(w), x = getmaxx(w)) +#define getparyx(w, y, x) (y = getpary(w), x = getparx(w)) +#define getyx(w, y, x) (y = getcury(w), x = getcurx(w)) + +#define getsyx(y, x) { if (curscr->_leaveit) (y)=(x)=-1; \ + else getyx(curscr,(y),(x)); } + +#ifdef 
NCURSES_MOUSE_VERSION +# define getmouse(x) nc_getmouse(x) /* with NCURSES_MOUSE_VERSION defined, getmouse(x) forwards to nc_getmouse(x) */ +#endif + +/* return codes from PDC_getclipboard() and PDC_setclipboard() calls */ + +#define PDC_CLIP_SUCCESS 0 +#define PDC_CLIP_ACCESS_ERROR 1 +#define PDC_CLIP_EMPTY 2 +#define PDC_CLIP_MEMORY_ERROR 3 + +/* PDCurses key modifier masks (single-bit values: 1, 2, 4, 8) */ + +#define PDC_KEY_MODIFIER_SHIFT 1 +#define PDC_KEY_MODIFIER_CONTROL 2 +#define PDC_KEY_MODIFIER_ALT 4 +#define PDC_KEY_MODIFIER_NUMLOCK 8 + +#if defined(__cplusplus) || defined(__cplusplus__) || defined(__CPLUSPLUS) +# undef bool +} /* NOTE(review): presumably closes an extern "C" block opened earlier in this header (outside this view) -- confirm */ +#endif + +#endif /* __PDCURSES__ */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/win32/zconf.h Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,332 @@ +/* zconf.h -- configuration of the zlib compression library + * Copyright (C) 1995-2005 Jean-loup Gailly. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* @(#) $Id$ */ + +#ifndef ZCONF_H +#define ZCONF_H + +/* + * If you *really* need a unique prefix for all types and library functions, + * compile with -DZ_PREFIX. The "standard" zlib should be compiled without it. + */ +#ifdef Z_PREFIX +# define deflateInit_ z_deflateInit_ +# define deflate z_deflate +# define deflateEnd z_deflateEnd +# define inflateInit_ z_inflateInit_ +# define inflate z_inflate +# define inflateEnd z_inflateEnd +# define deflateInit2_ z_deflateInit2_ +# define deflateSetDictionary z_deflateSetDictionary +# define deflateCopy z_deflateCopy +# define deflateReset z_deflateReset +# define deflateParams z_deflateParams +# define deflateBound z_deflateBound +# define deflatePrime z_deflatePrime +# define inflateInit2_ z_inflateInit2_ +# define inflateSetDictionary z_inflateSetDictionary +# define inflateSync z_inflateSync +# define inflateSyncPoint z_inflateSyncPoint +# define inflateCopy z_inflateCopy +# define inflateReset z_inflateReset +# define inflateBack z_inflateBack +# define inflateBackEnd z_inflateBackEnd +# define compress z_compress +# define compress2 z_compress2 +# define compressBound z_compressBound +# define uncompress z_uncompress +# define adler32 z_adler32 +# define crc32 z_crc32 +# define get_crc_table z_get_crc_table +# define zError z_zError + +# define alloc_func z_alloc_func +# define free_func z_free_func +# define in_func z_in_func +# define out_func z_out_func +# define Byte z_Byte +# define uInt z_uInt +# define uLong z_uLong +# define Bytef z_Bytef +# define charf z_charf +# define intf z_intf +# define uIntf z_uIntf +# define uLongf z_uLongf +# define voidpf z_voidpf +# define voidp 
z_voidp +#endif + +#if defined(__MSDOS__) && !defined(MSDOS) +# define MSDOS +#endif +#if (defined(OS_2) || defined(__OS2__)) && !defined(OS2) +# define OS2 +#endif +#if defined(_WINDOWS) && !defined(WINDOWS) +# define WINDOWS +#endif +#if defined(_WIN32) || defined(_WIN32_WCE) || defined(__WIN32__) +# ifndef WIN32 +# define WIN32 +# endif +#endif +#if (defined(MSDOS) || defined(OS2) || defined(WINDOWS)) && !defined(WIN32) +# if !defined(__GNUC__) && !defined(__FLAT__) && !defined(__386__) +# ifndef SYS16BIT +# define SYS16BIT +# endif +# endif +#endif + +/* + * Compile with -DMAXSEG_64K if the alloc function cannot allocate more + * than 64k bytes at a time (needed on systems with 16-bit int). + */ +#ifdef SYS16BIT +# define MAXSEG_64K +#endif +#ifdef MSDOS +# define UNALIGNED_OK +#endif + +#ifdef __STDC_VERSION__ +# ifndef STDC +# define STDC +# endif +# if __STDC_VERSION__ >= 199901L +# ifndef STDC99 +# define STDC99 +# endif +# endif +#endif +#if !defined(STDC) && (defined(__STDC__) || defined(__cplusplus)) +# define STDC +#endif +#if !defined(STDC) && (defined(__GNUC__) || defined(__BORLANDC__)) +# define STDC +#endif +#if !defined(STDC) && (defined(MSDOS) || defined(WINDOWS) || defined(WIN32)) +# define STDC +#endif +#if !defined(STDC) && (defined(OS2) || defined(__HOS_AIX__)) +# define STDC +#endif + +#if defined(__OS400__) && !defined(STDC) /* iSeries (formerly AS/400). 
*/ +# define STDC +#endif + +#ifndef STDC +# ifndef const /* cannot use !defined(STDC) && !defined(const) on Mac */ +# define const /* note: need a more gentle solution here */ +# endif +#endif + +/* Some Mac compilers merge all .h files incorrectly: */ +#if defined(__MWERKS__)||defined(applec)||defined(THINK_C)||defined(__SC__) +# define NO_DUMMY_DECL +#endif + +/* Maximum value for memLevel in deflateInit2 */ +#ifndef MAX_MEM_LEVEL +# ifdef MAXSEG_64K +# define MAX_MEM_LEVEL 8 +# else +# define MAX_MEM_LEVEL 9 +# endif +#endif + +/* Maximum value for windowBits in deflateInit2 and inflateInit2. + * WARNING: reducing MAX_WBITS makes minigzip unable to extract .gz files + * created by gzip. (Files created by minigzip can still be extracted by + * gzip.) + */ +#ifndef MAX_WBITS +# define MAX_WBITS 15 /* 32K LZ77 window */ +#endif + +/* The memory requirements for deflate are (in bytes): + (1 << (windowBits+2)) + (1 << (memLevel+9)) + that is: 128K for windowBits=15 + 128K for memLevel = 8 (default values) + plus a few kilobytes for small objects. For example, if you want to reduce + the default memory requirements from 256K to 128K, compile with + make CFLAGS="-O -DMAX_WBITS=14 -DMAX_MEM_LEVEL=7" + Of course this will generally degrade compression (there's no free lunch). + + The memory requirements for inflate are (in bytes) 1 << windowBits + that is, 32K for windowBits=15 (default value) plus a few kilobytes + for small objects. +*/ + + /* Type declarations */ + +#ifndef OF /* function prototypes */ +# ifdef STDC +# define OF(args) args +# else +# define OF(args) () +# endif +#endif + +/* The following definitions for FAR are needed only for MSDOS mixed + * model programming (small or medium model with some far allocations). + * This was tested only with MSC; for other MSDOS compilers you may have + * to define NO_MEMCPY in zutil.h. If you don't need the mixed model, + * just define FAR to be empty. 
+ */ +#ifdef SYS16BIT +# if defined(M_I86SM) || defined(M_I86MM) + /* MSC small or medium model */ +# define SMALL_MEDIUM +# ifdef _MSC_VER +# define FAR _far +# else +# define FAR far +# endif +# endif +# if (defined(__SMALL__) || defined(__MEDIUM__)) + /* Turbo C small or medium model */ +# define SMALL_MEDIUM +# ifdef __BORLANDC__ +# define FAR _far +# else +# define FAR far +# endif +# endif +#endif + +#if defined(WINDOWS) || defined(WIN32) + /* If building or using zlib as a DLL, define ZLIB_DLL. + * This is not mandatory, but it offers a little performance increase. + */ +# ifdef ZLIB_DLL +# if defined(WIN32) && (!defined(__BORLANDC__) || (__BORLANDC__ >= 0x500)) +# ifdef ZLIB_INTERNAL +# define ZEXTERN extern __declspec(dllexport) +# else +# define ZEXTERN extern __declspec(dllimport) +# endif +# endif +# endif /* ZLIB_DLL */ + /* If building or using zlib with the WINAPI/WINAPIV calling convention, + * define ZLIB_WINAPI. + * Caution: the standard ZLIB1.DLL is NOT compiled using ZLIB_WINAPI. + */ +# ifdef ZLIB_WINAPI +# ifdef FAR +# undef FAR +# endif +# include <windows.h> + /* No need for _export, use ZLIB.DEF instead. */ + /* For complete Windows compatibility, use WINAPI, not __stdcall. 
*/ +# define ZEXPORT WINAPI +# ifdef WIN32 +# define ZEXPORTVA WINAPIV +# else +# define ZEXPORTVA FAR CDECL +# endif +# endif +#endif + +#if defined (__BEOS__) +# ifdef ZLIB_DLL +# ifdef ZLIB_INTERNAL +# define ZEXPORT __declspec(dllexport) +# define ZEXPORTVA __declspec(dllexport) +# else +# define ZEXPORT __declspec(dllimport) +# define ZEXPORTVA __declspec(dllimport) +# endif +# endif +#endif + +#ifndef ZEXTERN +# define ZEXTERN extern +#endif +#ifndef ZEXPORT +# define ZEXPORT +#endif +#ifndef ZEXPORTVA +# define ZEXPORTVA +#endif + +#ifndef FAR +# define FAR +#endif + +#if !defined(__MACTYPES__) +typedef unsigned char Byte; /* 8 bits */ +#endif +typedef unsigned int uInt; /* 16 bits or more */ +typedef unsigned long uLong; /* 32 bits or more */ + +#ifdef SMALL_MEDIUM + /* Borland C/C++ and some old MSC versions ignore FAR inside typedef */ +# define Bytef Byte FAR +#else + typedef Byte FAR Bytef; +#endif +typedef char FAR charf; +typedef int FAR intf; +typedef uInt FAR uIntf; +typedef uLong FAR uLongf; + +#ifdef STDC + typedef void const *voidpc; + typedef void FAR *voidpf; + typedef void *voidp; +#else + typedef Byte const *voidpc; + typedef Byte FAR *voidpf; + typedef Byte *voidp; +#endif + +#if 0 /* HAVE_UNISTD_H -- this line is updated by ./configure */ +# include <sys/types.h> /* for off_t */ +# include <unistd.h> /* for SEEK_* and off_t */ +# ifdef VMS +# include <unixio.h> /* for off_t */ +# endif +# define z_off_t off_t +#endif +#ifndef SEEK_SET +# define SEEK_SET 0 /* Seek from beginning of file. */ +# define SEEK_CUR 1 /* Seek from current position. 
*/ +# define SEEK_END 2 /* Set file pointer to EOF plus "offset" */ +#endif +#ifndef z_off_t +# define z_off_t long +#endif + +#if defined(__OS400__) +# define NO_vsnprintf +#endif + +#if defined(__MVS__) +# define NO_vsnprintf +# ifdef FAR +# undef FAR +# endif +#endif + +/* MVS linker does not support external names larger than 8 bytes */ +#if defined(__MVS__) +# pragma map(deflateInit_,"DEIN") +# pragma map(deflateInit2_,"DEIN2") +# pragma map(deflateEnd,"DEEND") +# pragma map(deflateBound,"DEBND") +# pragma map(inflateInit_,"ININ") +# pragma map(inflateInit2_,"ININ2") +# pragma map(inflateEnd,"INEND") +# pragma map(inflateSync,"INSY") +# pragma map(inflateSetDictionary,"INSEDI") +# pragma map(compressBound,"CMBND") +# pragma map(inflate_table,"INTABL") +# pragma map(inflate_fast,"INFA") +# pragma map(inflate_copyright,"INCOPY") +#endif + +#endif /* ZCONF_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/win32/zlib.h Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,1357 @@ +/* zlib.h -- interface of the 'zlib' general purpose compression library + version 1.2.3, July 18th, 2005 + + Copyright (C) 1995-2005 Jean-loup Gailly and Mark Adler + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + Jean-loup Gailly Mark Adler + jloup@gzip.org madler@alumni.caltech.edu + + + The data format used by the zlib library is described by RFCs (Request for + Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt + (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format). +*/ + +#ifndef ZLIB_H +#define ZLIB_H + +#include "zconf.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZLIB_VERSION "1.2.3" +#define ZLIB_VERNUM 0x1230 + +/* + The 'zlib' compression library provides in-memory compression and + decompression functions, including integrity checks of the uncompressed + data. This version of the library supports only one compression method + (deflation) but other algorithms will be added later and will have the same + stream interface. 
+ + Compression can be done in a single step if the buffers are large + enough (for example if an input file is mmap'ed), or can be done by + repeated calls of the compression function. In the latter case, the + application must provide more input and/or consume the output + (providing more output space) before each call. + + The compressed data format used by default by the in-memory functions is + the zlib format, which is a zlib wrapper documented in RFC 1950, wrapped + around a deflate stream, which is itself documented in RFC 1951. + + The library also supports reading and writing files in gzip (.gz) format + with an interface similar to that of stdio using the functions that start + with "gz". The gzip format is different from the zlib format. gzip is a + gzip wrapper, documented in RFC 1952, wrapped around a deflate stream. + + This library can optionally read and write gzip streams in memory as well. + + The zlib format was designed to be compact and fast for use in memory + and on communications channels. The gzip format was designed for single- + file compression on file systems, has a larger header than zlib to maintain + directory information, and uses a different, slower check method than zlib. + + The library does not install any signal handler. The decoder checks + the consistency of the compressed data, so the library should never + crash even in case of corrupted input. 
+*/ + +typedef voidpf (*alloc_func) OF((voidpf opaque, uInt items, uInt size)); /* allocation callback stored in z_stream.zalloc; receives the stream's opaque value as its first parameter */ +typedef void (*free_func) OF((voidpf opaque, voidpf address)); /* release callback stored in z_stream.zfree; receives the stream's opaque value as its first parameter */ + +struct internal_state; + +typedef struct z_stream_s { /* stream descriptor passed (as z_streamp) to every deflate and inflate routine declared in this header */ + Bytef *next_in; /* next input byte; the application must update it when avail_in has dropped to zero */ + uInt avail_in; /* number of bytes available at next_in */ + uLong total_in; /* total nb of input bytes read so far; can be used for statistics or progress reports */ + + Bytef *next_out; /* next output byte should be put there; the application must update it when avail_out has dropped to zero */ + uInt avail_out; /* remaining free space at next_out */ + uLong total_out; /* total nb of bytes output so far; can be used for statistics or progress reports */ + + char *msg; /* last error message, NULL if no error */ + struct internal_state FAR *state; /* not visible by applications */ + + alloc_func zalloc; /* used to allocate the internal state; must return Z_NULL if there is not enough memory; if left Z_NULL, deflateInit/inflateInit switch to the default allocation functions */ + free_func zfree; /* used to free the internal state; if left Z_NULL, deflateInit/inflateInit switch to the default allocation functions */ + voidpf opaque; /* private data object passed to zalloc and zfree as their first parameter; the library attaches no meaning to it */ + + int data_type; /* best guess about the data type: binary or text (Z_BINARY, Z_TEXT or Z_UNKNOWN); on return from inflate() it instead carries the unused-bit count and block flags described at inflate() */ + uLong adler; /* adler32 value of the uncompressed data (a crc32 instead when a gzip stream is being written or decoded) */ + uLong reserved; /* reserved for future use */ +} z_stream; + +typedef z_stream FAR *z_streamp; + +/* + gzip header information passed to and from zlib routines. See RFC 1952 + for more details on the meanings of these fields. 
+*/ +typedef struct gz_header_s { /* gzip header fields: supplied to deflateSetHeader() when writing, filled in through inflateGetHeader() when reading */ + int text; /* true if compressed data believed to be text (false in the default header used when deflateSetHeader is not called) */ + uLong time; /* modification time (zero in the default header used when deflateSetHeader is not called) */ + int xflags; /* extra flags (not used when writing a gzip file; on write they are set according to the compression level) */ + int os; /* operating system (255, meaning unknown, in the default header) */ + Bytef *extra; /* pointer to extra field or Z_NULL if none */ + uInt extra_len; /* extra field length (valid if extra != Z_NULL) */ + uInt extra_max; /* space at extra (only when reading header) */ + Bytef *name; /* pointer to zero-terminated file name or Z_NULL */ + uInt name_max; /* space at name (only when reading header) */ + Bytef *comment; /* pointer to zero-terminated comment or Z_NULL */ + uInt comm_max; /* space at comment (only when reading header) */ + int hcrc; /* true if there was or will be a header crc */ + int done; /* true when done reading gzip header (not used + when writing a gzip file); per inflateGetHeader(): zero until the header is completed, then one, or -1 if a zlib stream is being decoded */ +} gz_header; + +typedef gz_header FAR *gz_headerp; + +/* + The application must update next_in and avail_in when avail_in has + dropped to zero. It must update next_out and avail_out when avail_out + has dropped to zero. The application must initialize zalloc, zfree and + opaque before calling the init function. All other fields are set by the + compression library and must not be updated by the application. + + The opaque value provided by the application will be passed as the first + parameter for calls of zalloc and zfree. This can be useful for custom + memory management. The compression library attaches no meaning to the + opaque value. + + zalloc must return Z_NULL if there is not enough memory for the object. + If zlib is used in a multi-threaded application, zalloc and zfree must be + thread safe. + + On 16-bit systems, the functions zalloc and zfree must be able to allocate + exactly 65536 bytes, but will not be required to allocate more than this + if the symbol MAXSEG_64K is defined (see zconf.h). 
WARNING: On MSDOS, + pointers returned by zalloc for objects of exactly 65536 bytes *must* + have their offset normalized to zero. The default allocation function + provided by this library ensures this (see zutil.c). To reduce memory + requirements and avoid any allocation of 64K objects, at the expense of + compression ratio, compile the library with -DMAX_WBITS=14 (see zconf.h). + + The fields total_in and total_out can be used for statistics or + progress reports. After compression, total_in holds the total size of + the uncompressed data and may be saved for use in the decompressor + (particularly if the decompressor wants to decompress everything in + a single step). +*/ + + /* constants */ + +#define Z_NO_FLUSH 0 +#define Z_PARTIAL_FLUSH 1 /* will be removed, use Z_SYNC_FLUSH instead */ +#define Z_SYNC_FLUSH 2 +#define Z_FULL_FLUSH 3 +#define Z_FINISH 4 +#define Z_BLOCK 5 +/* Allowed flush values; see deflate() and inflate() below for details */ + +#define Z_OK 0 +#define Z_STREAM_END 1 +#define Z_NEED_DICT 2 +#define Z_ERRNO (-1) +#define Z_STREAM_ERROR (-2) +#define Z_DATA_ERROR (-3) +#define Z_MEM_ERROR (-4) +#define Z_BUF_ERROR (-5) +#define Z_VERSION_ERROR (-6) +/* Return codes for the compression/decompression functions. Negative + * values are errors, positive values are used for special but normal events. 
+ */ + +#define Z_NO_COMPRESSION 0 +#define Z_BEST_SPEED 1 +#define Z_BEST_COMPRESSION 9 +#define Z_DEFAULT_COMPRESSION (-1) +/* compression levels */ + +#define Z_FILTERED 1 +#define Z_HUFFMAN_ONLY 2 +#define Z_RLE 3 +#define Z_FIXED 4 +#define Z_DEFAULT_STRATEGY 0 +/* compression strategy; see deflateInit2() below for details */ + +#define Z_BINARY 0 +#define Z_TEXT 1 +#define Z_ASCII Z_TEXT /* for compatibility with 1.2.2 and earlier */ +#define Z_UNKNOWN 2 +/* Possible values of the data_type field (though see inflate()) */ + +#define Z_DEFLATED 8 +/* The deflate compression method (the only one supported in this version) */ + +#define Z_NULL 0 /* for initializing zalloc, zfree, opaque */ + +#define zlib_version zlibVersion() +/* for compatibility with versions < 1.0.2 */ + + /* basic functions */ + +ZEXTERN const char * ZEXPORT zlibVersion OF((void)); +/* The application can compare zlibVersion and ZLIB_VERSION for consistency. + If the first character differs, the library code actually used is + not compatible with the zlib.h header file used by the application. + This check is automatically made by deflateInit and inflateInit. + */ + +/* +ZEXTERN int ZEXPORT deflateInit OF((z_streamp strm, int level)); + + Initializes the internal stream state for compression. The fields + zalloc, zfree and opaque must be initialized before by the caller. + If zalloc and zfree are set to Z_NULL, deflateInit updates them to + use default allocation functions. + + The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9: + 1 gives best speed, 9 gives best compression, 0 gives no compression at + all (the input data is simply copied a block at a time). + Z_DEFAULT_COMPRESSION requests a default compromise between speed and + compression (currently equivalent to level 6). 
+ + deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_STREAM_ERROR if level is not a valid compression level, + Z_VERSION_ERROR if the zlib library version (zlib_version) is incompatible + with the version assumed by the caller (ZLIB_VERSION). + msg is set to null if there is no error message. deflateInit does not + perform any compression: this will be done by deflate(). +*/ + + +ZEXTERN int ZEXPORT deflate OF((z_streamp strm, int flush)); +/* + deflate compresses as much data as possible, and stops when the input + buffer becomes empty or the output buffer becomes full. It may introduce some + output latency (reading input without producing any output) except when + forced to flush. + + The detailed semantics are as follows. deflate performs one or both of the + following actions: + + - Compress more input starting at next_in and update next_in and avail_in + accordingly. If not all input can be processed (because there is not + enough room in the output buffer), next_in and avail_in are updated and + processing will resume at this point for the next call of deflate(). + + - Provide more output starting at next_out and update next_out and avail_out + accordingly. This action is forced if the parameter flush is non zero. + Forcing flush frequently degrades the compression ratio, so this parameter + should be set only when necessary (in interactive applications). + Some output may be provided even if flush is not set. + + Before the call of deflate(), the application should ensure that at least + one of the actions is possible, by providing more input and/or consuming + more output, and updating avail_in or avail_out accordingly; avail_out + should never be zero before the call. The application can consume the + compressed output when it wants, for example when the output buffer is full + (avail_out == 0), or after each call of deflate(). 
If deflate returns Z_OK + and with zero avail_out, it must be called again after making room in the + output buffer because there might be more output pending. + + Normally the parameter flush is set to Z_NO_FLUSH, which allows deflate to + decide how much data to accumulate before producing output, in order to + maximize compression. + + If the parameter flush is set to Z_SYNC_FLUSH, all pending output is + flushed to the output buffer and the output is aligned on a byte boundary, so + that the decompressor can get all input data available so far. (In particular + avail_in is zero after the call if enough output space has been provided + before the call.) Flushing may degrade compression for some compression + algorithms and so it should be used only when necessary. + + If flush is set to Z_FULL_FLUSH, all output is flushed as with + Z_SYNC_FLUSH, and the compression state is reset so that decompression can + restart from this point if previous compressed data has been damaged or if + random access is desired. Using Z_FULL_FLUSH too often can seriously degrade + compression. + + If deflate returns with avail_out == 0, this function must be called again + with the same value of the flush parameter and more output space (updated + avail_out), until the flush is complete (deflate returns with non-zero + avail_out). In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that + avail_out is greater than six to avoid repeated flush markers due to + avail_out == 0 on return. + + If the parameter flush is set to Z_FINISH, pending input is processed, + pending output is flushed and deflate returns with Z_STREAM_END if there + was enough output space; if deflate returns with Z_OK, this function must be + called again with Z_FINISH and more output space (updated avail_out) but no + more input data, until it returns with Z_STREAM_END or an error. After + deflate has returned Z_STREAM_END, the only possible operations on the + stream are deflateReset or deflateEnd. 
+ + Z_FINISH can be used immediately after deflateInit if all the compression + is to be done in a single step. In this case, avail_out must be at least + the value returned by deflateBound (see below). If deflate does not return + Z_STREAM_END, then it must be called again as described above. + + deflate() sets strm->adler to the adler32 checksum of all input read + so far (that is, total_in bytes). + + deflate() may update strm->data_type if it can make a good guess about + the input data type (Z_BINARY or Z_TEXT). In doubt, the data is considered + binary. This field is only for information purposes and does not affect + the compression algorithm in any manner. + + deflate() returns Z_OK if some progress has been made (more input + processed or more output produced), Z_STREAM_END if all input has been + consumed and all output has been produced (only when flush is set to + Z_FINISH), Z_STREAM_ERROR if the stream state was inconsistent (for example + if next_in or next_out was NULL), Z_BUF_ERROR if no progress is possible + (for example avail_in or avail_out was zero). Note that Z_BUF_ERROR is not + fatal, and deflate() can be called again with more input and more output + space to continue compressing. +*/ + + +ZEXTERN int ZEXPORT deflateEnd OF((z_streamp strm)); +/* + All dynamically allocated data structures for this stream are freed. + This function discards any unprocessed input and does not flush any + pending output. + + deflateEnd returns Z_OK if success, Z_STREAM_ERROR if the + stream state was inconsistent, Z_DATA_ERROR if the stream was freed + prematurely (some input or output was discarded). In the error case, + msg may be set but then points to a static string (which must not be + deallocated). +*/ + + +/* +ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm)); + + Initializes the internal stream state for decompression. The fields + next_in, avail_in, zalloc, zfree and opaque must be initialized before by + the caller. 
If next_in is not Z_NULL and avail_in is large enough (the exact + value depends on the compression method), inflateInit determines the + compression method from the zlib header and allocates all data structures + accordingly; otherwise the allocation will be deferred to the first call of + inflate. If zalloc and zfree are set to Z_NULL, inflateInit updates them to + use default allocation functions. + + inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_VERSION_ERROR if the zlib library version is incompatible with the + version assumed by the caller. msg is set to null if there is no error + message. inflateInit does not perform any decompression apart from reading + the zlib header if present: this will be done by inflate(). (So next_in and + avail_in may be modified, but next_out and avail_out are unchanged.) +*/ + + +ZEXTERN int ZEXPORT inflate OF((z_streamp strm, int flush)); +/* + inflate decompresses as much data as possible, and stops when the input + buffer becomes empty or the output buffer becomes full. It may introduce + some output latency (reading input without producing any output) except when + forced to flush. + + The detailed semantics are as follows. inflate performs one or both of the + following actions: + + - Decompress more input starting at next_in and update next_in and avail_in + accordingly. If not all input can be processed (because there is not + enough room in the output buffer), next_in is updated and processing + will resume at this point for the next call of inflate(). + + - Provide more output starting at next_out and update next_out and avail_out + accordingly. inflate() provides as much output as possible, until there + is no more input data or no more space in the output buffer (see below + about the flush parameter). 
+ + Before the call of inflate(), the application should ensure that at least + one of the actions is possible, by providing more input and/or consuming + more output, and updating the next_* and avail_* values accordingly. + The application can consume the uncompressed output when it wants, for + example when the output buffer is full (avail_out == 0), or after each + call of inflate(). If inflate returns Z_OK and with zero avail_out, it + must be called again after making room in the output buffer because there + might be more output pending. + + The flush parameter of inflate() can be Z_NO_FLUSH, Z_SYNC_FLUSH, + Z_FINISH, or Z_BLOCK. Z_SYNC_FLUSH requests that inflate() flush as much + output as possible to the output buffer. Z_BLOCK requests that inflate() stop + if and when it gets to the next deflate block boundary. When decoding the + zlib or gzip format, this will cause inflate() to return immediately after + the header and before the first block. When doing a raw inflate, inflate() + will go ahead and process the first block, and will return when it gets to + the end of that block, or when it runs out of data. + + The Z_BLOCK option assists in appending to or combining deflate streams. + Also to assist in this, on return inflate() will set strm->data_type to the + number of unused bits in the last byte taken from strm->next_in, plus 64 + if inflate() is currently decoding the last block in the deflate stream, + plus 128 if inflate() returned immediately after decoding an end-of-block + code or decoding the complete header up to just before the first byte of the + deflate stream. The end-of-block will not be indicated until all of the + uncompressed data from that block has been written to strm->next_out. The + number of unused bits may in general be greater than seven, except when + bit 7 of data_type is set, in which case the number of unused bits will be + less than eight. 
+ + inflate() should normally be called until it returns Z_STREAM_END or an + error. However if all decompression is to be performed in a single step + (a single call of inflate), the parameter flush should be set to + Z_FINISH. In this case all pending input is processed and all pending + output is flushed; avail_out must be large enough to hold all the + uncompressed data. (The size of the uncompressed data may have been saved + by the compressor for this purpose.) The next operation on this stream must + be inflateEnd to deallocate the decompression state. The use of Z_FINISH + is never required, but can be used to inform inflate that a faster approach + may be used for the single inflate() call. + + In this implementation, inflate() always flushes as much output as + possible to the output buffer, and always uses the faster approach on the + first call. So the only effect of the flush parameter in this implementation + is on the return value of inflate(), as noted below, or when it returns early + because Z_BLOCK is used. + + If a preset dictionary is needed after this call (see inflateSetDictionary + below), inflate sets strm->adler to the adler32 checksum of the dictionary + chosen by the compressor and returns Z_NEED_DICT; otherwise it sets + strm->adler to the adler32 checksum of all output produced so far (that is, + total_out bytes) and returns Z_OK, Z_STREAM_END or an error code as described + below. At the end of the stream, inflate() checks that its computed adler32 + checksum is equal to that saved by the compressor and returns Z_STREAM_END + only if the checksum is correct. + + inflate() will decompress and check either zlib-wrapped or gzip-wrapped + deflate data. The header type is detected automatically. 
Any information + contained in the gzip header is not retained, so applications that need that + information should instead use raw inflate, see inflateInit2() below, or + inflateBack() and perform their own processing of the gzip header and + trailer. + + inflate() returns Z_OK if some progress has been made (more input processed + or more output produced), Z_STREAM_END if the end of the compressed data has + been reached and all uncompressed output has been produced, Z_NEED_DICT if a + preset dictionary is needed at this point, Z_DATA_ERROR if the input data was + corrupted (input stream not conforming to the zlib format or incorrect check + value), Z_STREAM_ERROR if the stream structure was inconsistent (for example + if next_in or next_out was NULL), Z_MEM_ERROR if there was not enough memory, + Z_BUF_ERROR if no progress is possible or if there was not enough room in the + output buffer when Z_FINISH is used. Note that Z_BUF_ERROR is not fatal, and + inflate() can be called again with more input and more output space to + continue decompressing. If Z_DATA_ERROR is returned, the application may then + call inflateSync() to look for a good compression block if a partial recovery + of the data is desired. +*/ + + +ZEXTERN int ZEXPORT inflateEnd OF((z_streamp strm)); +/* + All dynamically allocated data structures for this stream are freed. + This function discards any unprocessed input and does not flush any + pending output. + + inflateEnd returns Z_OK if success, Z_STREAM_ERROR if the stream state + was inconsistent. In the error case, msg may be set but then points to a + static string (which must not be deallocated). +*/ + + /* Advanced functions */ + +/* + The following functions are needed only in some special applications. +*/ + +/* +ZEXTERN int ZEXPORT deflateInit2 OF((z_streamp strm, + int level, + int method, + int windowBits, + int memLevel, + int strategy)); + + This is another version of deflateInit with more compression options. 
The + fields next_in, zalloc, zfree and opaque must be initialized before by + the caller. + + The method parameter is the compression method. It must be Z_DEFLATED in + this version of the library. + + The windowBits parameter is the base two logarithm of the window size + (the size of the history buffer). It should be in the range 8..15 for this + version of the library. Larger values of this parameter result in better + compression at the expense of memory usage. The default value is 15 if + deflateInit is used instead. + + windowBits can also be -8..-15 for raw deflate. In this case, -windowBits + determines the window size. deflate() will then generate raw deflate data + with no zlib header or trailer, and will not compute an adler32 check value. + + windowBits can also be greater than 15 for optional gzip encoding. Add + 16 to windowBits to write a simple gzip header and trailer around the + compressed data instead of a zlib wrapper. The gzip header will have no + file name, no extra data, no comment, no modification time (set to zero), + no header crc, and the operating system will be set to 255 (unknown). If a + gzip stream is being written, strm->adler is a crc32 instead of an adler32. + + The memLevel parameter specifies how much memory should be allocated + for the internal compression state. memLevel=1 uses minimum memory but + is slow and reduces compression ratio; memLevel=9 uses maximum memory + for optimal speed. The default value is 8. See zconf.h for total memory + usage as a function of windowBits and memLevel. + + The strategy parameter is used to tune the compression algorithm. Use the + value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a + filter (or predictor), Z_HUFFMAN_ONLY to force Huffman encoding only (no + string match), or Z_RLE to limit match distances to one (run-length + encoding). Filtered data consists mostly of small values with a somewhat + random distribution. 
In this case, the compression algorithm is tuned to + compress them better. The effect of Z_FILTERED is to force more Huffman + coding and less string matching; it is somewhat intermediate between + Z_DEFAULT_STRATEGY and Z_HUFFMAN_ONLY. Z_RLE is designed to be almost as fast as + Z_HUFFMAN_ONLY, but give better compression for PNG image data. The strategy + parameter only affects the compression ratio but not the correctness of the + compressed output even if it is not set appropriately. Z_FIXED prevents the + use of dynamic Huffman codes, allowing for a simpler decoder for special + applications. + + deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_STREAM_ERROR if a parameter is invalid (such as an invalid + method). msg is set to null if there is no error message. deflateInit2 does + not perform any compression: this will be done by deflate(). +*/ + +ZEXTERN int ZEXPORT deflateSetDictionary OF((z_streamp strm, + const Bytef *dictionary, + uInt dictLength)); +/* + Initializes the compression dictionary from the given byte sequence + without producing any compressed output. This function must be called + immediately after deflateInit, deflateInit2 or deflateReset, before any + call of deflate. The compressor and decompressor must use exactly the same + dictionary (see inflateSetDictionary). + + The dictionary should consist of strings (byte sequences) that are likely + to be encountered later in the data to be compressed, with the most commonly + used strings preferably put towards the end of the dictionary. Using a + dictionary is most useful when the data to be compressed is short and can be + predicted with good accuracy; the data can then be compressed better than + with the default empty dictionary. 
+ + Depending on the size of the compression data structures selected by + deflateInit or deflateInit2, a part of the dictionary may in effect be + discarded, for example if the dictionary is larger than the window size provided in + deflateInit or deflateInit2. Thus the strings most likely to be useful should be + put at the end of the dictionary, not at the front. In addition, the + current implementation of deflate will use at most the window size minus + 262 bytes of the provided dictionary. + + Upon return of this function, strm->adler is set to the adler32 value + of the dictionary; the decompressor may later use this value to determine + which dictionary has been used by the compressor. (The adler32 value + applies to the whole dictionary even if only a subset of the dictionary is + actually used by the compressor.) If a raw deflate was requested, then the + adler32 value is not computed and strm->adler is not set. + + deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a + parameter is invalid (such as NULL dictionary) or the stream state is + inconsistent (for example if deflate has already been called for this stream + or if the compression method is bsort). deflateSetDictionary does not + perform any compression: this will be done by deflate(). +*/ + +ZEXTERN int ZEXPORT deflateCopy OF((z_streamp dest, + z_streamp source)); +/* + Sets the destination stream as a complete copy of the source stream. + + This function can be useful when several compression strategies will be + tried, for example when there are several ways of pre-processing the input + data with a filter. The streams that will be discarded should then be freed + by calling deflateEnd. Note that deflateCopy duplicates the internal + compression state which can be quite large, so this strategy is slow and + can consume lots of memory. 
+ + deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_STREAM_ERROR if the source stream state was inconsistent + (such as zalloc being NULL). msg is left unchanged in both source and + destination. +*/ + +ZEXTERN int ZEXPORT deflateReset OF((z_streamp strm)); +/* + This function is equivalent to deflateEnd followed by deflateInit, + but does not free and reallocate all the internal compression state. + The stream will keep the same compression level and any other attributes + that may have been set by deflateInit2. + + deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent (such as zalloc or state being NULL). +*/ + +ZEXTERN int ZEXPORT deflateParams OF((z_streamp strm, + int level, + int strategy)); +/* + Dynamically update the compression level and compression strategy. The + interpretation of level and strategy is as in deflateInit2. This can be + used to switch between compression and straight copy of the input data, or + to switch to a different kind of input data requiring a different + strategy. If the compression level is changed, the input available so far + is compressed with the old level (and may be flushed); the new level will + take effect only at the next call of deflate(). + + Before the call of deflateParams, the stream state must be set as for + a call of deflate(), since the currently available input may have to + be compressed and flushed. In particular, strm->avail_out must be non-zero. + + deflateParams returns Z_OK if success, Z_STREAM_ERROR if the source + stream state was inconsistent or if a parameter was invalid, Z_BUF_ERROR + if strm->avail_out was zero. +*/ + +ZEXTERN int ZEXPORT deflateTune OF((z_streamp strm, + int good_length, + int max_lazy, + int nice_length, + int max_chain)); +/* + Fine tune deflate's internal compression parameters. 
This should only be + used by someone who understands the algorithm used by zlib's deflate for + searching for the best matching string, and even then only by the most + fanatic optimizer trying to squeeze out the last compressed bit for their + specific input data. Read the deflate.c source code for the meaning of the + max_lazy, good_length, nice_length, and max_chain parameters. + + deflateTune() can be called after deflateInit() or deflateInit2(), and + returns Z_OK on success, or Z_STREAM_ERROR for an invalid deflate stream. + */ + +ZEXTERN uLong ZEXPORT deflateBound OF((z_streamp strm, + uLong sourceLen)); +/* + deflateBound() returns an upper bound on the compressed size after + deflation of sourceLen bytes. It must be called after deflateInit() + or deflateInit2(). This would be used to allocate an output buffer + for deflation in a single pass, and so would be called before deflate(). +*/ + +ZEXTERN int ZEXPORT deflatePrime OF((z_streamp strm, + int bits, + int value)); +/* + deflatePrime() inserts bits in the deflate output stream. The intent + is that this function is used to start off the deflate output with the + bits leftover from a previous deflate stream when appending to it. As such, + this function can only be used for raw deflate, and must be used before the + first deflate() call after a deflateInit2() or deflateReset(). bits must be + less than or equal to 16, and that many of the least significant bits of + value will be inserted in the output. + + deflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +ZEXTERN int ZEXPORT deflateSetHeader OF((z_streamp strm, + gz_headerp head)); +/* + deflateSetHeader() provides gzip header information for when a gzip + stream is requested by deflateInit2(). deflateSetHeader() may be called + after deflateInit2() or deflateReset() and before the first call of + deflate(). 
The text, time, os, extra field, name, and comment information + in the provided gz_header structure are written to the gzip header (xflags is + ignored -- the extra flags are set according to the compression level). The + caller must ensure that, if not Z_NULL, name and comment are terminated with + a zero byte, and that if extra is not Z_NULL, that extra_len bytes are + available there. If hcrc is true, a gzip header crc is included. Note that + the current versions of the command-line version of gzip (up through version + 1.3.x) do not support header crc's, and will report that it is a "multi-part + gzip file" and give up. + + If deflateSetHeader is not used, the default gzip header has text false, + the time set to zero, and os set to 255, with no extra, name, or comment + fields. The gzip header is returned to the default state by deflateReset(). + + deflateSetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +/* +ZEXTERN int ZEXPORT inflateInit2 OF((z_streamp strm, + int windowBits)); + + This is another version of inflateInit with an extra parameter. The + fields next_in, avail_in, zalloc, zfree and opaque must be initialized + before by the caller. + + The windowBits parameter is the base two logarithm of the maximum window + size (the size of the history buffer). It should be in the range 8..15 for + this version of the library. The default value is 15 if inflateInit is used + instead. windowBits must be greater than or equal to the windowBits value + provided to deflateInit2() while compressing, or it must be equal to 15 if + deflateInit2() was not used. If a compressed stream with a larger window + size is given as input, inflate() will return with the error code + Z_DATA_ERROR instead of trying to allocate a larger window. + + windowBits can also be -8..-15 for raw inflate. In this case, -windowBits + determines the window size. 
inflate() will then process raw deflate data, + not looking for a zlib or gzip header, not generating a check value, and not + looking for any check values for comparison at the end of the stream. This + is for use with other formats that use the deflate compressed data format + such as zip. Those formats provide their own check values. If a custom + format is developed using the raw deflate format for compressed data, it is + recommended that a check value such as an adler32 or a crc32 be applied to + the uncompressed data as is done in the zlib, gzip, and zip formats. For + most applications, the zlib format should be used as is. Note that comments + above on the use in deflateInit2() applies to the magnitude of windowBits. + + windowBits can also be greater than 15 for optional gzip decoding. Add + 32 to windowBits to enable zlib and gzip decoding with automatic header + detection, or add 16 to decode only the gzip format (the zlib format will + return a Z_DATA_ERROR). If a gzip stream is being decoded, strm->adler is + a crc32 instead of an adler32. + + inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_STREAM_ERROR if a parameter is invalid (such as a null strm). msg + is set to null if there is no error message. inflateInit2 does not perform + any decompression apart from reading the zlib header if present: this will + be done by inflate(). (So next_in and avail_in may be modified, but next_out + and avail_out are unchanged.) +*/ + +ZEXTERN int ZEXPORT inflateSetDictionary OF((z_streamp strm, + const Bytef *dictionary, + uInt dictLength)); +/* + Initializes the decompression dictionary from the given uncompressed byte + sequence. This function must be called immediately after a call of inflate, + if that call returned Z_NEED_DICT. The dictionary chosen by the compressor + can be determined from the adler32 value returned by that call of inflate. 
+ The compressor and decompressor must use exactly the same dictionary (see + deflateSetDictionary). For raw inflate, this function can be called + immediately after inflateInit2() or inflateReset() and before any call of + inflate() to set the dictionary. The application must insure that the + dictionary that was used for compression is provided. + + inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a + parameter is invalid (such as NULL dictionary) or the stream state is + inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the + expected one (incorrect adler32 value). inflateSetDictionary does not + perform any decompression: this will be done by subsequent calls of + inflate(). +*/ + +ZEXTERN int ZEXPORT inflateSync OF((z_streamp strm)); +/* + Skips invalid compressed data until a full flush point (see above the + description of deflate with Z_FULL_FLUSH) can be found, or until all + available input is skipped. No output is provided. + + inflateSync returns Z_OK if a full flush point has been found, Z_BUF_ERROR + if no more input was provided, Z_DATA_ERROR if no flush point has been found, + or Z_STREAM_ERROR if the stream structure was inconsistent. In the success + case, the application may save the current value of total_in which + indicates where valid compressed data was found. In the error case, the + application may repeatedly call inflateSync, providing more input each time, + until success or end of the input data. +*/ + +ZEXTERN int ZEXPORT inflateCopy OF((z_streamp dest, + z_streamp source)); +/* + Sets the destination stream as a complete copy of the source stream. + + This function can be useful when randomly accessing a large stream. The + first pass through the stream can periodically record the inflate state, + allowing restarting inflate at those points when randomly accessing the + stream.
+ + inflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_STREAM_ERROR if the source stream state was inconsistent + (such as zalloc being NULL). msg is left unchanged in both source and + destination. +*/ + +ZEXTERN int ZEXPORT inflateReset OF((z_streamp strm)); +/* + This function is equivalent to inflateEnd followed by inflateInit, + but does not free and reallocate all the internal decompression state. + The stream will keep attributes that may have been set by inflateInit2. + + inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent (such as zalloc or state being NULL). +*/ + +ZEXTERN int ZEXPORT inflatePrime OF((z_streamp strm, + int bits, + int value)); +/* + This function inserts bits in the inflate input stream. The intent is + that this function is used to start inflating at a bit position in the + middle of a byte. The provided bits will be used before any bytes are used + from next_in. This function should only be used with raw inflate, and + should be used before the first inflate() call after inflateInit2() or + inflateReset(). bits must be less than or equal to 16, and that many of the + least significant bits of value will be inserted in the input. + + inflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +ZEXTERN int ZEXPORT inflateGetHeader OF((z_streamp strm, + gz_headerp head)); +/* + inflateGetHeader() requests that gzip header information be stored in the + provided gz_header structure. inflateGetHeader() may be called after + inflateInit2() or inflateReset(), and before the first call of inflate(). + As inflate() processes the gzip stream, head->done is zero until the header + is completed, at which time head->done is set to one. If a zlib stream is + being decoded, then head->done is set to -1 to indicate that there will be + no gzip header information forthcoming. 
Note that Z_BLOCK can be used to + force inflate() to return immediately after header processing is complete + and before any actual data is decompressed. + + The text, time, xflags, and os fields are filled in with the gzip header + contents. hcrc is set to true if there is a header CRC. (The header CRC + was valid if done is set to one.) If extra is not Z_NULL, then extra_max + contains the maximum number of bytes to write to extra. Once done is true, + extra_len contains the actual extra field length, and extra contains the + extra field, or that field truncated if extra_max is less than extra_len. + If name is not Z_NULL, then up to name_max characters are written there, + terminated with a zero unless the length is greater than name_max. If + comment is not Z_NULL, then up to comm_max characters are written there, + terminated with a zero unless the length is greater than comm_max. When + any of extra, name, or comment are not Z_NULL and the respective field is + not present in the header, then that field is set to Z_NULL to signal its + absence. This allows the use of deflateSetHeader() with the returned + structure to duplicate the header. However if those fields are set to + allocated memory, then the application will need to save those pointers + elsewhere so that they can be eventually freed. + + If inflateGetHeader is not used, then the header information is simply + discarded. The header is always checked for validity, including the header + CRC if present. inflateReset() will reset the process to discard the header + information. The application would need to call inflateGetHeader() again to + retrieve the header from the next gzip stream. + + inflateGetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source + stream state was inconsistent. +*/ + +/* +ZEXTERN int ZEXPORT inflateBackInit OF((z_streamp strm, int windowBits, + unsigned char FAR *window)); + + Initialize the internal stream state for decompression using inflateBack() + calls. 
The fields zalloc, zfree and opaque in strm must be initialized + before the call. If zalloc and zfree are Z_NULL, then the default library- + derived memory allocation routines are used. windowBits is the base two + logarithm of the window size, in the range 8..15. window is a caller + supplied buffer of that size. Except for special applications where it is + assured that deflate was used with small window sizes, windowBits must be 15 + and a 32K byte window must be supplied to be able to decompress general + deflate streams. + + See inflateBack() for the usage of these routines. + + inflateBackInit will return Z_OK on success, Z_STREAM_ERROR if any of + the parameters are invalid, Z_MEM_ERROR if the internal state could not + be allocated, or Z_VERSION_ERROR if the version of the library does not + match the version of the header file. +*/ + +typedef unsigned (*in_func) OF((void FAR *, unsigned char FAR * FAR *)); +typedef int (*out_func) OF((void FAR *, unsigned char FAR *, unsigned)); + +ZEXTERN int ZEXPORT inflateBack OF((z_streamp strm, + in_func in, void FAR *in_desc, + out_func out, void FAR *out_desc)); +/* + inflateBack() does a raw inflate with a single call using a call-back + interface for input and output. This is more efficient than inflate() for + file i/o applications in that it avoids copying between the output and the + sliding window by simply making the window itself the output buffer. This + function trusts the application to not change the output buffer passed by + the output function, at least until inflateBack() returns. + + inflateBackInit() must be called first to allocate the internal state + and to initialize the state with the user-provided window buffer. + inflateBack() may then be used multiple times to inflate a complete, raw + deflate stream with each call. inflateBackEnd() is then called to free + the allocated state. + + A raw deflate stream is one with no zlib or gzip header or trailer.
+ This routine would normally be used in a utility that reads zip or gzip + files and writes out uncompressed files. The utility would decode the + header and process the trailer on its own, hence this routine expects + only the raw deflate stream to decompress. This is different from the + normal behavior of inflate(), which expects either a zlib or gzip header and + trailer around the deflate stream. + + inflateBack() uses two subroutines supplied by the caller that are then + called by inflateBack() for input and output. inflateBack() calls those + routines until it reads a complete deflate stream and writes out all of the + uncompressed data, or until it encounters an error. The function's + parameters and return types are defined above in the in_func and out_func + typedefs. inflateBack() will call in(in_desc, &buf) which should return the + number of bytes of provided input, and a pointer to that input in buf. If + there is no input available, in() must return zero--buf is ignored in that + case--and inflateBack() will return a buffer error. inflateBack() will call + out(out_desc, buf, len) to write the uncompressed data buf[0..len-1]. out() + should return zero on success, or non-zero on failure. If out() returns + non-zero, inflateBack() will return with an error. Neither in() nor out() + are permitted to change the contents of the window provided to + inflateBackInit(), which is also the buffer that out() uses to write from. + The length written by out() will be at most the window size. Any non-zero + amount of input may be provided by in(). + + For convenience, inflateBack() can be provided input on the first call by + setting strm->next_in and strm->avail_in. If that input is exhausted, then + in() will be called. Therefore strm->next_in must be initialized before + calling inflateBack(). If strm->next_in is Z_NULL, then in() will be called + immediately for input. 
If strm->next_in is not Z_NULL, then strm->avail_in + must also be initialized, and then if strm->avail_in is not zero, input will + initially be taken from strm->next_in[0 .. strm->avail_in - 1]. + + The in_desc and out_desc parameters of inflateBack() are passed as the + first parameter of in() and out() respectively when they are called. These + descriptors can be optionally used to pass any information that the caller- + supplied in() and out() functions need to do their job. + + On return, inflateBack() will set strm->next_in and strm->avail_in to + pass back any unused input that was provided by the last in() call. The + return values of inflateBack() can be Z_STREAM_END on success, Z_BUF_ERROR + if in() or out() returned an error, Z_DATA_ERROR if there was a format + error in the deflate stream (in which case strm->msg is set to indicate the + nature of the error), or Z_STREAM_ERROR if the stream was not properly + initialized. In the case of Z_BUF_ERROR, an input or output error can be + distinguished using strm->next_in which will be Z_NULL only if in() returned + an error. If strm->next_in is not Z_NULL, then the Z_BUF_ERROR was due to + out() returning non-zero. (in() will always be called before out(), so + strm->next_in is assured to be defined if out() returns non-zero.) Note + that inflateBack() cannot return Z_OK. +*/ + +ZEXTERN int ZEXPORT inflateBackEnd OF((z_streamp strm)); +/* + All memory allocated by inflateBackInit() is freed. + + inflateBackEnd() returns Z_OK on success, or Z_STREAM_ERROR if the stream + state was inconsistent. +*/ + +ZEXTERN uLong ZEXPORT zlibCompileFlags OF((void)); +/* Return flags indicating compile-time options.
+ + Type sizes, two bits each, 00 = 16 bits, 01 = 32, 10 = 64, 11 = other: + 1.0: size of uInt + 3.2: size of uLong + 5.4: size of voidpf (pointer) + 7.6: size of z_off_t + + Compiler, assembler, and debug options: + 8: DEBUG + 9: ASMV or ASMINF -- use ASM code + 10: ZLIB_WINAPI -- exported functions use the WINAPI calling convention + 11: 0 (reserved) + + One-time table building (smaller code, but not thread-safe if true): + 12: BUILDFIXED -- build static block decoding tables when needed + 13: DYNAMIC_CRC_TABLE -- build CRC calculation tables when needed + 14,15: 0 (reserved) + + Library content (indicates missing functionality): + 16: NO_GZCOMPRESS -- gz* functions cannot compress (to avoid linking + deflate code when not needed) + 17: NO_GZIP -- deflate can't write gzip streams, and inflate can't detect + and decode gzip streams (to avoid linking crc code) + 18-19: 0 (reserved) + + Operation variations (changes in library functionality): + 20: PKZIP_BUG_WORKAROUND -- slightly more permissive inflate + 21: FASTEST -- deflate algorithm with only one, lowest compression level + 22,23: 0 (reserved) + + The sprintf variant used by gzprintf (zero is best): + 24: 0 = vs*, 1 = s* -- 1 means limited to 20 arguments after the format + 25: 0 = *nprintf, 1 = *printf -- 1 means gzprintf() not secure! + 26: 0 = returns value, 1 = void -- 1 means inferred string length returned + + Remainder: + 27-31: 0 (reserved) + */ + + + /* utility functions */ + +/* + The following utility functions are implemented on top of the + basic stream-oriented functions. To simplify the interface, some + default options are assumed (compression level and memory usage, + standard memory allocation functions). The source code of these + utility functions can easily be modified if you need special options. +*/ + +ZEXTERN int ZEXPORT compress OF((Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen)); +/* + Compresses the source buffer into the destination buffer. 
sourceLen is + the byte length of the source buffer. Upon entry, destLen is the total + size of the destination buffer, which must be at least the value returned + by compressBound(sourceLen). Upon exit, destLen is the actual size of the + compressed buffer. + This function can be used to compress a whole file at once if the + input file is mmap'ed. + compress returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_BUF_ERROR if there was not enough room in the output + buffer. +*/ + +ZEXTERN int ZEXPORT compress2 OF((Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen, + int level)); +/* + Compresses the source buffer into the destination buffer. The level + parameter has the same meaning as in deflateInit. sourceLen is the byte + length of the source buffer. Upon entry, destLen is the total size of the + destination buffer, which must be at least the value returned by + compressBound(sourceLen). Upon exit, destLen is the actual size of the + compressed buffer. + + compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + memory, Z_BUF_ERROR if there was not enough room in the output buffer, + Z_STREAM_ERROR if the level parameter is invalid. +*/ + +ZEXTERN uLong ZEXPORT compressBound OF((uLong sourceLen)); +/* + compressBound() returns an upper bound on the compressed size after + compress() or compress2() on sourceLen bytes. It would be used before + a compress() or compress2() call to allocate the destination buffer. +*/ + +ZEXTERN int ZEXPORT uncompress OF((Bytef *dest, uLongf *destLen, + const Bytef *source, uLong sourceLen)); +/* + Decompresses the source buffer into the destination buffer. sourceLen is + the byte length of the source buffer. Upon entry, destLen is the total + size of the destination buffer, which must be large enough to hold the + entire uncompressed data. 
(The size of the uncompressed data must have + been saved previously by the compressor and transmitted to the decompressor + by some mechanism outside the scope of this compression library.) + Upon exit, destLen is the actual size of the uncompressed buffer. + This function can be used to decompress a whole file at once if the + input file is mmap'ed. + + uncompress returns Z_OK if success, Z_MEM_ERROR if there was not + enough memory, Z_BUF_ERROR if there was not enough room in the output + buffer, or Z_DATA_ERROR if the input data was corrupted or incomplete. +*/ + + +typedef voidp gzFile; + +ZEXTERN gzFile ZEXPORT gzopen OF((const char *path, const char *mode)); +/* + Opens a gzip (.gz) file for reading or writing. The mode parameter + is as in fopen ("rb" or "wb") but can also include a compression level + ("wb9") or a strategy: 'f' for filtered data as in "wb6f", 'h' for + Huffman only compression as in "wb1h", or 'R' for run-length encoding + as in "wb1R". (See the description of deflateInit2 for more information + about the strategy parameter.) + + gzopen can be used to read a file which is not in gzip format; in this + case gzread will directly read from the file without decompression. + + gzopen returns NULL if the file could not be opened or if there was + insufficient memory to allocate the (de)compression state; errno + can be checked to distinguish the two cases (if errno is zero, the + zlib error is Z_MEM_ERROR). */ + +ZEXTERN gzFile ZEXPORT gzdopen OF((int fd, const char *mode)); +/* + gzdopen() associates a gzFile with the file descriptor fd. File + descriptors are obtained from calls like open, dup, creat, pipe or + fileno (if the file has been previously opened with fopen). + The mode parameter is as in gzopen. + The next call of gzclose on the returned gzFile will also close the + file descriptor fd, just like fclose(fdopen(fd, mode)) closes the file + descriptor fd. If you want to keep fd open, use gzdopen(dup(fd), mode).
+ gzdopen returns NULL if there was insufficient memory to allocate + the (de)compression state. +*/ + +ZEXTERN int ZEXPORT gzsetparams OF((gzFile file, int level, int strategy)); +/* + Dynamically update the compression level or strategy. See the description + of deflateInit2 for the meaning of these parameters. + gzsetparams returns Z_OK if success, or Z_STREAM_ERROR if the file was not + opened for writing. +*/ + +ZEXTERN int ZEXPORT gzread OF((gzFile file, voidp buf, unsigned len)); +/* + Reads the given number of uncompressed bytes from the compressed file. + If the input file was not in gzip format, gzread copies the given number + of bytes into the buffer. + gzread returns the number of uncompressed bytes actually read (0 for + end of file, -1 for error). */ + +ZEXTERN int ZEXPORT gzwrite OF((gzFile file, + voidpc buf, unsigned len)); +/* + Writes the given number of uncompressed bytes into the compressed file. + gzwrite returns the number of uncompressed bytes actually written + (0 in case of error). +*/ + +ZEXTERN int ZEXPORTVA gzprintf OF((gzFile file, const char *format, ...)); +/* + Converts, formats, and writes the args to the compressed file under + control of the format string, as in fprintf. gzprintf returns the number of + uncompressed bytes actually written (0 in case of error). The number of + uncompressed bytes written is limited to 4095. The caller should assure that + this limit is not exceeded. If it is exceeded, then gzprintf() will + return an error (0) with nothing written. In this case, there may also be a + buffer overflow with unpredictable consequences, which is possible only if + zlib was compiled with the insecure functions sprintf() or vsprintf() + because the secure snprintf() or vsnprintf() functions were not available. +*/ + +ZEXTERN int ZEXPORT gzputs OF((gzFile file, const char *s)); +/* + Writes the given null-terminated string to the compressed file, excluding + the terminating null character.
+ gzputs returns the number of characters written, or -1 in case of error. +*/ + +ZEXTERN char * ZEXPORT gzgets OF((gzFile file, char *buf, int len)); +/* + Reads bytes from the compressed file until len-1 characters are read, or + a newline character is read and transferred to buf, or an end-of-file + condition is encountered. The string is then terminated with a null + character. + gzgets returns buf, or Z_NULL in case of error. +*/ + +ZEXTERN int ZEXPORT gzputc OF((gzFile file, int c)); +/* + Writes c, converted to an unsigned char, into the compressed file. + gzputc returns the value that was written, or -1 in case of error. +*/ + +ZEXTERN int ZEXPORT gzgetc OF((gzFile file)); +/* + Reads one byte from the compressed file. gzgetc returns this byte + or -1 in case of end of file or error. +*/ + +ZEXTERN int ZEXPORT gzungetc OF((int c, gzFile file)); +/* + Push one character back onto the stream to be read again later. + Only one character of push-back is allowed. gzungetc() returns the + character pushed, or -1 on failure. gzungetc() will fail if a + character has been pushed but not read yet, or if c is -1. The pushed + character will be discarded if the stream is repositioned with gzseek() + or gzrewind(). +*/ + +ZEXTERN int ZEXPORT gzflush OF((gzFile file, int flush)); +/* + Flushes all pending output into the compressed file. The parameter + flush is as in the deflate() function. The return value is the zlib + error number (see function gzerror below). gzflush returns Z_OK if + the flush parameter is Z_FINISH and all output could be flushed. + gzflush should be called only when strictly necessary because it can + degrade compression. +*/ + +ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile file, + z_off_t offset, int whence)); +/* + Sets the starting position for the next gzread or gzwrite on the + given compressed file. The offset represents a number of bytes in the + uncompressed data stream. 
The whence parameter is defined as in lseek(2); + the value SEEK_END is not supported. + If the file is opened for reading, this function is emulated but can be + extremely slow. If the file is opened for writing, only forward seeks are + supported; gzseek then compresses a sequence of zeroes up to the new + starting position. + + gzseek returns the resulting offset location as measured in bytes from + the beginning of the uncompressed stream, or -1 in case of error, in + particular if the file is opened for writing and the new starting position + would be before the current position. +*/ + +ZEXTERN int ZEXPORT gzrewind OF((gzFile file)); +/* + Rewinds the given file. This function is supported only for reading. + + gzrewind(file) is equivalent to (int)gzseek(file, 0L, SEEK_SET) +*/ + +ZEXTERN z_off_t ZEXPORT gztell OF((gzFile file)); +/* + Returns the starting position for the next gzread or gzwrite on the + given compressed file. This position represents a number of bytes in the + uncompressed data stream. + + gztell(file) is equivalent to gzseek(file, 0L, SEEK_CUR) +*/ + +ZEXTERN int ZEXPORT gzeof OF((gzFile file)); +/* + Returns 1 when EOF has previously been detected reading the given + input stream, otherwise zero. +*/ + +ZEXTERN int ZEXPORT gzdirect OF((gzFile file)); +/* + Returns 1 if file is being read directly without decompression, otherwise + zero. +*/ + +ZEXTERN int ZEXPORT gzclose OF((gzFile file)); +/* + Flushes all pending output if necessary, closes the compressed file + and deallocates all the (de)compression state. The return value is the zlib + error number (see function gzerror below). +*/ + +ZEXTERN const char * ZEXPORT gzerror OF((gzFile file, int *errnum)); +/* + Returns the error message for the last error which occurred on the + given compressed file. errnum is set to zlib error number. 
If an + error occurred in the file system and not in the compression library, + errnum is set to Z_ERRNO and the application may consult errno + to get the exact error code. +*/ + +ZEXTERN void ZEXPORT gzclearerr OF((gzFile file)); +/* + Clears the error and end-of-file flags for file. This is analogous to the + clearerr() function in stdio. This is useful for continuing to read a gzip + file that is being written concurrently. +*/ + + /* checksum functions */ + +/* + These functions are not related to compression but are exported + anyway because they might be useful in applications using the + compression library. +*/ + +ZEXTERN uLong ZEXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len)); +/* + Update a running Adler-32 checksum with the bytes buf[0..len-1] and + return the updated checksum. If buf is NULL, this function returns + the required initial value for the checksum. + An Adler-32 checksum is almost as reliable as a CRC32 but can be computed + much faster. Usage example: + + uLong adler = adler32(0L, Z_NULL, 0); + + while (read_buffer(buffer, length) != EOF) { + adler = adler32(adler, buffer, length); + } + if (adler != original_adler) error(); +*/ + +ZEXTERN uLong ZEXPORT adler32_combine OF((uLong adler1, uLong adler2, + z_off_t len2)); +/* + Combine two Adler-32 checksums into one. For two sequences of bytes, seq1 + and seq2 with lengths len1 and len2, Adler-32 checksums were calculated for + each, adler1 and adler2. adler32_combine() returns the Adler-32 checksum of + seq1 and seq2 concatenated, requiring only adler1, adler2, and len2. +*/ + +ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len)); +/* + Update a running CRC-32 with the bytes buf[0..len-1] and return the + updated CRC-32. If buf is NULL, this function returns the required initial + value for the crc. Pre- and post-conditioning (one's complement) is + performed within this function so it shouldn't be done by the application.
+ Usage example: + + uLong crc = crc32(0L, Z_NULL, 0); + + while (read_buffer(buffer, length) != EOF) { + crc = crc32(crc, buffer, length); + } + if (crc != original_crc) error(); +*/ + +ZEXTERN uLong ZEXPORT crc32_combine OF((uLong crc1, uLong crc2, z_off_t len2)); + +/* + Combine two CRC-32 check values into one. For two sequences of bytes, + seq1 and seq2 with lengths len1 and len2, CRC-32 check values were + calculated for each, crc1 and crc2. crc32_combine() returns the CRC-32 + check value of seq1 and seq2 concatenated, requiring only crc1, crc2, and + len2. +*/ + + + /* various hacks, don't look :) */ + +/* deflateInit and inflateInit are macros to allow checking the zlib version + * and the compiler's view of z_stream: + */ +ZEXTERN int ZEXPORT deflateInit_ OF((z_streamp strm, int level, + const char *version, int stream_size)); +ZEXTERN int ZEXPORT inflateInit_ OF((z_streamp strm, + const char *version, int stream_size)); +ZEXTERN int ZEXPORT deflateInit2_ OF((z_streamp strm, int level, int method, + int windowBits, int memLevel, + int strategy, const char *version, + int stream_size)); +ZEXTERN int ZEXPORT inflateInit2_ OF((z_streamp strm, int windowBits, + const char *version, int stream_size)); +ZEXTERN int ZEXPORT inflateBackInit_ OF((z_streamp strm, int windowBits, + unsigned char FAR *window, + const char *version, + int stream_size)); +#define deflateInit(strm, level) \ + deflateInit_((strm), (level), ZLIB_VERSION, sizeof(z_stream)) +#define inflateInit(strm) \ + inflateInit_((strm), ZLIB_VERSION, sizeof(z_stream)) +#define deflateInit2(strm, level, method, windowBits, memLevel, strategy) \ + deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\ + (strategy), ZLIB_VERSION, sizeof(z_stream)) +#define inflateInit2(strm, windowBits) \ + inflateInit2_((strm), (windowBits), ZLIB_VERSION, sizeof(z_stream)) +#define inflateBackInit(strm, windowBits, window) \ + inflateBackInit_((strm), (windowBits), (window), \ + ZLIB_VERSION, sizeof(z_stream)) + 
+ +#if !defined(ZUTIL_H) && !defined(NO_DUMMY_DECL) + struct internal_state {int dummy;}; /* hack for buggy compilers */ +#endif + +ZEXTERN const char * ZEXPORT zError OF((int)); +ZEXTERN int ZEXPORT inflateSyncPoint OF((z_streamp z)); +ZEXTERN const uLongf * ZEXPORT get_crc_table OF((void)); + +#ifdef __cplusplus +} +#endif + +#endif /* ZLIB_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/stats.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,580 @@ +#include "stats.hpp" + +/** The digamma function in long double precision. +* @param x the real value of the argument +* @return the value of the digamma (psi) function at that point +* @author Richard J. Mathar +* @since 2005-11-24 +*/ +long double digammal(long double x) +{ + /* force into the interval 1..3 */ + if( x < 0.0L ) + return digammal(1.0L-x)+M_PIl/tanl(M_PIl*(1.0L-x)) ; /* reflection formula */ + else if( x < 1.0L ) + return digammal(1.0L+x)-1.0L/x ; + else if ( x == 1.0L) + return -M_GAMMAl ; + else if ( x == 2.0L) + return 1.0L-M_GAMMAl ; + else if ( x == 3.0L) + return 1.5L-M_GAMMAl ; + else if ( x > 3.0L) + /* duplication formula */ + return 0.5L*(digammal(x/2.0L)+digammal((x+1.0L)/2.0L))+M_LN2l ; + else + { + /* Just for your information, the following lines contain + * the Maple source code to re-generate the table that is + * eventually becoming the Kncoe[] array below + * interface(prettyprint=0) : + * Digits := 63 : + * r := 0 : + * + * for l from 1 to 60 do + * d := binomial(-1/2,l) : + * r := r+d*(-1)^l*(Zeta(2*l+1) -1) ; + * evalf(r) ; + * print(%,evalf(1+Psi(1)-r)) ; + *o d : + * + * for N from 1 to 28 do + * r := 0 : + * n := N-1 : + * + * for l from iquo(n+3,2) to 70 do + * d := 0 : + * for s from 0 to n+1 do + * d := d+(-1)^s*binomial(n+1,s)*binomial((s-1)/2,l) : + * od : + * if 2*l-n > 1 then + * r := r+d*(-1)^l*(Zeta(2*l-n) -1) : + * fi : + * od : + * print(evalf((-1)^n*2*r)) ; + *od : + *quit : + */ + static long double Kncoe[] = { .30459198558715155634315638246624251L, + .72037977439182833573548891941219706L, -.12454959243861367729528855995001087L, + .27769457331927827002810119567456810e-1L, -.67762371439822456447373550186163070e-2L, + .17238755142247705209823876688592170e-2L, -.44817699064252933515310345718960928e-3L, + .11793660000155572716272710617753373e-3L, -.31253894280980134452125172274246963e-4L, + 
.83173997012173283398932708991137488e-5L, -.22191427643780045431149221890172210e-5L,
			.59302266729329346291029599913617915e-6L, -.15863051191470655433559920279603632e-6L,
			.42459203983193603241777510648681429e-7L, -.11369129616951114238848106591780146e-7L,
			.304502217295931698401459168423403510e-8L, -.81568455080753152802915013641723686e-9L,
			.21852324749975455125936715817306383e-9L, -.58546491441689515680751900276454407e-10L,
			.15686348450871204869813586459513648e-10L, -.42029496273143231373796179302482033e-11L,
			.11261435719264907097227520956710754e-11L, -.30174353636860279765375177200637590e-12L,
			.80850955256389526647406571868193768e-13L, -.21663779809421233144009565199997351e-13L,
			.58047634271339391495076374966835526e-14L, -.15553767189204733561108869588173845e-14L,
			.41676108598040807753707828039353330e-15L, -.11167065064221317094734023242188463e-15L } ;

		register long double Tn_1 = 1.0L ; /* T_{n-1}(x), started at n=1 */
		register long double Tn = x-2.0L ; /* T_{n}(x) , started at n=1 */
		register long double resul = Kncoe[0] + Kncoe[1]*Tn ;

		/* From here on x holds the shifted argument (x - 2) used by the recursion. */
		x -= 2.0L ;
		int n ;

		/* Accumulate the remaining terms Kncoe[n]*T_n for every coefficient in the table. */
		for( n = 2 ; n < int( sizeof(Kncoe)/sizeof(long double) ) ; n++)
		{
			const long double Tn1 = 2.0L * x * Tn - Tn_1 ; /* Chebyshev recursion, Eq. 22.7.4 Abramowitz-Stegun */
			resul += Kncoe[n]*Tn1 ;
			Tn_1 = Tn ;
			Tn = Tn1 ;
		}
		return resul ;
	}
}



double trigamma ( double x, int *ifault )

//****************************************************************************
//  Purpose:
//
//    TRIGAMMA calculates trigamma(x) = d**2 log(gamma(x)) / dx**2
//
//  Licensing:
//
//    This code is distributed under the GNU LGPL license.
//
//  Modified:
//
//    19 January 2008
//
//  Author:
//
//    Original FORTRAN77 version by BE Schneider.
//    C++ version by John Burkardt.
//
//  Reference:
//
//    BE Schneider,
//    Algorithm AS 121:
//    Trigamma Function,
//    Applied Statistics,
//    Volume 27, Number 1, pages 97-99, 1978.
//
//  Parameters:
//
//    Input, double X, the argument of the trigamma function.
//    0 < X.
//
//    Output, int *IFAULT, error flag. It is always written.
//    0, no error.
//    1, X <= 0.
//
//    Output, double TRIGAMMA, the value of the trigamma function at X,
//    or 0.0 when IFAULT is set to 1.
//
{
  // a: at or below this argument the value is approximated by 1/x^2.
  double a = 0.0001;
  // b: the asymptotic expansion is only applied to arguments >= b.
  double b = 5.0;
  // b2..b8: decimal approximations of 1/6, -1/30, 1/42 and -1/30
  // (the Bernoulli numbers B2, B4, B6, B8) used by the asymptotic expansion.
  double b2 = 0.1666666667;
  double b4 = -0.03333333333;
  double b6 = 0.02380952381;
  double b8 = -0.03333333333;
  double value;
  double y;
  double z;
  //
  //  Check the input.
  //
  if ( x <= 0.0 )
  {
    *ifault = 1;
    value = 0.0;
    return value;
  }

  *ifault = 0;
  z = x;
  //
  //  Use small value approximation if X <= A.
  //
  if ( x <= a )
  {
    value = 1.0 / x / x;
    return value;
  }
  //
  //  Increase argument to ( X + I ) >= B.
  //  Each step adds 1/z^2 and moves z up by one, i.e. it applies
  //  trigamma(z) = trigamma(z+1) + 1/z^2.
  //
  value = 0.0;

  while ( z < b )
  {
    value = value + 1.0 / z / z;
    z = z + 1.0;
  }
  //
  //  Apply asymptotic formula if argument is B or greater.
  //  With y = 1/z^2 this evaluates
  //    y/2 + ( 1 + y*( b2 + y*( b4 + y*( b6 + y*b8 )))) / z
  //  and adds it to the terms accumulated above.
  //
  y = 1.0 / z / z;

  value = value + 0.5 *
    y + ( 1.0 + y * ( b2+ y * ( b4 + y * ( b6+ y * b8 )))) / z;

  return value;
}


// Natural logarithm of the gamma density with shape k and scale theta evaluated at x:
//   -k*log(theta) + (k-1)*log(x) - x/theta - lgamma(k).
// No argument checking is done; the arguments are passed straight to log() and lgamma().
double LogGammaDensity( double x, double k, double theta )
{
	return -k * log( theta ) + ( k - 1 ) * log( x ) - x / theta - lgamma( k ) ;
}

// For a mixture of two gamma components, where component 0 (k[0], theta[0]) has weight pi
// and component 1 (k[1], theta[1]) has weight 1-pi, return the probability that x belongs
// to component 0:
//   pi*f0(x) / ( pi*f0(x) + (1-pi)*f1(x) ).
// The degenerate weights pi == 1 and pi == 0 return 1 and 0 directly, which also keeps
// log( 1 - pi ) and log( pi ) away from log( 0 ).
double MixtureGammaAssignment( double x, double pi, double* k, double *theta )
{
	if ( pi == 1 )
		return 1 ;
	else if ( pi == 0 )
		return 0 ;

	double lf0 = LogGammaDensity( x, k[0], theta[0] ) ;
	double lf1 = LogGammaDensity( x, k[1], theta[1] ) ;
	// Written as 1 / ( 1 + ratio ), with the ratio (1-pi)*f1 / (pi*f0) formed in log space.
	return (double)1.0 / ( 1.0 + exp( lf1 + log( 1 - pi ) - lf0 - log( pi ) ) ) ;
}

//****************************************************************************80

double alnorm ( double x, bool upper )

//****************************************************************************80
//
//  Purpose:
//
//    ALNORM computes the cumulative density of the standard normal distribution.
//
//  Licensing:
//
//    This code is distributed under the GNU LGPL license.
//
//  Modified:
//
//    17 January 2008
//
//  Author:
//
//    Original FORTRAN77 version by David Hill.
//    C++ version by John Burkardt.
//
//  Reference:
//
//    David Hill,
//    Algorithm AS 66:
//    The Normal Integral,
//    Applied Statistics,
//    Volume 22, Number 3, 1973, pages 424-427.
//
//  Parameters:
//
//    Input, double X, is one endpoint of the semi-infinite interval
//    over which the integration takes place.
//
//    Input, bool UPPER, determines whether the upper or lower
//    interval is to be integrated:
//    .TRUE.  => integrate from X to + Infinity;
//    .FALSE. => integrate from - Infinity to X.
//
//    Output, double ALNORM, the integral of the standard normal
//    distribution over the desired interval.
//
{
  // Coefficients of the two continued-fraction style approximations below.
  double a1 = 5.75885480458;
  double a2 = 2.62433121679;
  double a3 = 5.92885724438;
  double b1 = -29.8213557807;
  double b2 = 48.6959930692;
  double c1 = -0.000000038052;
  double c2 = 0.000398064794;
  double c3 = -0.151679116635;
  double c4 = 4.8385912808;
  double c5 = 0.742380924027;
  double c6 = 3.99019417011;
  // con: |x| at or below this uses the first approximation, above it the second.
  double con = 1.28;
  double d1 = 1.00000615302;
  double d2 = 1.98615381364;
  double d3 = 5.29330324926;
  double d4 = -15.1508972451;
  double d5 = 30.789933034;
  // ltone: beyond this |x| a tail that contains the bulk of the mass is returned as exactly 1.
  double ltone = 7.0;
  double p = 0.398942280444;
  double q = 0.39990348504;
  double r = 0.398942280385;
  bool up;
  // utzero: beyond this |x| the far tail is returned as exactly 0.
  double utzero = 18.66;
  double value;
  double y;
  double z;

  up = upper;
  z = x;

  //
  //  Work with |x| only: reflecting x about zero swaps the two tails.
  //
  if ( z < 0.0 )
  {
    up = !up;
    z = - z;
  }

  //
  //  Far from zero the result is taken as exactly 0 or 1.
  //
  if ( ltone < z && ( ( !up ) || utzero < z ) )
  {
    if ( up )
    {
      value = 0.0;
    }
    else
    {
      value = 1.0;
    }
    return value;
  }

  y = 0.5 * z * z;

  //
  //  Both branches compute the upper tail for z >= 0.
  //
  if ( z <= con )
  {
    value = 0.5 - z * ( p - q * y
      / ( y + a1 + b1
      / ( y + a2 + b2
      / ( y + a3 ))));
  }
  else
  {
    value = r * exp ( - y )
      / ( z + c1 + d1
      / ( z + c2 + d2
      / ( z + c3 + d3
      / ( z + c4 + d4
      / ( z + c5 + d5
      / ( z + c6 ))))));
  }

  //
  //  Convert the upper tail to the lower tail when that is what was asked for.
  //
  if ( !up )
  {
    value = 1.0 - value;
  }

  return value;
}

+//****************************************************************************80 + +double gammad ( double x, double p, int *ifault ) + +//****************************************************************************80 +// +// Purpose: +// +// GAMMAD computes the Incomplete Gamma Integral +// +// Licensing: +// +// This code is distributed under the GNU LGPL license. +// +// Modified: +// +// 20 January 2008 +// +// Author: +// +// Original FORTRAN77 version by B Shea. +// C++ version by John Burkardt. +// +// Reference: +// +// B Shea, +// Algorithm AS 239: +// Chi-squared and Incomplete Gamma Integral, +// Applied Statistics, +// Volume 37, Number 3, 1988, pages 466-473. +// +// Parameters: +// +// Input, double X, P, the parameters of the incomplete +// gamma ratio. 0 <= X, and 0 < P. +// +// Output, int IFAULT, error flag. +// 0, no error. +// 1, X < 0 or P <= 0. +// +// Output, double GAMMAD, the value of the incomplete +// Gamma integral. +// +{ + double a; + double an; + double arg; + double b; + double c; + double elimit = - 88.0; + double oflo = 1.0E+37; + double plimit = 1000.0; + double pn1; + double pn2; + double pn3; + double pn4; + double pn5; + double pn6; + double rn; + double tol = 1.0E-14; + bool upper; + double value; + double xbig = 1.0E+08; + + value = 0.0; + // + // Check the input. + // + if ( x < 0.0 ) + { + *ifault = 1; + return value; + } + + if ( p <= 0.0 ) + { + *ifault = 1; + return value; + } + + *ifault = 0; + + if ( x == 0.0 ) + { + value = 0.0; + return value; + } + // + // If P is large, use a normal approximation. + // + if ( plimit < p ) + { + pn1 = 3.0 * sqrt ( p ) * ( pow ( x / p, 1.0 / 3.0 ) + + 1.0 / ( 9.0 * p ) - 1.0 ); + + upper = false; + value = alnorm ( pn1, upper ); + return value; + } + // + // If X is large set value = 1. + // + if ( xbig < x ) + { + value = 1.0; + return value; + } + // + // Use Pearson's series expansion. + // (Note that P is not large enough to force overflow in ALOGAM). 
+ // No need to test IFAULT on exit since P > 0. + // + if ( x <= 1.0 || x < p ) + { + arg = p * log ( x ) - x - lgamma ( p + 1.0 ); + c = 1.0; + value = 1.0; + a = p; + + for ( ; ; ) + { + a = a + 1.0; + c = c * x / a; + value = value + c; + + if ( c <= tol ) + { + break; + } + } + + arg = arg + log ( value ); + + if ( elimit <= arg ) + { + value = exp ( arg ); + } + else + { + value = 0.0; + } + } + // + // Use a continued fraction expansion. + // + else + { + arg = p * log ( x ) - x - lgamma ( p ); + a = 1.0 - p; + b = a + x + 1.0; + c = 0.0; + pn1 = 1.0; + pn2 = x; + pn3 = x + 1.0; + pn4 = x * b; + value = pn3 / pn4; + + for ( ; ; ) + { + a = a + 1.0; + b = b + 2.0; + c = c + 1.0; + an = a * c; + pn5 = b * pn3 - an * pn1; + pn6 = b * pn4 - an * pn2; + + if ( pn6 != 0.0 ) + { + rn = pn5 / pn6; + + if ( fabs ( value - rn ) <= r8_min ( tol, tol * rn ) ) + { + break; + } + value = rn; + } + + pn1 = pn3; + pn2 = pn4; + pn3 = pn5; + pn4 = pn6; + // + // Re-scale terms in continued fraction if terms are large. + // + if ( oflo <= fabs ( pn5 ) ) + { + pn1 = pn1 / oflo; + pn2 = pn2 / oflo; + pn3 = pn3 / oflo; + pn4 = pn4 / oflo; + } + } + + arg = arg + log ( value ); + + if ( elimit <= arg ) + { + value = 1.0 - exp ( arg ); + } + else + { + value = 1.0; + } + } + + return value; +} + +double r8_min ( double x, double y ) + +//****************************************************************************80 +// +// Purpose: +// +// R8_MIN returns the minimum of two R8's. +// +// Licensing: +// +// This code is distributed under the GNU LGPL license. +// +// Modified: +// +// 31 August 2004 +// +// Author: +// +// John Burkardt +// +// Parameters: +// +// Input, double X, Y, the quantities to compare. +// +// Output, double R8_MIN, the minimum of X and Y. +// +{ + double value; + + if ( y < x ) + { + value = y; + } + else + { + value = x; + } + return value; +} + +// This function is implemented by Li Song. 
+// If Y follows (theta, k)=(1/alpha, k)-gamma distribution, then P_Y(Y<t)=gammad(alpha*t, k). +// Furthurmore, chi-square with d.f. k is gamma distribution (2, k/2) +double chicdf( double x, double df ) +{ + int ifault ; + return gammad( x / 2.0, df / 2.0, &ifault ) ; +} +
// File: PsiCLASS-1.0.2/stats.hpp
// Declarations of the statistical helper functions (special functions, gamma mixture
// helpers and distribution functions).
#ifndef _LSONG_STATS
#define _LSONG_STATS

#include <stdio.h>
#include <math.h>

// High precision constants; each one is only defined when it is not already available.
#ifndef M_PIl
/** The constant Pi in high precision */
#define M_PIl 3.1415926535897932384626433832795029L
#endif

#ifndef M_GAMMAl
/** Euler's constant in high precision */
#define M_GAMMAl 0.5772156649015328606065120900824024L
#endif

#ifndef M_LN2l
/** The natural logarithm of 2 in high precision */
#define M_LN2l 0.6931471805599453094172321214581766L
#endif

/** The digamma function in long double precision.
* @param x the real value of the argument
* @return the value of the digamma (psi) function at that point
* @author Richard J. Mathar
* @since 2005-11-24
*/
long double digammal(long double x) ;

/** The trigamma function, d^2 log(gamma(x)) / dx^2.
* Source: https://people.sc.fsu.edu/~jburkardt/cpp_src/asa121/asa121.hpp
* @param x the argument, must be positive
* @param ifault set to 0 on success and to 1 when x <= 0
* @return the value of the trigamma function at x, or 0.0 when x <= 0
*/
double trigamma ( double x, int *ifault );

/** The logarithm of the gamma density with shape k and scale theta at x:
* -k*log(theta) + (k-1)*log(x) - x/theta - lgamma(k).
*/
double LogGammaDensity( double x, double k, double theta ) ;

/** The probability that x belongs to component 0 of a two-component gamma mixture.
* @param x the observed value
* @param pi the weight of component 0; the weight of component 1 is 1-pi
* @param k the shapes of the components, read at index 0 and 1
* @param theta the scales of the components, read at index 0 and 1
* @return 1 when pi is 1, 0 when pi is 0, otherwise pi*f0(x) / ( pi*f0(x) + (1-pi)*f1(x) )
*/
double MixtureGammaAssignment( double x, double pi, double* k, double *theta ) ;

// The three functions below are taken from
// http://people.sc.fsu.edu/~jburkardt/c_src/asa091/asa091.hpp

/** The integral of the standard normal density from x to +infinity (upper is true)
* or from -infinity to x (upper is false).
*/
double alnorm ( double x, bool upper );

/** The incomplete gamma integral.
* @param x the upper limit of the integral, 0 <= x
* @param p the shape parameter, 0 < p
* @param ifault set to 0 on success and to 1 when x < 0 or p <= 0
* @return the value of the integral, or 0.0 when ifault is set to 1
*/
double gammad ( double x, double p, int *ifault );

/** The minimum of x and y. */
double r8_min ( double x, double y );

/** The chi-square cumulative distribution function with df degrees of freedom,
* computed as gammad( x/2, df/2 ).
*/
double chicdf( double x, double df ) ;

#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/support.hpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,171 @@ +// The class that deals the number of alignments supporting blocks or edges +#ifndef _LSONG_RSCAF_SUPPORT_HEADER +#define _LSONG_RSCAF_SUPPORT_HEADER + +extern int minimumSupport ; + +class Support +{ +private: + int uniqSupport ; + int multiSupport ; + + int plusSupport ; + int minusSupport ; + + int clipSupport ; + int nmSum ; + double multiSupportCoefficient ; + + int64_t leftPos, rightPos ; + int coordCnt ; // Record how many coordinates showed up. + int64_t prevCoord ; +public: + Support() + { + uniqSupport = multiSupport = 0 ; + plusSupport = minusSupport = 0 ; + leftPos = rightPos = -1 ; + multiSupportCoefficient = 1.0 ; + nmSum = 0 ; + coordCnt = 0 ; + prevCoord = -1 ; + } + + ~Support() + { + } + + void Add( Alignments &align, bool ignoreCoord = false ) + { + if ( align.IsUnique() ) + ++uniqSupport ; + else + ++multiSupport ; + + int strand = align.GetStrand() ; + if ( strand == 1 ) + ++plusSupport ; + else if ( strand == -1 ) + ++minusSupport ; + int nm = align.GetFieldI( "NM" ) ; + if ( nm >= 0 ) + nmSum += nm ; + + if ( !ignoreCoord ) + { + if ( leftPos == -1 || align.segments[0].a < leftPos ) + leftPos = align.segments[0].a ; + if ( align.segments[ align.segCnt - 1 ].b > rightPos ) + rightPos = align.segments[ align.segCnt - 1 ].b ; + + if ( align.segments[0].a != prevCoord ) // This makes sense when the coordinates are sorted + ++coordCnt ; + else if ( align.GetFieldZ( "SA" ) != NULL ) + ++coordCnt ; + prevCoord = align.segments[0].a ; + } + } + + void Add( Support &in ) + { + uniqSupport += in.uniqSupport ; + multiSupport += in.multiSupport ; + + plusSupport += in.plusSupport ; + minusSupport += in.minusSupport ; + + nmSum += in.nmSum ; + + if ( in.leftPos != -1 && ( leftPos == -1 || in.leftPos < leftPos ) ) + leftPos = in.leftPos ; + if ( in.rightPos > rightPos ) + rightPos = in.rightPos ; + + prevCoord = prevCoord > 
in.prevCoord ? prevCoord : in.prevCoord ; + coordCnt += in.coordCnt ; + } + + bool IsGood() + { + if ( uniqSupport < 0.1 * ( uniqSupport + multiSupport ) || uniqSupport == 0 ) + return false ; + if ( nmSum / ( uniqSupport + multiSupport ) >= 2.5 ) + return false ; + return true ; + } + + bool IsUnique() + { + if ( uniqSupport < 0.95 * ( uniqSupport + multiSupport ) ) + return false ; + return true ; + } + + int GetCount() + { + return (int)( uniqSupport + multiSupportCoefficient * multiSupport + 1e-6) ; + } + + int GetUniqCount() + { + return uniqSupport ; + } + + int GetStrand() + { + if ( plusSupport == minusSupport ) + return 0 ; + else if ( plusSupport > minusSupport ) + return 1 ; + else + return -1 ; + } + + int GetLeftMostPos() + { + return leftPos ; + } + + int GetRightMostPos() + { + return rightPos ; + } + + int GetCoordCnt() + { + return coordCnt ; + } + + void operator=( const Support &in ) + { + uniqSupport = in.uniqSupport ; + multiSupport = in.multiSupport ; + + plusSupport = in.plusSupport ; + minusSupport = in.minusSupport ; + + nmSum = in.nmSum ; + + leftPos = in.leftPos ; + rightPos = in.rightPos ; + + multiSupportCoefficient = in.multiSupportCoefficient ; + + prevCoord = in.prevCoord ; + coordCnt = in.coordCnt ; + } + + void Clear() + { + uniqSupport = multiSupport = 0 ; + plusSupport = minusSupport = 0 ; + nmSum = 0 ; + leftPos = rightPos = -1 ; + multiSupportCoefficient = 1.0 ; + + coordCnt = 0 ; + prevCoord = -1 ; + } +} ; +#endif