Mercurial > repos > lsong10 > psiclass
diff PsiCLASS-1.0.2/TranscriptDecider.hpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/TranscriptDecider.hpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,544 @@ +#ifndef _MOURISL_CLASSES_TRANSCRIPTDECIDER_HEADER +#define _MOURISL_CLASSES_TRANSCRIPTDECIDER_HEADER + +#include <pthread.h> +#include <map> +#include <time.h> +#include <stdarg.h> + +#include "alignments.hpp" +#include "SubexonGraph.hpp" +#include "SubexonCorrelation.hpp" +#include "BitTable.hpp" +#include "Constraints.hpp" + +#define HASH_MAX 100003 // default HASH_MAX +#define USE_DP 200000 + +struct _transcript +{ + BitTable seVector ; + double abundance ; + double correlationScore ; + double FPKM ; + double *constraintsSupport ; // record the assign ment of constraints. + + int first, last ; // indicate the index of the first and last subexons. + bool partial ; // wehther this is a partial transcript. + int id ; // the id for various usage: i.e transcript index in the alltranscripts. +} ; + +struct _outputTranscript +{ + int chrId ; + int geneId, transcriptId ; + struct _pair32 *exons ; + int ecnt ; + char strand ; + int sampleId ; + + double FPKM ; + double TPM ; + double cov ; +} ; + +struct _dp +{ + BitTable seVector ; + int first, last ; + // The "cnt" is for the hash structure. + // the first cnt set bits represent the subexons that are the key of the hash + // the remaining set bits are the optimal subtranscript follow the key. + int cnt ; + double cover ; + + double minAbundance ; + int timeStamp ; + int strand ; +} ; + + +struct _dpAttribute +{ + struct _dp *f1, **f2 ; + struct _dp *hash ; + + struct _transcript bufferTxpt ; + + bool forAbundance ; + + struct _subexon *subexons ; + int seCnt ; + + std::map<uint64_t, int> uncoveredPair ; + + double minAbundance ; + int timeStamp ; +} ; + +class MultiThreadOutputTranscript ; + +struct _transcriptDeciderThreadArg +{ + int tid ; + struct _subexon *subexons ; + int seCnt ; + int sampleCnt ; + int numThreads ; + + int maxDpConstraintSize ; + double FPKMFraction, classifierThreshold, txptMinReadDepth ; + Alignments *alignments ; + std::vector<Constraints> constraints ; + SubexonCorrelation subexonCorrelation ; + MultiThreadOutputTranscript *outputHandler ; + + int *freeThreads ; // the stack for free threads + int *ftCnt ; + pthread_mutex_t *ftLock ; + pthread_cond_t *fullWorkCond ; +} ; + +class MultiThreadOutputTranscript +{ +private: + std::vector<struct _outputTranscript> outputQueue ; + pthread_t *threads ; + pthread_mutex_t outputLock ; + int sampleCnt ; + int numThreads ; + std::vector<FILE *> outputFPs ; + Alignments &alignments ; + +public: + static int CompTranscripts( const struct _outputTranscript &a, const struct _outputTranscript &b ) + { + int i ; + if ( a.geneId != b.geneId ) + return a.geneId - b.geneId ; + if ( a.ecnt != b.ecnt ) + return a.ecnt - b.ecnt ; + + for ( i = 0 ; i < a.ecnt ; ++i ) + { + if ( a.exons[i].a != b.exons[i].a ) + return a.exons[i].a - b.exons[i].a ; + + if ( a.exons[i].b != b.exons[i].b ) + return a.exons[i].b - b.exons[i].b ; + } + return 0 ; + } + + static bool CompSortTranscripts( const struct _outputTranscript &a, const struct _outputTranscript &b ) + { + int tmp = CompTranscripts( a, b ) ; + if ( tmp < 0 ) + return true ; + else if ( tmp > 0 ) + return false ; + else + return a.sampleId < b.sampleId ; + } + + MultiThreadOutputTranscript( int cnt, Alignments &a ): alignments( a ) + { + sampleCnt = cnt ; + pthread_mutex_init( &outputLock, NULL ) ; + } + ~MultiThreadOutputTranscript() + { + pthread_mutex_destroy( &outputLock ) ; + int i ; + for ( i = 0 ; i < sampleCnt ; ++i ) + fclose( outputFPs[i] ) ; + } + + void SetThreadsPointer( pthread_t *t, int n ) + { + threads = t ; + numThreads = n ; + } + + void SetOutputFPs( char *outputPrefix ) + { + int i ; + char buffer[1024] ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + if ( outputPrefix[0] ) + sprintf( buffer, "%s_sample_%d.gtf", outputPrefix, i ) ; + else + sprintf( buffer, "sample_%d.gtf", i ) ; + FILE *fp = fopen( buffer, "w" ) ; + outputFPs.push_back( fp ) ; + } + } + + void Add( struct _outputTranscript &t ) + { + pthread_mutex_lock( &outputLock ) ; + outputQueue.push_back( t ) ; + pthread_mutex_unlock( &outputLock ) ; + } + + void Add_SingleThread( struct _outputTranscript &t ) + { + outputQueue.push_back( t ) ; + } + + void ComputeFPKMTPM( std::vector<Alignments> &alignmentFiles ) + { + int i ; + int qsize = outputQueue.size() ; + double *totalFPK = new double[ sampleCnt ] ; + memset( totalFPK, 0, sizeof( double ) * sampleCnt ) ; + for ( i = 0 ; i < qsize ; ++i ) + { + totalFPK[ outputQueue[i].sampleId ] += outputQueue[i].FPKM ; + } + + for ( i = 0 ; i < qsize ; ++i ) + { + outputQueue[i].TPM = outputQueue[i].FPKM / ( totalFPK[ outputQueue[i].sampleId ] / 1000000.0 ) ; + outputQueue[i].FPKM /= ( alignmentFiles[ outputQueue[i].sampleId ].totalReadCnt / 1000000.0 ) ; + } + + delete[] totalFPK ; + } + + void OutputCommandInfo( int argc, char *argv[] ) + { + int i ; + int j ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + fprintf( outputFPs[i], "#PsiCLASS_v1.0.1\n#" ) ; + for ( j = 0 ; j < argc - 1 ; ++j ) + { + fprintf( outputFPs[i], "%s ", argv[j] ) ; + } + fprintf( outputFPs[i], "%s\n", argv[j] ) ; + } + } + + void OutputCommentToSampleGTF( int sampleId, char *s ) + { + fprintf( outputFPs[ sampleId ], "#%s\n", s ) ; + } + + void Flush() + { + std::sort( outputQueue.begin(), outputQueue.end(), CompSortTranscripts ) ; + int i, j ; + int qsize = outputQueue.size() ; + char prefix[10] = "" ; + + // Recompute the transcript id + int gid = -1 ; + int tid = 0 ; + for ( i = 0 ; i < qsize ; ) + { + for ( j = i + 1 ; j < qsize ; ++j ) + { + if ( CompTranscripts( outputQueue[i], outputQueue[j] ) ) + break ; + } + int l ; + if ( outputQueue[i].geneId != gid ) + { + gid = outputQueue[i].geneId ; + tid = 0 ; + } + else + ++tid ; + + for ( l = i ; l < j ; ++l ) + outputQueue[l].transcriptId = tid ; + + i = j ; + } + + // output + for ( i = 0 ; i < qsize ; ++i ) + { + struct _outputTranscript &t = outputQueue[i] ; + char *chrom = alignments.GetChromName( t.chrId ) ; + + fprintf( outputFPs[t.sampleId], "%s\tPsiCLASS\ttranscript\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; transcript_id \"%s%s.%d.%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"%.6lf\";\n", + chrom, t.exons[0].a, t.exons[t.ecnt - 1].b, t.strand, + prefix, chrom, t.geneId, + prefix, chrom, t.geneId, t.transcriptId, t.FPKM, t.TPM, t.cov ) ; + for ( j = 0 ; j < t.ecnt ; ++j ) + { + fprintf( outputFPs[ t.sampleId ], "%s\tPsiCLASS\texon\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; " + "transcript_id \"%s%s.%d.%d\"; exon_number \"%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"\%.6lf\";\n", + chrom, t.exons[j].a, t.exons[j].b, t.strand, + prefix, chrom, t.geneId, + prefix, chrom, t.geneId, t.transcriptId, + j + 1, t.FPKM, t.TPM, t.cov ) ; + } + delete []t.exons ; + } + } +} ; + +class TranscriptDecider +{ +private: + int sampleCnt ; + int numThreads ; + double FPKMFraction ; + double txptMinReadDepth ; + int hashMax ; + int maxDpConstraintSize ; + + Constraints *constraints ; + //struct _subexon *subexons ; + //int seCnt ; + + int usedGeneId ; + int baseGeneId, defaultGeneId[2] ; + + int *transcriptId ; // the next transcript id for each gene id (we shift the gene id to 0 in this array.) + Alignments &alignments ; // for obtain the chromosome names. + + std::vector<FILE *> outputFPs ; + + BitTable compatibleTestVectorT, compatibleTestVectorC ; + double canBeSoftBoundaryThreshold ; + + MultiThreadOutputTranscript *outputHandler ; + + // Test whether subexon tag is a start subexon in a mixture region that corresponds to the start of a gene on another strand. + bool IsStartOfMixtureStrandRegion( int tag, struct _subexon *subexons, int seCnt ) ; + + // The functions to pick transcripts through dynamic programming + struct _dp *dpHash ; + void SearchSubTranscript( int tag, int strand, int parents[], int pcnt, struct _dp &pdp, int visit[], int vcnt, int extends[], int extendCnt, std::vector<struct _constraint> &tc, int tcStartInd, struct _dpAttribute &attr ) ; + struct _dp SolveSubTranscript( int visit[], int vcnt, int strand, std::vector<struct _constraint> &tc, int tcStartInd, struct _dpAttribute &attr ) ; + void PickTranscriptsByDP( struct _subexon *subexons, int seCnt, int iterBound, Constraints &constraints, SubexonCorrelation &correlation, struct _dpAttribute &attr, std::vector<struct _transcript> &allTranscripts ) ; + + void SetDpContent( struct _dp &a, struct _dp &b, const struct _dpAttribute &attr ) + { + a.seVector.Assign( b.seVector ) ; + a.first = b.first ; + a.last = b.last ; + a.cnt = b.cnt ; + a.cover = b.cover ; + + a.strand = b.strand ; + a.minAbundance = attr.minAbundance ; + a.timeStamp = attr.timeStamp ; + } + + void ResetDpContent( struct _dp &d ) + { + d.seVector.Reset() ; + d.first = -1 ; + d.last = -1 ; + d.cnt = -1 ; + d.cover = -1 ; + d.minAbundance = -1 ; + d.timeStamp = -1 ; + } + + void AugmentTranscripts( struct _subexon *subexons, std::vector<struct _transcript> &alltranscripts, int limit, bool extend ) ; + // Test whether a constraints is compatible with the transcript. + // Return 0 - uncompatible or does not overlap at all. 1 - fully compatible. 2 - Head of the constraints compatible with the tail of the transcript + int IsConstraintInTranscript( struct _transcript transcript, struct _constraint &c ) ; + int IsConstraintInTranscriptDebug( struct _transcript transcript, struct _constraint &c ) ; + + // Count how many transcripts are possible starting from subexons[tag]. + int SubTranscriptCount( int tag, struct _subexon *subexons, int f[] ) ; + + // The methods when there is no need for DP + void EnumerateTranscript( int tag, int strand, int visit[], int vcnt, struct _subexon *subexons, SubexonCorrelation &correlation, double correlationScore, std::vector<struct _transcript> &alltranscripts, int &atcnt ) ; + // For the simpler case, we can pick sample by sample. + void PickTranscripts( struct _subexon *subexons, std::vector<struct _transcript> &alltranscripts, Constraints &constraints, SubexonCorrelation &seCorrelation, std::vector<struct _transcript> &transcripts ) ; + + static bool CompSortTranscripts( const struct _transcript &a, const struct _transcript &b ) + { + if ( a.first < b.first ) + return true ; + else if ( a.first > b.first ) + return false ; + + int diffPos = a.seVector.GetFirstDifference( b.seVector ) ; + if ( diffPos == -1 ) + return false ; + if ( a.seVector.Test( diffPos ) ) + return true ; + else + return false ; + } + + static bool CompSortPairs( const struct _pair32 &x, const struct _pair32 &y ) + { + if ( x.a != y.a ) + return x.a < y.a ; + else + return x.b < y.b ; + } + + static bool CompSortPairsByB( const struct _pair32 &x, const struct _pair32 &y ) + { + return x.b < y.b ; + } + + static int CompPairsByB( const void *p1, const void *p2 ) + { + return ((struct _pair32 *)p1)->b - ((struct _pair32 *)p2)->b ; + } + + double ComputeScore( double cnt, double weight, double a, double A, double correlation ) + { + if ( a > A * 0.1 ) + return ( cnt * weight ) * ( 1 + pow( a / A, 0.25 ) ) + correlation ; + else + return ( cnt * weight ) * ( 1 + a / A ) + correlation ; + //return ( cnt ) * ( exp( 1 + a / A ) ) + correlation ; + } + + int GetFather( int f, int *father ) ; + + void ConvertTranscriptAbundanceToFPKM( struct _subexon *subexons, struct _transcript &t, int readCnt = 1000000 ) + { + int txptLen = 0 ; + int i, size ; + + std::vector<int> subexonInd ; + t.seVector.GetOnesIndices( subexonInd ) ; + size = subexonInd.size() ; + for ( i = 0 ; i < size ; ++i ) + txptLen += ( subexons[ subexonInd[i] ].end - subexons[ subexonInd[i] ].start + 1 ) ; + double factor = 1 ; + if ( alignments.matePaired ) + factor = 0.5 ; + t.FPKM = t.abundance * factor / ( ( readCnt / 1000000.0 ) * ( txptLen / 1000.0 ) ) ; + } + + int GetTranscriptLengthFromAbundanceAndFPKM( double abundance, double FPKM, int readCnt = 1000000 ) + { + double factor = 1 ; + if ( alignments.matePaired ) + factor = 0.5 ; + return int( abundance * factor / ( FPKM / 1000.0 ) / ( readCnt / 1000000.0 ) + 0.5 ) ; + } + + void CoalesceSameTranscripts( std::vector<struct _transcript> &t ) ; + + + // Initialize the structure to store transcript id + void InitTranscriptId() ; + + int GetTranscriptGeneId( std::vector<int> &subexonInd, struct _subexon *subexons ) ; + int GetTranscriptGeneId( struct _transcript &t, struct _subexon *subexons ) ; + int RemoveNegativeAbundTranscripts( std::vector<struct _transcript> &transcripts ) + { + int i, j ; + int tcnt = transcripts.size() ; + j = 0 ; + for ( i = 0 ; i < tcnt ; ++i ) + { + if ( transcripts[i].abundance < 0 ) + { + transcripts[i].seVector.Release() ; // Don't forget release the memory. + continue ; + } + transcripts[j] = transcripts[i] ; + ++j ; + } + transcripts.resize( j ) ; + return j ; + } + + void AbundanceEstimation( struct _subexon *subexons, int seCnt, Constraints &constraints, std::vector<struct _transcript> &transcripts ) ; + + int RefineTranscripts( struct _subexon *subexons, int seCnt, bool aggressive, std::map<int, int> *subexonChainSupport, int *txptSampleSupport, std::vector<struct _transcript> &transcripts, Constraints &constraints ) ; + + void ComputeTranscriptsScore( struct _subexon *subexons, int seCnt, std::map<int, int> *subexonChainSupport, std::vector<struct _transcript> &transcripts ) ; + + void OutputTranscript( int sampleId, struct _subexon *subexons, struct _transcript &transcript ) ; + + void PrintLog( const char *fmt, ... ) + { + char buffer[10021] ; + va_list args ; + va_start( args, fmt ) ; + vsprintf( buffer, fmt, args ) ; + + time_t mytime = time(NULL) ; + struct tm *localT = localtime( &mytime ) ; + char stime[500] ; + strftime( stime, sizeof( stime ), "%c", localT ) ; + fprintf( stderr, "[%s] %s\n", stime, buffer ) ; + } + + +public: + TranscriptDecider( double f, double c, double d, int sampleCnt, Alignments &a ): alignments( a ) + { + FPKMFraction = f ; + canBeSoftBoundaryThreshold = c ; + txptMinReadDepth = d ; + usedGeneId = 0 ; + defaultGeneId[0] = -1 ; + defaultGeneId[1] = -1 ; + maxDpConstraintSize = -1 ; + numThreads = 1 ; + this->sampleCnt = sampleCnt ; + dpHash = new struct _dp[ HASH_MAX ] ; // pre-allocated buffer to hold dp information. + } + ~TranscriptDecider() + { + int i ; + if ( numThreads == 1 ) + { + int size = outputFPs.size() ; + for ( i = 0 ; i < size ; ++i ) + { + fclose( outputFPs[i] ) ; + } + } + delete[] dpHash ; + } + + + // @return: the number of assembled transcript + int Solve( struct _subexon *subexons, int seCnt, std::vector<Constraints> &constraints, SubexonCorrelation &subexonCorrelation ) ; + + void SetOutputFPs( char *outputPrefix ) + { + int i ; + char buffer[1024] ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + if ( outputPrefix[0] ) + sprintf( buffer, "%s_sample_%d.gtf", outputPrefix, i ) ; + else + sprintf( buffer, "sample_%d.gtf", i ) ; + FILE *fp = fopen( buffer, "w" ) ; + outputFPs.push_back( fp ) ; + } + } + + void SetMultiThreadOutputHandler( MultiThreadOutputTranscript *h ) + { + outputHandler = h ; + } + + void SetNumThreads( int t ) + { + numThreads = t ; + } + + void SetMaxDpConstraintSize(int size) + { + maxDpConstraintSize = size ; + } +} ; + +void *TranscriptDeciderSolve_Wrapper( void *arg ) ; + +#endif