Mercurial > repos > lsong10 > psiclass
diff PsiCLASS-1.0.2/classes.cpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/classes.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,491 @@ +#include <stdio.h> +#include <getopt.h> +#include <vector> +#include <pthread.h> + +#include "alignments.hpp" +#include "SubexonGraph.hpp" +#include "SubexonCorrelation.hpp" +#include "Constraints.hpp" +#include "TranscriptDecider.hpp" + +char usage[] = "./classes [OPTIONS]:\n" + "Required:\n" + "\t-s STRING: path to the subexon file.\n" + "\t-b STRING: path to the BAM file.\n" + "\t\tor\n" + "\t--lb STRING: path to the file of the list of BAM files.\n" + "Optional:\n" + "\t-p INT: number of threads. (default: 1)\n" + "\t-o STRING: the prefix of the output file. (default: not used)\n" + "\t-c FLOAT: only use the subexons with classifier score <= than the given number. (default: 0.05)\n" + "\t-f FLOAT: filter the transcript from the gene if its abundance is lower than the given number percent of the most abundant one. (default: 0.05)\n" + "\t-d FLOAT: filter the transcript whose average read depth is less than the given number. (default: 2.5)\n" + "\t--ls STRING: path to the file of the list of single-sample subexon files. (default: not used)\n" + "\t--hasMateIdSuffix: the read id has suffix such as .1, .2 for a mate pair. (default: false)\n" + "\t--maxDpConstraintSize: the maximum number of subexons a constraint can cover in dynamic programming. (default: 7; -1 for inf)\n" + "\t--primaryParalog: use primary alignment to retain paralog genes instead of unique alignments. (default: not used)\n" + ; + +static const char *short_options = "s:b:f:o:d:p:c:h" ; +static struct option long_options[] = + { + { "ls", required_argument, 0, 10000 }, + { "lb", required_argument, 0, 10001 }, + { "hasMateIdSuffix", no_argument, 0, 10002 }, + { "primaryParalog", no_argument, 0, 10003 }, + { "maxDpConstraintSize", required_argument, 0, 10004 }, + { (char *)0, 0, 0, 0} + } ; + +struct _getAlignmentsInfoThreadArg +{ + std::vector<Alignments> *pAlignmentFiles ; + int numThreads ; + int tid ; +} ; + +struct _getConstraintsThreadArg +{ + std::vector<Constraints> *pMultiSampleConstraints ; + int numThreads ; + int tid ; + + struct _subexon *subexons ; + int seCnt ; + int start, end ; +} ; + +void *GetAlignmentsInfo_Thread( void *pArg ) +{ + int i ; + std::vector <Alignments> &alignmentFiles = *( ( (struct _getAlignmentsInfoThreadArg *)pArg )->pAlignmentFiles ) ; + int tid = ( (struct _getAlignmentsInfoThreadArg *)pArg )->tid ; + int numThreads = ( (struct _getAlignmentsInfoThreadArg *)pArg )->numThreads ; + int size = alignmentFiles.size() ; + for ( i = 0 ; i < size ; ++i ) + { + if ( i % numThreads == tid ) + { + alignmentFiles[i].GetGeneralInfo( true ) ; + alignmentFiles[i].Rewind() ; + } + } + + pthread_exit( NULL ) ; +} + + +void *GetConstraints_Thread( void *pArg ) +{ + int i ; + struct _getConstraintsThreadArg &arg = *( (struct _getConstraintsThreadArg *)pArg ) ; + std::vector<Constraints> &multiSampleConstraints = *( arg.pMultiSampleConstraints ) ; + int tid = arg.tid ; + int numThreads = arg.numThreads ; + int size = multiSampleConstraints.size() ; + for ( i = 0 ; i < size ; ++i ) + { + if ( i % numThreads == tid ) + multiSampleConstraints[i].BuildConstraints( arg.subexons, arg.seCnt, arg.start, arg.end ) ; + } + pthread_exit( NULL ) ; +} + +int main( int argc, char *argv[] ) +{ + int i, j ; + int size ; + + if ( argc <= 1 ) + { + printf( "%s", usage ) ; + return 0 ; + } + + int c, option_index ; // For getopt + option_index = 0 ; + FILE *fpSubexon = NULL ; + double FPKMFraction = 0.05 ; + double classifierThreshold ; + double txptMinReadDepth = 2.5 ; + char outputPrefix[1024] = "" ; + int numThreads = 1 ; + bool hasMateReadIdSuffix = false ; + bool usePrimaryAsUnique = false ; + int maxDpConstraintSize = 7 ; + + std::vector<Alignments> alignmentFiles ; + SubexonCorrelation subexonCorrelation ; + + classifierThreshold = 0.05 ; + while ( 1 ) + { + c = getopt_long( argc, argv, short_options, long_options, &option_index ) ; + if ( c == -1 ) + break ; + + if ( c == 's' ) + { + fpSubexon = fopen( optarg, "r" ) ; + } + else if ( c == 'b' ) + { + Alignments a ; + a.Open( optarg ) ; + //a.SetAllowClip( false ) ; + alignmentFiles.push_back( a ) ; + } + else if ( c == 'c' ) + { + classifierThreshold = atof( optarg ) ; + } + else if ( c == 'f' ) + { + FPKMFraction = atof( optarg ) ; + } + else if ( c == 'o' ) + { + strcpy( outputPrefix, optarg ) ; + } + else if ( c == 'd' ) + { + txptMinReadDepth = atof( optarg ) ; + } + else if ( c == 'p' ) + { + numThreads = atoi( optarg ) ; + } + else if ( c == 10000 ) // the list of subexon files. + { + subexonCorrelation.Initialize( optarg ) ; + } + else if ( c == 10001 ) // the list of bam files. + { + FILE *fp = fopen( optarg, "r" ) ; + char buffer[1024] ; + while ( fgets( buffer, sizeof( buffer ), fp ) != NULL ) + { + int len = strlen( buffer ) ; + if ( buffer[len - 1] == '\n' ) + { + buffer[len - 1] = '\0' ; + --len ; + + } + Alignments a ; + a.Open( buffer ) ; + alignmentFiles.push_back( a ) ; + } + fclose( fp ) ; + } + else if ( c == 10002 ) // the mate pair read id has suffix. + { + hasMateReadIdSuffix = true ; + } + else if ( c == 10003 ) // do not check unique alignment + { + usePrimaryAsUnique = true ; + } + else if ( c == 10004 ) // maxDpConstraintSize + { + maxDpConstraintSize = atoi(optarg) ; + } + else + { + printf( "%s", usage ) ; + exit( 1 ) ; + } + } + if ( fpSubexon == NULL ) + { + printf( "Cannot find combined subexon file.\n" ) ; + exit( 1 ) ; + } + if ( alignmentFiles.size() < 1 ) + { + printf( "Must use -b option to specify BAM files.\n" ) ; + exit( 1 ) ; + } + + + if ( alignmentFiles.size() < 50 ) + { + size = alignmentFiles.size() ; + for ( i = 0 ; i < size ; ++i ) + { + alignmentFiles[i].GetGeneralInfo( true ) ; + alignmentFiles[i].Rewind() ; + } + } + else + { + struct _getAlignmentsInfoThreadArg *args = new struct _getAlignmentsInfoThreadArg[ numThreads ] ; + pthread_attr_t pthreadAttr ; + pthread_t *threads ; + + pthread_attr_init( &pthreadAttr ) ; + pthread_attr_setdetachstate( &pthreadAttr, PTHREAD_CREATE_JOINABLE ) ; + + threads = new pthread_t[ numThreads ] ; + + for ( i = 0 ; i < numThreads ; ++i ) + { + args[i].pAlignmentFiles = &alignmentFiles ; + args[i].tid = i ; + args[i].numThreads = numThreads ; + + pthread_create( &threads[i], &pthreadAttr, GetAlignmentsInfo_Thread, &args[i] ) ; + } + + for ( i = 0 ; i < numThreads ; ++i ) + { + pthread_join( threads[i], NULL ) ; + } + + pthread_attr_destroy( &pthreadAttr ) ; + delete[] args ; + delete[] threads ; + } + + // Build the subexon graph + SubexonGraph subexonGraph( classifierThreshold, alignmentFiles[0], fpSubexon ) ; + subexonGraph.ComputeGeneIntervals() ; + + // Solve gene by gene + int sampleCnt = alignmentFiles.size() ; + std::vector<Constraints> multiSampleConstraints ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + Constraints constraints( &alignmentFiles[i] ) ; + constraints.SetHasMateReadIdSuffix( hasMateReadIdSuffix ) ; + constraints.SetUsePrimaryAsUnique( usePrimaryAsUnique ) ; + multiSampleConstraints.push_back( constraints ) ; + } + MultiThreadOutputTranscript outputHandler( sampleCnt, alignmentFiles[0] ) ; + outputHandler.SetOutputFPs( outputPrefix ) ; + + if ( numThreads <= 1 ) + { + TranscriptDecider transcriptDecider( FPKMFraction, classifierThreshold, txptMinReadDepth, sampleCnt, alignmentFiles[0] ) ; + + transcriptDecider.SetMultiThreadOutputHandler( &outputHandler ) ; + transcriptDecider.SetNumThreads( numThreads ) ; + transcriptDecider.SetMaxDpConstraintSize( maxDpConstraintSize ) ; + + int giCnt = subexonGraph.geneIntervals.size() ; + for ( i = 0 ; i < giCnt ; ++i ) + { + struct _geneInterval gi = subexonGraph.geneIntervals[i] ; + struct _subexon *intervalSubexons = new struct _subexon[ gi.endIdx - gi.startIdx + 1 ] ; + subexonGraph.ExtractSubexons( gi.startIdx, gi.endIdx, intervalSubexons ) ; + printf( "%d: %d %s %d %d\n", i, gi.endIdx - gi.startIdx + 1, + alignmentFiles[0].GetChromName( intervalSubexons[0].chrId ), + gi.start + 1, gi.end + 1 ) ; + fflush( stdout ) ; + + subexonCorrelation.ComputeCorrelation( intervalSubexons, gi.endIdx - gi.startIdx + 1, alignmentFiles[0] ) ; + for ( j = 0 ; j < sampleCnt ; ++j ) + multiSampleConstraints[j].BuildConstraints( intervalSubexons, gi.endIdx - gi.startIdx + 1, gi.start, gi.end ) ; + + transcriptDecider.Solve( intervalSubexons, gi.endIdx - gi.startIdx + 1, multiSampleConstraints, subexonCorrelation ) ; + + for ( j = 0 ; j < gi.endIdx - gi.startIdx + 1 ; ++j ) + { + delete[] intervalSubexons[j].prev ; + delete[] intervalSubexons[j].next ; + } + delete[] intervalSubexons ; + } + } + else // multi-thread case. + { + --numThreads ; // one thread is used for read in the data. + + // Allocate memory + struct _transcriptDeciderThreadArg *pArgs = new struct _transcriptDeciderThreadArg[ numThreads ] ; + pthread_mutex_t ftLock ; + int *freeThreads ; + int ftCnt ; + pthread_cond_t fullWorkCond ; + pthread_attr_t pthreadAttr ; + pthread_t *threads ; + pthread_t *getConstraintsThreads ; + bool *initThreads ; + + pthread_mutex_init( &ftLock, NULL ) ; + pthread_cond_init( &fullWorkCond, NULL ) ; + pthread_attr_init( &pthreadAttr ) ; + pthread_attr_setdetachstate( &pthreadAttr, PTHREAD_CREATE_JOINABLE ) ; + + threads = new pthread_t[ numThreads ] ; + getConstraintsThreads = new pthread_t[ numThreads ] ; + initThreads = new bool[numThreads] ; + freeThreads = new int[ numThreads ] ; + ftCnt = numThreads ; + for ( i = 0 ; i < numThreads ; ++i ) + { + pArgs[i].tid = i ; + pArgs[i].sampleCnt = sampleCnt ; + pArgs[i].numThreads = numThreads ; + pArgs[i].maxDpConstraintSize = maxDpConstraintSize ; + pArgs[i].FPKMFraction = FPKMFraction ; + pArgs[i].classifierThreshold = classifierThreshold ; + pArgs[i].txptMinReadDepth = txptMinReadDepth ; + pArgs[i].alignments = &alignmentFiles[0] ; + //pArgs[i].constraints = new std::vector<Constraints> ; + pArgs[i].outputHandler = &outputHandler ; + + freeThreads[i] = i ; + pArgs[i].freeThreads = freeThreads ; + pArgs[i].ftCnt = &ftCnt ; + pArgs[i].ftLock = &ftLock ; + pArgs[i].fullWorkCond = &fullWorkCond ; + + for ( j = 0 ; j < sampleCnt ; ++j ) + { + Constraints constraints( &alignmentFiles[j] ) ; + pArgs[i].constraints.push_back( constraints ) ; + } + + initThreads[i] = false ; + } + + + // Read in and distribute the work + int giCnt = subexonGraph.geneIntervals.size() ; + for ( i = 0 ; i < giCnt ; ++i ) + { + struct _geneInterval gi = subexonGraph.geneIntervals[i] ; + struct _subexon *intervalSubexons = new struct _subexon[ gi.endIdx - gi.startIdx + 1 ] ; + subexonGraph.ExtractSubexons( gi.startIdx, gi.endIdx, intervalSubexons ) ; + subexonCorrelation.ComputeCorrelation( intervalSubexons, gi.endIdx - gi.startIdx + 1, alignmentFiles[0] ) ; + printf( "%d: %d %s %d %d. Free threads: %d/%d\n", i, gi.endIdx - gi.startIdx + 1, + alignmentFiles[0].GetChromName( intervalSubexons[0].chrId ), + gi.start + 1, gi.end + 1, ftCnt, numThreads + 1 ) ; + fflush( stdout ) ; + + int gctCnt = ftCnt ; + if ( gctCnt > 1 && sampleCnt > 1 ) + { + gctCnt = ( gctCnt < sampleCnt ? gctCnt : sampleCnt ) ; + struct _getConstraintsThreadArg *args = new struct _getConstraintsThreadArg[ gctCnt ] ; + for ( j = 0 ; j < gctCnt ; ++j ) + { + args[j].pMultiSampleConstraints = &multiSampleConstraints ; + args[j].numThreads = gctCnt ; + args[j].tid = j ; + args[j].subexons = intervalSubexons ; + args[j].seCnt = gi.endIdx - gi.startIdx + 1 ; + args[j].start = gi.start ; args[j].end = gi.end ; + pthread_create( &getConstraintsThreads[j], &pthreadAttr, GetConstraints_Thread, &args[j] ) ; + } + + + + for ( j = 0 ; j < gctCnt ; ++j ) + pthread_join( getConstraintsThreads[j], NULL ) ; + + delete[] args ; + } + else + { + for ( j = 0 ; j < sampleCnt ; ++j ) + multiSampleConstraints[j].BuildConstraints( intervalSubexons, gi.endIdx - gi.startIdx + 1, gi.start, gi.end ) ; + } + + // Search for the free queue. + int tag = -1 ; // get the working thread. + pthread_mutex_lock( &ftLock ) ; + if ( ftCnt == 0 ) + { + pthread_cond_wait( &fullWorkCond, &ftLock ) ; + } + tag = freeThreads[ ftCnt - 1 ] ; + --ftCnt ; + pthread_mutex_unlock( &ftLock ) ; + + if ( initThreads[tag] ) + pthread_join( threads[tag], NULL ) ; // Make sure the chosen thread exits. + + // Assign the subexons, the constraints and correlation content. + pArgs[tag].subexons = new struct _subexon[gi.endIdx - gi.startIdx + 1] ; + pArgs[tag].seCnt = gi.endIdx - gi.startIdx + 1 ; + for ( j = 0 ; j < pArgs[tag].seCnt ; ++j ) + { + pArgs[tag].subexons[j] = intervalSubexons[j] ; + int cnt = intervalSubexons[j].prevCnt ; + pArgs[tag].subexons[j].prev = new int[cnt] ; + memcpy( pArgs[tag].subexons[j].prev, intervalSubexons[j].prev, sizeof( int ) * cnt ) ; + cnt = intervalSubexons[j].nextCnt ; + pArgs[tag].subexons[j].next = new int[cnt] ; + memcpy( pArgs[tag].subexons[j].next, intervalSubexons[j].next, sizeof( int ) * cnt ) ; + } + + for ( j = 0 ; j < sampleCnt ; ++j ) + { + pArgs[tag].constraints[j].Assign( multiSampleConstraints[ j ] ) ; + } + pArgs[tag].subexonCorrelation.Assign( subexonCorrelation ) ; + pthread_create( &threads[tag], &pthreadAttr, TranscriptDeciderSolve_Wrapper, &pArgs[tag] ) ; + initThreads[tag] = true ; + for ( j = 0 ; j < gi.endIdx - gi.startIdx + 1 ; ++j ) + { + delete[] intervalSubexons[j].prev ; + delete[] intervalSubexons[j].next ; + } + delete[] intervalSubexons ; + } + + for ( i = 0 ; i < numThreads ; ++i ) + { + if ( initThreads[i] ) + pthread_join( threads[i], NULL ) ; + } + + // Release memory + for ( i = 0 ; i < numThreads ; ++i ) + { + std::vector<Constraints>().swap( pArgs[i].constraints ) ; + } + delete []pArgs ; + pthread_attr_destroy( &pthreadAttr ) ; + pthread_mutex_destroy( &ftLock ) ; + pthread_cond_destroy( &fullWorkCond ) ; + + delete[] threads ; + delete[] getConstraintsThreads ; + delete[] initThreads ; + delete[] freeThreads ; + } // end of else for multi-thread. + + outputHandler.OutputCommandInfo( argc, argv ) ; + for ( i = 0 ; i < sampleCnt ; ++i ) + { + char buffer[1024] ; + alignmentFiles[i].GetFileName( buffer ) ; + int separator = -1 ; + + // deprive the directory. + for ( j = 0 ; buffer[j] ; ++j ) + { + if ( buffer[j] == '/' ) + separator = j ; + } + if ( separator != -1 ) + { + ++separator ; + for ( j = separator ; buffer[j] ; ++j ) + buffer[j - separator] = buffer[j] ; + buffer[j - separator] = '\0' ; + } + outputHandler.OutputCommentToSampleGTF( i, buffer ) ; + } + outputHandler.ComputeFPKMTPM( alignmentFiles ) ; + outputHandler.Flush() ; + + for ( i = 0 ; i < sampleCnt ; ++i ) + alignmentFiles[i].Close() ; + fclose( fpSubexon ) ; + return 0 ; +}