diff PsiCLASS-1.0.2/classes.cpp @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/PsiCLASS-1.0.2/classes.cpp	Fri Mar 26 16:52:45 2021 +0000
@@ -0,0 +1,491 @@
+#include <stdio.h>
+#include <getopt.h>
+#include <vector>
+#include <pthread.h>
+
+#include "alignments.hpp"
+#include "SubexonGraph.hpp"
+#include "SubexonCorrelation.hpp"
+#include "Constraints.hpp"
+#include "TranscriptDecider.hpp"
+
+char usage[] = "./classes [OPTIONS]:\n"
+	"Required:\n"
+	"\t-s STRING: path to the subexon file.\n"
+	"\t-b STRING: path to the BAM file.\n"
+	"\t\tor\n"
+	"\t--lb STRING: path to the file of the list of BAM files.\n"
+	"Optional:\n"
+	"\t-p INT: number of threads. (default: 1)\n"
+	"\t-o STRING: the prefix of the output file. (default: not used)\n"
+	"\t-c FLOAT: only use the subexons with classifier score <= than the given number. (default: 0.05)\n" 
+	"\t-f FLOAT: filter the transcript from the gene if its abundance is lower than the given number percent of the most abundant one. (default: 0.05)\n"
+	"\t-d FLOAT: filter the transcript whose average read depth is less than the given number. (default: 2.5)\n"
+	"\t--ls STRING: path to the file of the list of single-sample subexon files. (default: not used)\n"
+	"\t--hasMateIdSuffix: the read id has suffix such as .1, .2 for a mate pair. (default: false)\n"
+	"\t--maxDpConstraintSize: the maximum number of subexons a constraint can cover in dynamic programming. (default: 7; -1 for inf)\n"
+	"\t--primaryParalog: use primary alignment to retain paralog genes instead of unique alignments. (default: not used)\n"
+	;
+
+static const char *short_options = "s:b:f:o:d:p:c:h" ;
+static struct option long_options[] =
+	{
+		{ "ls", required_argument, 0, 10000 },
+		{ "lb", required_argument, 0, 10001 },
+		{ "hasMateIdSuffix", no_argument, 0, 10002 },
+		{ "primaryParalog", no_argument, 0, 10003 },
+		{ "maxDpConstraintSize", required_argument, 0, 10004 },
+		{ (char *)0, 0, 0, 0} 
+	} ;
+
+struct _getAlignmentsInfoThreadArg
+{
+	std::vector<Alignments> *pAlignmentFiles ;
+	int numThreads ;
+	int tid ;
+} ;
+
+struct _getConstraintsThreadArg
+{
+	std::vector<Constraints> *pMultiSampleConstraints ;	
+	int numThreads ;
+	int tid ;
+
+	struct _subexon *subexons ;
+	int seCnt ;
+	int start, end ;
+} ;
+
+void *GetAlignmentsInfo_Thread( void *pArg )
+{
+	int i ;
+	std::vector <Alignments> &alignmentFiles = *( ( (struct _getAlignmentsInfoThreadArg *)pArg )->pAlignmentFiles ) ;
+	int tid = ( (struct _getAlignmentsInfoThreadArg *)pArg )->tid ;
+	int numThreads = ( (struct _getAlignmentsInfoThreadArg *)pArg )->numThreads ;
+	int size = alignmentFiles.size() ;
+	for ( i = 0 ; i < size ; ++i )
+	{
+		if ( i % numThreads == tid )
+		{
+			alignmentFiles[i].GetGeneralInfo( true ) ;
+			alignmentFiles[i].Rewind() ;
+		}
+	}
+	
+	pthread_exit( NULL ) ;
+}
+
+
+void *GetConstraints_Thread( void *pArg )
+{
+	int i ;
+	struct _getConstraintsThreadArg &arg = *( (struct _getConstraintsThreadArg *)pArg ) ;
+	std::vector<Constraints> &multiSampleConstraints = *( arg.pMultiSampleConstraints ) ;
+	int tid = arg.tid ;
+	int numThreads = arg.numThreads ;
+	int size = multiSampleConstraints.size() ;
+	for ( i = 0 ; i < size ; ++i )
+	{
+		if ( i % numThreads == tid )
+			multiSampleConstraints[i].BuildConstraints( arg.subexons, arg.seCnt, arg.start, arg.end ) ;
+	}
+	pthread_exit( NULL ) ;
+}
+
+int main( int argc, char *argv[] )
+{
+	int i, j ;
+	int size ;
+
+	if ( argc <= 1 )
+	{
+		printf( "%s", usage ) ;
+		return 0 ;
+	}
+
+	int c, option_index ; // For getopt
+	option_index = 0 ;
+	FILE *fpSubexon = NULL ;
+	double FPKMFraction = 0.05 ; 
+	double classifierThreshold ;
+	double txptMinReadDepth = 2.5 ;
+	char outputPrefix[1024] = "" ;
+	int numThreads = 1 ;
+	bool hasMateReadIdSuffix = false ;
+	bool usePrimaryAsUnique = false ;
+	int maxDpConstraintSize = 7 ;
+	
+	std::vector<Alignments> alignmentFiles ;
+	SubexonCorrelation subexonCorrelation ;
+	
+	classifierThreshold = 0.05 ;
+	while ( 1 )
+	{
+		c = getopt_long( argc, argv, short_options, long_options, &option_index ) ;
+		if ( c == -1 )
+			break ;
+
+		if ( c == 's' )
+		{
+			fpSubexon = fopen( optarg, "r" ) ;
+		}
+		else if ( c == 'b' )
+		{
+			Alignments a ;
+			a.Open( optarg ) ;
+			//a.SetAllowClip( false ) ;
+			alignmentFiles.push_back( a ) ;
+		}
+		else if ( c == 'c' )
+		{
+			classifierThreshold = atof( optarg ) ;
+		}
+		else if ( c == 'f' )
+		{
+			FPKMFraction = atof( optarg ) ; 		
+		}
+		else if ( c == 'o' )
+		{
+			strcpy( outputPrefix, optarg ) ;	
+		}
+		else if ( c == 'd' )
+		{
+			txptMinReadDepth = atof( optarg ) ;
+		}
+		else if ( c == 'p' )
+		{
+			numThreads = atoi( optarg ) ;
+		}
+		else if ( c == 10000 ) // the list of subexon files.
+		{
+			subexonCorrelation.Initialize( optarg ) ;
+		}
+		else if ( c == 10001 ) // the list of bam files.
+		{
+			FILE *fp = fopen( optarg, "r" ) ;
+			char buffer[1024] ;
+			while ( fgets( buffer, sizeof( buffer ), fp ) != NULL )
+			{
+				int len = strlen( buffer ) ;
+				if ( buffer[len - 1] == '\n' )
+				{
+					buffer[len - 1] = '\0' ;
+					--len ;
+
+				}
+				Alignments a ;
+				a.Open( buffer ) ;
+				alignmentFiles.push_back( a ) ;
+			}
+			fclose( fp ) ;
+		}
+		else if ( c == 10002 ) // the mate pair read id has suffix.
+		{
+			hasMateReadIdSuffix = true ;
+		}
+		else if ( c == 10003 ) // do not check unique alignment
+		{
+			usePrimaryAsUnique = true ;
+		}
+		else if ( c == 10004 ) // maxDpConstraintSize
+		{
+			maxDpConstraintSize = atoi(optarg) ;
+		}
+		else
+		{
+			printf( "%s", usage ) ;
+			exit( 1 ) ;
+		}
+	}
+	if ( fpSubexon == NULL )			
+	{
+		printf( "Cannot find combined subexon file.\n" ) ;
+		exit( 1 ) ;
+	}
+	if ( alignmentFiles.size() < 1 )
+	{
+		printf( "Must use -b option to specify BAM files.\n" ) ;
+		exit( 1 ) ;
+	}
+
+
+	if ( alignmentFiles.size() < 50 )
+	{
+		size = alignmentFiles.size() ;
+		for ( i = 0 ; i < size ; ++i )
+		{
+			alignmentFiles[i].GetGeneralInfo( true ) ;
+			alignmentFiles[i].Rewind() ;
+		}
+	}
+	else
+	{
+		struct _getAlignmentsInfoThreadArg *args = new struct _getAlignmentsInfoThreadArg[ numThreads ] ;
+		pthread_attr_t pthreadAttr ;
+		pthread_t *threads ;
+		
+		pthread_attr_init( &pthreadAttr ) ;
+		pthread_attr_setdetachstate( &pthreadAttr, PTHREAD_CREATE_JOINABLE ) ;
+
+		threads = new pthread_t[ numThreads ] ;
+
+		for ( i = 0 ; i < numThreads ; ++i )
+		{
+			args[i].pAlignmentFiles = &alignmentFiles ;
+			args[i].tid = i ;
+			args[i].numThreads = numThreads ;
+
+			pthread_create( &threads[i], &pthreadAttr, GetAlignmentsInfo_Thread, &args[i] ) ;
+		}
+
+		for ( i = 0 ; i < numThreads ; ++i )
+		{
+			pthread_join( threads[i], NULL ) ;
+		}
+
+		pthread_attr_destroy( &pthreadAttr ) ;
+		delete[] args ;
+		delete[] threads ;
+	}
+
+	// Build the subexon graph
+	SubexonGraph subexonGraph( classifierThreshold, alignmentFiles[0], fpSubexon ) ;
+	subexonGraph.ComputeGeneIntervals() ;
+	
+	// Solve gene by gene
+	int sampleCnt = alignmentFiles.size() ;
+	std::vector<Constraints> multiSampleConstraints ;
+	for ( i = 0 ; i < sampleCnt ; ++i )
+	{
+		Constraints constraints( &alignmentFiles[i] ) ;
+		constraints.SetHasMateReadIdSuffix( hasMateReadIdSuffix ) ;
+		constraints.SetUsePrimaryAsUnique( usePrimaryAsUnique ) ;
+		multiSampleConstraints.push_back( constraints ) ;
+	}
+	MultiThreadOutputTranscript outputHandler( sampleCnt, alignmentFiles[0] ) ;
+	outputHandler.SetOutputFPs( outputPrefix ) ;
+
+	if ( numThreads <= 1 )
+	{
+		TranscriptDecider transcriptDecider( FPKMFraction, classifierThreshold, txptMinReadDepth, sampleCnt, alignmentFiles[0] ) ;
+		
+		transcriptDecider.SetMultiThreadOutputHandler( &outputHandler ) ;
+		transcriptDecider.SetNumThreads( numThreads ) ;
+		transcriptDecider.SetMaxDpConstraintSize( maxDpConstraintSize ) ;
+
+		int giCnt = subexonGraph.geneIntervals.size() ;
+		for ( i = 0 ; i < giCnt ; ++i )
+		{
+			struct _geneInterval gi = subexonGraph.geneIntervals[i] ;
+			struct _subexon *intervalSubexons = new struct _subexon[ gi.endIdx - gi.startIdx + 1 ] ;
+			subexonGraph.ExtractSubexons( gi.startIdx, gi.endIdx, intervalSubexons ) ;
+			printf( "%d: %d %s %d %d\n", i, gi.endIdx - gi.startIdx + 1, 
+					alignmentFiles[0].GetChromName( intervalSubexons[0].chrId ), 
+					gi.start + 1, gi.end + 1 ) ;	
+			fflush( stdout ) ;
+
+			subexonCorrelation.ComputeCorrelation( intervalSubexons, gi.endIdx - gi.startIdx + 1, alignmentFiles[0] ) ;
+			for ( j = 0 ; j < sampleCnt ; ++j )
+				multiSampleConstraints[j].BuildConstraints( intervalSubexons, gi.endIdx - gi.startIdx + 1, gi.start, gi.end ) ;	
+
+			transcriptDecider.Solve( intervalSubexons, gi.endIdx - gi.startIdx + 1, multiSampleConstraints, subexonCorrelation ) ;
+
+			for ( j = 0 ; j < gi.endIdx - gi.startIdx + 1 ; ++j )
+			{
+				delete[] intervalSubexons[j].prev ;
+				delete[] intervalSubexons[j].next ;
+			}
+			delete[] intervalSubexons ;
+		}
+	}
+	else // multi-thread case.
+	{
+		--numThreads ; // one thread is used for read in the data.
+		
+		// Allocate memory
+		struct _transcriptDeciderThreadArg *pArgs = new struct _transcriptDeciderThreadArg[ numThreads ] ;
+		pthread_mutex_t ftLock ;
+		int *freeThreads ;
+		int ftCnt ;
+		pthread_cond_t fullWorkCond ;
+		pthread_attr_t pthreadAttr ;
+		pthread_t *threads ;
+		pthread_t *getConstraintsThreads ;
+		bool *initThreads ;
+
+		pthread_mutex_init( &ftLock, NULL ) ;
+		pthread_cond_init( &fullWorkCond, NULL ) ;
+		pthread_attr_init( &pthreadAttr ) ;
+		pthread_attr_setdetachstate( &pthreadAttr, PTHREAD_CREATE_JOINABLE ) ;
+
+		threads = new pthread_t[ numThreads ] ;
+		getConstraintsThreads = new pthread_t[ numThreads ] ;
+		initThreads = new bool[numThreads] ;
+		freeThreads = new int[ numThreads ] ;
+		ftCnt = numThreads ;
+		for ( i = 0 ; i < numThreads ; ++i )
+		{
+			pArgs[i].tid = i ;
+			pArgs[i].sampleCnt = sampleCnt ;
+			pArgs[i].numThreads = numThreads ;
+			pArgs[i].maxDpConstraintSize = maxDpConstraintSize ;
+			pArgs[i].FPKMFraction = FPKMFraction ;
+			pArgs[i].classifierThreshold = classifierThreshold ;
+			pArgs[i].txptMinReadDepth = txptMinReadDepth ;
+			pArgs[i].alignments = &alignmentFiles[0] ;
+			//pArgs[i].constraints = new std::vector<Constraints> ;
+			pArgs[i].outputHandler = &outputHandler ;
+
+			freeThreads[i] = i ;
+			pArgs[i].freeThreads = freeThreads ;
+			pArgs[i].ftCnt = &ftCnt ;
+			pArgs[i].ftLock = &ftLock ;
+			pArgs[i].fullWorkCond = &fullWorkCond ;
+			
+			for ( j = 0 ; j < sampleCnt ; ++j )
+			{
+				Constraints constraints( &alignmentFiles[j] ) ;
+				pArgs[i].constraints.push_back( constraints ) ;
+			}
+
+			initThreads[i] = false ;
+		}
+
+
+		// Read in and distribute the work
+		int giCnt = subexonGraph.geneIntervals.size() ;
+		for ( i = 0 ; i < giCnt ; ++i )
+		{
+			struct _geneInterval gi = subexonGraph.geneIntervals[i] ;
+			struct _subexon *intervalSubexons = new struct _subexon[ gi.endIdx - gi.startIdx + 1 ] ;
+			subexonGraph.ExtractSubexons( gi.startIdx, gi.endIdx, intervalSubexons ) ;
+			subexonCorrelation.ComputeCorrelation( intervalSubexons, gi.endIdx - gi.startIdx + 1, alignmentFiles[0] ) ;
+			printf( "%d: %d %s %d %d. Free threads: %d/%d\n", i, gi.endIdx - gi.startIdx + 1, 
+					alignmentFiles[0].GetChromName( intervalSubexons[0].chrId ), 
+					gi.start + 1, gi.end + 1, ftCnt, numThreads + 1 ) ;	
+			fflush( stdout ) ;
+			
+			int gctCnt = ftCnt ;
+			if ( gctCnt > 1 && sampleCnt > 1 )
+			{
+				gctCnt = ( gctCnt < sampleCnt ? gctCnt : sampleCnt ) ;	
+				struct _getConstraintsThreadArg *args = new struct _getConstraintsThreadArg[ gctCnt ] ;
+				for ( j = 0 ; j < gctCnt ; ++j )
+				{
+					args[j].pMultiSampleConstraints = &multiSampleConstraints ;
+					args[j].numThreads = gctCnt ;
+					args[j].tid = j ;
+					args[j].subexons = intervalSubexons ;
+					args[j].seCnt = gi.endIdx - gi.startIdx + 1 ;
+					args[j].start = gi.start ; args[j].end = gi.end ;
+					pthread_create( &getConstraintsThreads[j], &pthreadAttr, GetConstraints_Thread, &args[j] ) ;
+				}
+
+
+				
+				for ( j = 0 ; j < gctCnt ; ++j )
+					pthread_join( getConstraintsThreads[j], NULL ) ;		
+
+				delete[] args ;
+			}
+			else
+			{
+				for ( j = 0 ; j < sampleCnt ; ++j )
+					multiSampleConstraints[j].BuildConstraints( intervalSubexons, gi.endIdx - gi.startIdx + 1, gi.start, gi.end ) ;	
+			}
+			
+			// Search for the free queue.
+			int tag = -1 ; // get the working thread.
+			pthread_mutex_lock( &ftLock ) ;
+			if ( ftCnt == 0 )
+			{
+				pthread_cond_wait( &fullWorkCond, &ftLock ) ;	
+			}
+			tag = freeThreads[ ftCnt - 1 ] ;
+			--ftCnt ;
+			pthread_mutex_unlock( &ftLock ) ;
+			
+			if ( initThreads[tag] )
+				pthread_join( threads[tag], NULL ) ; // Make sure the chosen thread exits.
+			
+			// Assign the subexons, the constraints and correlation content.
+			pArgs[tag].subexons = new struct _subexon[gi.endIdx - gi.startIdx + 1] ;
+			pArgs[tag].seCnt = gi.endIdx - gi.startIdx + 1 ;
+			for ( j = 0 ; j < pArgs[tag].seCnt ; ++j )
+			{
+				pArgs[tag].subexons[j] = intervalSubexons[j] ;
+				int cnt = intervalSubexons[j].prevCnt ;
+				pArgs[tag].subexons[j].prev = new int[cnt] ;
+				memcpy( pArgs[tag].subexons[j].prev, intervalSubexons[j].prev, sizeof( int ) * cnt ) ;
+				cnt = intervalSubexons[j].nextCnt ;
+				pArgs[tag].subexons[j].next = new int[cnt] ;
+				memcpy( pArgs[tag].subexons[j].next, intervalSubexons[j].next, sizeof( int ) * cnt ) ;
+			}
+
+			for ( j = 0 ; j < sampleCnt ; ++j )
+			{
+				pArgs[tag].constraints[j].Assign( multiSampleConstraints[ j ] ) ;
+			}
+			pArgs[tag].subexonCorrelation.Assign( subexonCorrelation ) ;
+			pthread_create( &threads[tag], &pthreadAttr, TranscriptDeciderSolve_Wrapper, &pArgs[tag] ) ;
+			initThreads[tag] = true ;
+			for ( j = 0 ; j < gi.endIdx - gi.startIdx + 1 ; ++j )
+			{
+				delete[] intervalSubexons[j].prev ;
+				delete[] intervalSubexons[j].next ;
+			}
+			delete[] intervalSubexons ;
+		}
+
+		for ( i = 0 ; i < numThreads ; ++i )
+		{
+			if ( initThreads[i] )
+				pthread_join( threads[i], NULL ) ;
+		}
+
+		// Release memory
+		for ( i = 0 ; i < numThreads ; ++i )
+		{
+			std::vector<Constraints>().swap( pArgs[i].constraints ) ;
+		}
+		delete []pArgs ;
+		pthread_attr_destroy( &pthreadAttr ) ;
+		pthread_mutex_destroy( &ftLock ) ;
+		pthread_cond_destroy( &fullWorkCond ) ;
+
+		delete[] threads ;
+		delete[] getConstraintsThreads ;
+		delete[] initThreads ;
+		delete[] freeThreads ;
+	} // end of else for multi-thread.
+
+	outputHandler.OutputCommandInfo( argc, argv ) ; 
+	for ( i = 0 ; i < sampleCnt ; ++i )
+	{
+		char buffer[1024] ;
+		alignmentFiles[i].GetFileName( buffer ) ;
+		int separator = -1 ;
+		
+		// deprive the directory.
+		for ( j = 0 ; buffer[j] ; ++j )
+		{
+			if ( buffer[j] == '/' )
+				separator = j ;
+		}
+		if ( separator != -1 )
+		{
+			++separator ;
+			for ( j = separator ; buffer[j] ; ++j )
+				buffer[j - separator] = buffer[j] ;
+			buffer[j - separator] = '\0' ;
+		}
+		outputHandler.OutputCommentToSampleGTF( i, buffer ) ;
+	}
+	outputHandler.ComputeFPKMTPM( alignmentFiles ) ;
+	outputHandler.Flush() ;
+	
+	for ( i = 0 ; i < sampleCnt ; ++i )
+		alignmentFiles[i].Close() ;
+	fclose( fpSubexon ) ;
+	return 0 ;
+}