diff PsiCLASS-1.0.2/Vote.cpp @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/PsiCLASS-1.0.2/Vote.cpp	Fri Mar 26 16:52:45 2021 +0000
@@ -0,0 +1,432 @@
+// The program that vote to pick the trusted transcripts.
+#include <stdio.h>
+#include <stdlib.h>
+#include <map>
+#include <vector>
+#include <algorithm>
+#include <string.h>
+#include <string>
+
+#include "defs.h"
+#include "TranscriptDecider.hpp"
+
+char usage[] = "./transcript-vote [OPTIONS] > output.gtf:\n"
+	"Required:\n"
+	"\t--lg: path to the list of GTF files.\n"
+	"Optional:\n" 
+	"\t-d FLOAT: threshold of average coverage depth across all the samples. (default: 1)\n"
+	//"\t-n INT: the number of samples a transcript showed up. (default: 3)\n"
+	;
+
+
+/*struct _transcript
+{
+	int chrId ;
+	int geneId ; 
+	char strand ;
+	struct _pair32 *exons ;
+	int ecnt ;
+	int sampleId ;
+} ;*/
+
+void GetGTFField( char *s, const char *field, char *ret )
+{
+	char *p = strstr( s, field ) ;
+	if ( p == NULL )
+		return ;
+	for ( ; *p != ' ' ; ++p )
+		;
+	p += 2 ; // add extra 1 to skip \"
+	sscanf( p, "%s", ret ) ;
+	//printf( "+%s %d\n", tid, strlen( tid )  ) ;
+	p = ret + strlen( ret ) ;
+	while ( *p != '\"' )
+		--p ;
+	*p = '\0' ;
+}
+
+int GetTailNumber( char *s )
+{
+	int len = strlen( s ) ;
+	int ret = 0 ;
+	int i ;
+	int factor = 1 ;
+	for ( i = len - 1 ; i >= 0 && s[i] >= '0' && s[i] <= '9' ; --i, factor *= 10 )
+	{
+		ret += factor * ( s[i] - '0' ) ;
+	}
+	return ret ;
+}
+
+int CompDouble( const void *p1, const void *p2 )
+{
+	double a = *(double *)p1 ;
+	double b = *(double *)p2 ;
+	if ( a > b )
+		return 1 ;
+	else if ( a < b )
+		return -1 ;
+	else
+		return 0 ;
+}
+
+int main( int argc, char *argv[] )
+{
+	int i, j, k ;
+	double minAvgDepth = 1.0 ;
+	double fraction = 1.0 ;
+	int minSampleCnt = 3 ;
+	std::map<std::string, int> chrNameToId ;
+	std::map<int, std::string> chrIdToName ;
+
+	FILE *fpGTFlist = NULL ;
+	FILE *fp = NULL ;
+	for ( i = 1 ; i < argc ; ++i )
+	{
+		if ( !strcmp( argv[i], "--lg" ) )
+		{
+			fpGTFlist = fopen( argv[i + 1], "r" ) ;
+			++i ;
+		}
+		else if ( !strcmp( argv[i], "-d" ) )
+		{
+			minAvgDepth = atof( argv[i + 1] ) ;
+			++i ;
+		}
+		/*else if ( !strcmp( argv[i], "-n" ) )
+		{
+			minSampleCnt = atoi( argv[i + 1] ) ;
+			++i ;
+		}*/
+		else 
+		{
+			printf( "%s", usage ) ;
+			exit( 1 ) ;
+		}
+	}
+	if ( fpGTFlist == NULL )
+	{
+		printf( "Must use --lg option to speicfy the list of GTF files.\n%s", usage ) ;
+		exit( 1 ) ;
+	}
+	
+	std::vector<struct _outputTranscript> transcripts ;
+	std::vector<struct _outputTranscript> outputTranscripts ;
+
+	char buffer[4096] ;
+	char line[10000] ;
+	char chrom[50], tool[20], type[40], strand[3] ;
+	//char tid[50] ;
+	int start, end ;
+	std::vector<struct _pair32> tmpExons ;
+	int sampleCnt = 0 ;
+	int gid = -1 ;
+	int tid = -1 ;
+	int chrId = -1 ;
+	int chrIdUsed = 0 ;
+	char cStrand = '.' ;
+	char prefix[50] ;
+	double FPKM = 0, TPM = 0, cov = 0 ;
+
+	while ( fgets( buffer, sizeof( buffer ), fpGTFlist ) != NULL )
+	{
+		int len = strlen( buffer ) ;	
+		if ( buffer[len - 1] == '\n' )
+		{
+			buffer[len - 1] = '\0' ; 
+			--len ;
+		}
+		fp = fopen( buffer, "r" ) ;
+
+
+		while ( fgets( line, sizeof( line ), fp ) != NULL )
+		{
+			if ( line[0] == '#' )
+				continue ;
+
+			sscanf( line, "%s %s %s %d %d %s %s", chrom, tool, type, &start, &end, buffer, strand ) ;
+
+			if ( strcmp( type, "exon" ) )
+			{
+				if ( tmpExons.size() > 0 )
+				{
+					struct _outputTranscript nt ;
+					nt.sampleId = sampleCnt ; 
+					nt.chrId = chrId ;
+					nt.geneId = gid ;
+					nt.transcriptId = tid ;
+					nt.strand = cStrand ;
+					nt.ecnt = tmpExons.size() ;
+					nt.exons = new struct _pair32[nt.ecnt] ;
+					nt.FPKM = FPKM ;
+					nt.TPM = TPM ;
+					nt.cov = cov ;
+					for ( i = 0 ; i < nt.ecnt ; ++i )
+						nt.exons[i] = tmpExons[i] ;
+					tmpExons.clear() ;
+					transcripts.push_back( nt ) ;
+				}
+				continue ;
+			}
+
+			if ( chrNameToId.find( std::string( chrom ) ) == chrNameToId.end() )
+			{
+				chrId = chrIdUsed ;
+
+				std::string s( chrom ) ;
+				chrNameToId[s] = chrIdUsed ;
+				chrIdToName[ chrIdUsed ] = s ;
+				++chrIdUsed ;
+			}
+			else
+				chrId = chrNameToId[ std::string( chrom ) ] ;
+
+			GetGTFField( line, "gene_id", buffer ) ;
+			gid = GetTailNumber( buffer ) ;
+			
+			GetGTFField( line, "transcript_id", buffer ) ;
+			tid = GetTailNumber( buffer ) ;
+
+			GetGTFField( line, "FPKM", buffer ) ;
+			FPKM = atof( buffer ) ;
+			
+			GetGTFField( line, "TPM", buffer ) ;
+			TPM = atof( buffer ) ;
+
+			GetGTFField( line, "cov", buffer ) ;
+			cov = atof( buffer ) ;
+
+			cStrand = strand[0] ;
+
+			// Look for the user-defined prefix.
+			int len = strlen( buffer ) ;
+			j = 0 ;
+			for ( i = len - 1 ; i >= 0 ; --i )
+			{
+				if ( buffer[i] == '.' )
+				{
+					++j ;
+					if ( j >= 2 )
+						break ;
+				}
+			}
+
+			for ( j = 0 ; j < i ; ++i )
+			{
+				prefix[j] = buffer[j] ;
+			}
+			if ( i > 0 )
+			{
+				prefix[j] = '.' ;
+				prefix[j + 1] = '\0' ;
+			}
+			else
+				prefix[0] = '\0' ;
+
+			struct _pair32 ne ;
+			ne.a = start ;
+			ne.b = end ;
+			tmpExons.push_back( ne ) ;
+		}
+		if ( tmpExons.size() > 0 )
+		{
+			struct _outputTranscript nt ;
+			nt.sampleId = sampleCnt ; 
+			nt.chrId = chrId ;
+			nt.geneId = gid ;
+			nt.transcriptId = tid ;
+			nt.strand = cStrand ;
+			nt.ecnt = tmpExons.size() ;
+			nt.exons = new struct _pair32[nt.ecnt] ;
+			nt.FPKM = FPKM ;
+			nt.TPM = TPM ;
+			nt.cov = cov ;
+			for ( i = 0 ; i < nt.ecnt ; ++i )
+				nt.exons[i] = tmpExons[i] ;
+			tmpExons.clear() ;
+			transcripts.push_back( nt ) ;
+		}
+
+		++sampleCnt ;
+		fclose( fp ) ;
+	}
+	fclose( fpGTFlist ) ;
+
+	// Coalesce same transcripts
+	std::sort( transcripts.begin(), transcripts.end(), MultiThreadOutputTranscript::CompSortTranscripts ) ;
+	std::vector<int> sampleSupport ;
+	int size = transcripts.size() ;
+	if ( minSampleCnt > sampleCnt )
+		minSampleCnt = sampleCnt ;
+
+	for ( i = 0 ; i < size ; )
+	{
+		int l ;
+		for ( j = i + 1 ; j < size ; ++j )
+		{
+			if ( MultiThreadOutputTranscript::CompTranscripts( transcripts[i], transcripts[j] ) )
+				break ;
+		}
+		// [i,j) are the same transcripts.
+		/*if ( j - i >= fraction * sampleCnt && j - i >= minSampleCnt )
+		{
+			outputTranscripts.push_back( transcripts[i] ) ;
+		}*/
+
+		double sumFPKM = 0 ;
+		double sumTPM = 0 ;
+		double sumCov = 0 ;
+		for ( l = i ; l < j ; ++l )
+		{
+			sumFPKM += transcripts[l].FPKM ;
+			sumTPM += transcripts[l].TPM ;
+			sumCov += transcripts[l].cov ;
+		}
+
+		transcripts[k] = transcripts[i] ;
+		for ( l = i + 1 ; l < j ; ++l )
+			delete[] transcripts[l].exons ;
+
+		transcripts[k].FPKM = sumFPKM / ( j - i ) ;
+		transcripts[k].TPM = sumTPM / ( j - i ) ;
+		transcripts[k].cov = sumCov / ( j - i ) ;
+		
+		if ( ( j - i < int( fraction * sampleCnt ) || j - i <  minSampleCnt ) && sumCov < minAvgDepth * sampleCnt )
+		//if ( transcripts[k].score * ( j - i ) < 1 && ( j - i ) < sampleCnt * 0.5 )
+		//if ( sumTPM < sampleCnt || sumFPKM < sampleCnt )
+		//if ( sumCov < minAvgDepthn * sampleCnt )
+		{
+			transcripts[k].FPKM = -1 ;
+		}
+		else
+			sampleSupport.push_back( j - i ) ;
+		++k ;
+		i = j ;
+	}
+	transcripts.resize( k ) ;	
+	
+	// The motivation to separate the coalscing and voting is to allow
+	// a gene with no txpt passing the vote to pick a representative.
+	size = k ;
+	for ( i = 0 ; i < size ; )
+	{
+		int cnt = 0 ;
+		if ( transcripts[i].FPKM != -1 )
+			++cnt ;
+
+		for ( j = i + 1 ; j < size ; ++j )
+		{
+			if ( transcripts[i].geneId != transcripts[j].geneId )
+				break ;
+			if ( transcripts[j].FPKM != -1 )
+				++cnt ;
+		}
+		int l ;
+		/*double *freq = new double[cnt] ;
+		for ( l = 0 ; l < cnt ; ++l )
+			freq[l] = transcripts[l].FPKM / sampleCnt ;
+		qsort( freq, cnt, sizeof( double ), CompDouble ) ;
+
+		for ( l = 0 ; l < cnt ; ++l )
+			printf( "%lf\n", freq[l] ) ;
+		printf( "===========\n" ) ;
+		delete[] freq ;*/
+		/*if ( cnt == 0 )
+		{
+			int maxEcnt = -1 ;
+			int maxCnt = 0 ;
+			int maxtag ;
+			for ( l = i ; l < j ; ++l )
+				if ( transcripts[l].ecnt > maxEcnt )
+					maxEcnt = transcripts[l].ecnt ;
+			
+			if ( maxEcnt >= 2 )
+			{
+				int maxSampleSupport = -1 ;
+				maxtag = i ;
+				for ( l = i ; l < j ; ++l )
+				{
+					if ( transcripts[l].ecnt > 1 && transcripts[l].ecnt >= 2  )
+					{
+						if ( sampleSupport[l] > maxSampleSupport )
+						{
+							maxSampleSupport = sampleSupport[l] ;
+							maxtag = l ;				
+							maxCnt = 1 ;
+						}
+						else if ( sampleSupport[l] == maxSampleSupport )
+							++maxCnt ;
+					}
+				}
+				
+				if ( maxSampleSupport >= 3 && maxSampleSupport >= ( fraction / 10 ) * sampleCnt && transcripts[maxtag].TPM > 0)
+				{
+					if ( maxCnt > 1 )
+					{
+						int maxTPM = -1 ;
+						for ( l = i ; l < j ; ++l )
+						{
+							if ( sampleSupport[l] != maxSampleSupport )
+								continue ;
+							if ( transcripts[l].TPM > maxTPM )
+							{
+								maxTPM = transcripts[l].TPM ;
+								maxtag = l ;
+							}
+						}
+
+					}
+					transcripts[maxtag].FPKM = 0 ;
+					//fprintf( stderr, "recovered: %d %d\n", sampleSupport[ maxtag ], transcripts[ maxtag ].ecnt ) ;
+					outputTranscripts.push_back( transcripts[maxtag] ) ;
+				}
+			}
+		}
+		else*/
+		{
+			for ( l = i ; l < j ; ++l )
+			{
+				//if ( transcripts[l].FPKM >= fraction * sampleCnt && transcripts[l].FPKM >= minSampleCnt )
+				if ( transcripts[l].FPKM != -1 )
+				{
+					outputTranscripts.push_back( transcripts[l] ) ;
+				}
+			}
+		}
+		
+		i = j ;
+	}
+	
+	// Output
+	size = outputTranscripts.size() ;
+	//printf( "%d\n", size ) ;
+	int transcriptId = 0 ;
+	int prevGid = -1 ;
+	for ( i = 0 ; i < size ; ++i )
+	{
+		struct _outputTranscript &t = outputTranscripts[i] ;
+		const char *chrom = chrIdToName[t.chrId].c_str() ;
+		/*if ( t.geneId != prevGid )
+			transcriptId = 0 ;
+		else
+			++transcriptId ;*/
+		transcriptId = outputTranscripts[i].transcriptId ;
+		fprintf( stdout, "%s\tPsiCLASS\ttranscript\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; transcript_id \"%s%s.%d.%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"%.6lf\"; sample_cnt \"%d\";\n",
+				chrom, t.exons[0].a, t.exons[t.ecnt - 1].b, t.strand,
+				prefix, chrom, t.geneId,
+				prefix, chrom, t.geneId, transcriptId, t.FPKM, t.TPM, t.cov, sampleSupport[i] ) ;
+		for ( j = 0 ; j < t.ecnt ; ++j )
+		{
+			fprintf( stdout, "%s\tPsiCLASS\texon\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; "
+					"transcript_id \"%s%s.%d.%d\"; exon_number \"%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"%.6lf\"; sample_cnt \"%d\";\n",
+					chrom, t.exons[j].a, t.exons[j].b, t.strand,
+					prefix, chrom, t.geneId,
+					prefix, chrom, t.geneId, transcriptId,
+					j + 1, t.FPKM, t.TPM, t.cov,
+					sampleSupport[i] ) ;
+		}
+
+		prevGid = t.geneId ;
+	}
+	return 0 ;
+}