Mercurial > repos > lsong10 > psiclass
diff PsiCLASS-1.0.2/Vote.cpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/Vote.cpp Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,432 @@ +// The program that vote to pick the trusted transcripts. +#include <stdio.h> +#include <stdlib.h> +#include <map> +#include <vector> +#include <algorithm> +#include <string.h> +#include <string> + +#include "defs.h" +#include "TranscriptDecider.hpp" + +char usage[] = "./transcript-vote [OPTIONS] > output.gtf:\n" + "Required:\n" + "\t--lg: path to the list of GTF files.\n" + "Optional:\n" + "\t-d FLOAT: threshold of average coverage depth across all the samples. (default: 1)\n" + //"\t-n INT: the number of samples a transcript showed up. (default: 3)\n" + ; + + +/*struct _transcript +{ + int chrId ; + int geneId ; + char strand ; + struct _pair32 *exons ; + int ecnt ; + int sampleId ; +} ;*/ + +void GetGTFField( char *s, const char *field, char *ret ) +{ + char *p = strstr( s, field ) ; + if ( p == NULL ) + return ; + for ( ; *p != ' ' ; ++p ) + ; + p += 2 ; // add extra 1 to skip \" + sscanf( p, "%s", ret ) ; + //printf( "+%s %d\n", tid, strlen( tid ) ) ; + p = ret + strlen( ret ) ; + while ( *p != '\"' ) + --p ; + *p = '\0' ; +} + +int GetTailNumber( char *s ) +{ + int len = strlen( s ) ; + int ret = 0 ; + int i ; + int factor = 1 ; + for ( i = len - 1 ; i >= 0 && s[i] >= '0' && s[i] <= '9' ; --i, factor *= 10 ) + { + ret += factor * ( s[i] - '0' ) ; + } + return ret ; +} + +int CompDouble( const void *p1, const void *p2 ) +{ + double a = *(double *)p1 ; + double b = *(double *)p2 ; + if ( a > b ) + return 1 ; + else if ( a < b ) + return -1 ; + else + return 0 ; +} + +int main( int argc, char *argv[] ) +{ + int i, j, k ; + double minAvgDepth = 1.0 ; + double fraction = 1.0 ; + int minSampleCnt = 3 ; + std::map<std::string, int> chrNameToId ; + std::map<int, std::string> chrIdToName ; + + FILE *fpGTFlist = NULL ; + FILE *fp = NULL ; + for ( i = 1 ; i < argc ; ++i ) + { + if ( !strcmp( argv[i], "--lg" ) ) + { + fpGTFlist = fopen( argv[i + 1], "r" ) ; + ++i ; + } + else if ( !strcmp( argv[i], "-d" ) ) + { + minAvgDepth = atof( argv[i + 1] ) ; + ++i ; + } + /*else if ( !strcmp( argv[i], "-n" ) ) + { + minSampleCnt = atoi( argv[i + 1] ) ; + ++i ; + }*/ + else + { + printf( "%s", usage ) ; + exit( 1 ) ; + } + } + if ( fpGTFlist == NULL ) + { + printf( "Must use --lg option to speicfy the list of GTF files.\n%s", usage ) ; + exit( 1 ) ; + } + + std::vector<struct _outputTranscript> transcripts ; + std::vector<struct _outputTranscript> outputTranscripts ; + + char buffer[4096] ; + char line[10000] ; + char chrom[50], tool[20], type[40], strand[3] ; + //char tid[50] ; + int start, end ; + std::vector<struct _pair32> tmpExons ; + int sampleCnt = 0 ; + int gid = -1 ; + int tid = -1 ; + int chrId = -1 ; + int chrIdUsed = 0 ; + char cStrand = '.' ; + char prefix[50] ; + double FPKM = 0, TPM = 0, cov = 0 ; + + while ( fgets( buffer, sizeof( buffer ), fpGTFlist ) != NULL ) + { + int len = strlen( buffer ) ; + if ( buffer[len - 1] == '\n' ) + { + buffer[len - 1] = '\0' ; + --len ; + } + fp = fopen( buffer, "r" ) ; + + + while ( fgets( line, sizeof( line ), fp ) != NULL ) + { + if ( line[0] == '#' ) + continue ; + + sscanf( line, "%s %s %s %d %d %s %s", chrom, tool, type, &start, &end, buffer, strand ) ; + + if ( strcmp( type, "exon" ) ) + { + if ( tmpExons.size() > 0 ) + { + struct _outputTranscript nt ; + nt.sampleId = sampleCnt ; + nt.chrId = chrId ; + nt.geneId = gid ; + nt.transcriptId = tid ; + nt.strand = cStrand ; + nt.ecnt = tmpExons.size() ; + nt.exons = new struct _pair32[nt.ecnt] ; + nt.FPKM = FPKM ; + nt.TPM = TPM ; + nt.cov = cov ; + for ( i = 0 ; i < nt.ecnt ; ++i ) + nt.exons[i] = tmpExons[i] ; + tmpExons.clear() ; + transcripts.push_back( nt ) ; + } + continue ; + } + + if ( chrNameToId.find( std::string( chrom ) ) == chrNameToId.end() ) + { + chrId = chrIdUsed ; + + std::string s( chrom ) ; + chrNameToId[s] = chrIdUsed ; + chrIdToName[ chrIdUsed ] = s ; + ++chrIdUsed ; + } + else + chrId = chrNameToId[ std::string( chrom ) ] ; + + GetGTFField( line, "gene_id", buffer ) ; + gid = GetTailNumber( buffer ) ; + + GetGTFField( line, "transcript_id", buffer ) ; + tid = GetTailNumber( buffer ) ; + + GetGTFField( line, "FPKM", buffer ) ; + FPKM = atof( buffer ) ; + + GetGTFField( line, "TPM", buffer ) ; + TPM = atof( buffer ) ; + + GetGTFField( line, "cov", buffer ) ; + cov = atof( buffer ) ; + + cStrand = strand[0] ; + + // Look for the user-defined prefix. + int len = strlen( buffer ) ; + j = 0 ; + for ( i = len - 1 ; i >= 0 ; --i ) + { + if ( buffer[i] == '.' ) + { + ++j ; + if ( j >= 2 ) + break ; + } + } + + for ( j = 0 ; j < i ; ++i ) + { + prefix[j] = buffer[j] ; + } + if ( i > 0 ) + { + prefix[j] = '.' ; + prefix[j + 1] = '\0' ; + } + else + prefix[0] = '\0' ; + + struct _pair32 ne ; + ne.a = start ; + ne.b = end ; + tmpExons.push_back( ne ) ; + } + if ( tmpExons.size() > 0 ) + { + struct _outputTranscript nt ; + nt.sampleId = sampleCnt ; + nt.chrId = chrId ; + nt.geneId = gid ; + nt.transcriptId = tid ; + nt.strand = cStrand ; + nt.ecnt = tmpExons.size() ; + nt.exons = new struct _pair32[nt.ecnt] ; + nt.FPKM = FPKM ; + nt.TPM = TPM ; + nt.cov = cov ; + for ( i = 0 ; i < nt.ecnt ; ++i ) + nt.exons[i] = tmpExons[i] ; + tmpExons.clear() ; + transcripts.push_back( nt ) ; + } + + ++sampleCnt ; + fclose( fp ) ; + } + fclose( fpGTFlist ) ; + + // Coalesce same transcripts + std::sort( transcripts.begin(), transcripts.end(), MultiThreadOutputTranscript::CompSortTranscripts ) ; + std::vector<int> sampleSupport ; + int size = transcripts.size() ; + if ( minSampleCnt > sampleCnt ) + minSampleCnt = sampleCnt ; + + for ( i = 0 ; i < size ; ) + { + int l ; + for ( j = i + 1 ; j < size ; ++j ) + { + if ( MultiThreadOutputTranscript::CompTranscripts( transcripts[i], transcripts[j] ) ) + break ; + } + // [i,j) are the same transcripts. + /*if ( j - i >= fraction * sampleCnt && j - i >= minSampleCnt ) + { + outputTranscripts.push_back( transcripts[i] ) ; + }*/ + + double sumFPKM = 0 ; + double sumTPM = 0 ; + double sumCov = 0 ; + for ( l = i ; l < j ; ++l ) + { + sumFPKM += transcripts[l].FPKM ; + sumTPM += transcripts[l].TPM ; + sumCov += transcripts[l].cov ; + } + + transcripts[k] = transcripts[i] ; + for ( l = i + 1 ; l < j ; ++l ) + delete[] transcripts[l].exons ; + + transcripts[k].FPKM = sumFPKM / ( j - i ) ; + transcripts[k].TPM = sumTPM / ( j - i ) ; + transcripts[k].cov = sumCov / ( j - i ) ; + + if ( ( j - i < int( fraction * sampleCnt ) || j - i < minSampleCnt ) && sumCov < minAvgDepth * sampleCnt ) + //if ( transcripts[k].score * ( j - i ) < 1 && ( j - i ) < sampleCnt * 0.5 ) + //if ( sumTPM < sampleCnt || sumFPKM < sampleCnt ) + //if ( sumCov < minAvgDepthn * sampleCnt ) + { + transcripts[k].FPKM = -1 ; + } + else + sampleSupport.push_back( j - i ) ; + ++k ; + i = j ; + } + transcripts.resize( k ) ; + + // The motivation to separate the coalscing and voting is to allow + // a gene with no txpt passing the vote to pick a representative. + size = k ; + for ( i = 0 ; i < size ; ) + { + int cnt = 0 ; + if ( transcripts[i].FPKM != -1 ) + ++cnt ; + + for ( j = i + 1 ; j < size ; ++j ) + { + if ( transcripts[i].geneId != transcripts[j].geneId ) + break ; + if ( transcripts[j].FPKM != -1 ) + ++cnt ; + } + int l ; + /*double *freq = new double[cnt] ; + for ( l = 0 ; l < cnt ; ++l ) + freq[l] = transcripts[l].FPKM / sampleCnt ; + qsort( freq, cnt, sizeof( double ), CompDouble ) ; + + for ( l = 0 ; l < cnt ; ++l ) + printf( "%lf\n", freq[l] ) ; + printf( "===========\n" ) ; + delete[] freq ;*/ + /*if ( cnt == 0 ) + { + int maxEcnt = -1 ; + int maxCnt = 0 ; + int maxtag ; + for ( l = i ; l < j ; ++l ) + if ( transcripts[l].ecnt > maxEcnt ) + maxEcnt = transcripts[l].ecnt ; + + if ( maxEcnt >= 2 ) + { + int maxSampleSupport = -1 ; + maxtag = i ; + for ( l = i ; l < j ; ++l ) + { + if ( transcripts[l].ecnt > 1 && transcripts[l].ecnt >= 2 ) + { + if ( sampleSupport[l] > maxSampleSupport ) + { + maxSampleSupport = sampleSupport[l] ; + maxtag = l ; + maxCnt = 1 ; + } + else if ( sampleSupport[l] == maxSampleSupport ) + ++maxCnt ; + } + } + + if ( maxSampleSupport >= 3 && maxSampleSupport >= ( fraction / 10 ) * sampleCnt && transcripts[maxtag].TPM > 0) + { + if ( maxCnt > 1 ) + { + int maxTPM = -1 ; + for ( l = i ; l < j ; ++l ) + { + if ( sampleSupport[l] != maxSampleSupport ) + continue ; + if ( transcripts[l].TPM > maxTPM ) + { + maxTPM = transcripts[l].TPM ; + maxtag = l ; + } + } + + } + transcripts[maxtag].FPKM = 0 ; + //fprintf( stderr, "recovered: %d %d\n", sampleSupport[ maxtag ], transcripts[ maxtag ].ecnt ) ; + outputTranscripts.push_back( transcripts[maxtag] ) ; + } + } + } + else*/ + { + for ( l = i ; l < j ; ++l ) + { + //if ( transcripts[l].FPKM >= fraction * sampleCnt && transcripts[l].FPKM >= minSampleCnt ) + if ( transcripts[l].FPKM != -1 ) + { + outputTranscripts.push_back( transcripts[l] ) ; + } + } + } + + i = j ; + } + + // Output + size = outputTranscripts.size() ; + //printf( "%d\n", size ) ; + int transcriptId = 0 ; + int prevGid = -1 ; + for ( i = 0 ; i < size ; ++i ) + { + struct _outputTranscript &t = outputTranscripts[i] ; + const char *chrom = chrIdToName[t.chrId].c_str() ; + /*if ( t.geneId != prevGid ) + transcriptId = 0 ; + else + ++transcriptId ;*/ + transcriptId = outputTranscripts[i].transcriptId ; + fprintf( stdout, "%s\tPsiCLASS\ttranscript\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; transcript_id \"%s%s.%d.%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"%.6lf\"; sample_cnt \"%d\";\n", + chrom, t.exons[0].a, t.exons[t.ecnt - 1].b, t.strand, + prefix, chrom, t.geneId, + prefix, chrom, t.geneId, transcriptId, t.FPKM, t.TPM, t.cov, sampleSupport[i] ) ; + for ( j = 0 ; j < t.ecnt ; ++j ) + { + fprintf( stdout, "%s\tPsiCLASS\texon\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; " + "transcript_id \"%s%s.%d.%d\"; exon_number \"%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"%.6lf\"; sample_cnt \"%d\";\n", + chrom, t.exons[j].a, t.exons[j].b, t.strand, + prefix, chrom, t.geneId, + prefix, chrom, t.geneId, transcriptId, + j + 1, t.FPKM, t.TPM, t.cov, + sampleSupport[i] ) ; + } + + prevGid = t.geneId ; + } + return 0 ; +}