Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/Vote.cpp @ 0:903fc43d6227 draft default tip
Uploaded
| author | lsong10 |
|---|---|
| date | Fri, 26 Mar 2021 16:52:45 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:903fc43d6227 |
|---|---|
| 1 // The program that vote to pick the trusted transcripts. | |
| 2 #include <stdio.h> | |
| 3 #include <stdlib.h> | |
| 4 #include <map> | |
| 5 #include <vector> | |
| 6 #include <algorithm> | |
| 7 #include <string.h> | |
| 8 #include <string> | |
| 9 | |
| 10 #include "defs.h" | |
| 11 #include "TranscriptDecider.hpp" | |
| 12 | |
| 13 char usage[] = "./transcript-vote [OPTIONS] > output.gtf:\n" | |
| 14 "Required:\n" | |
| 15 "\t--lg: path to the list of GTF files.\n" | |
| 16 "Optional:\n" | |
| 17 "\t-d FLOAT: threshold of average coverage depth across all the samples. (default: 1)\n" | |
| 18 //"\t-n INT: the number of samples a transcript showed up. (default: 3)\n" | |
| 19 ; | |
| 20 | |
| 21 | |
| 22 /*struct _transcript | |
| 23 { | |
| 24 int chrId ; | |
| 25 int geneId ; | |
| 26 char strand ; | |
| 27 struct _pair32 *exons ; | |
| 28 int ecnt ; | |
| 29 int sampleId ; | |
| 30 } ;*/ | |
| 31 | |
| 32 void GetGTFField( char *s, const char *field, char *ret ) | |
| 33 { | |
| 34 char *p = strstr( s, field ) ; | |
| 35 if ( p == NULL ) | |
| 36 return ; | |
| 37 for ( ; *p != ' ' ; ++p ) | |
| 38 ; | |
| 39 p += 2 ; // add extra 1 to skip \" | |
| 40 sscanf( p, "%s", ret ) ; | |
| 41 //printf( "+%s %d\n", tid, strlen( tid ) ) ; | |
| 42 p = ret + strlen( ret ) ; | |
| 43 while ( *p != '\"' ) | |
| 44 --p ; | |
| 45 *p = '\0' ; | |
| 46 } | |
| 47 | |
| 48 int GetTailNumber( char *s ) | |
| 49 { | |
| 50 int len = strlen( s ) ; | |
| 51 int ret = 0 ; | |
| 52 int i ; | |
| 53 int factor = 1 ; | |
| 54 for ( i = len - 1 ; i >= 0 && s[i] >= '0' && s[i] <= '9' ; --i, factor *= 10 ) | |
| 55 { | |
| 56 ret += factor * ( s[i] - '0' ) ; | |
| 57 } | |
| 58 return ret ; | |
| 59 } | |
| 60 | |
| 61 int CompDouble( const void *p1, const void *p2 ) | |
| 62 { | |
| 63 double a = *(double *)p1 ; | |
| 64 double b = *(double *)p2 ; | |
| 65 if ( a > b ) | |
| 66 return 1 ; | |
| 67 else if ( a < b ) | |
| 68 return -1 ; | |
| 69 else | |
| 70 return 0 ; | |
| 71 } | |
| 72 | |
| 73 int main( int argc, char *argv[] ) | |
| 74 { | |
| 75 int i, j, k ; | |
| 76 double minAvgDepth = 1.0 ; | |
| 77 double fraction = 1.0 ; | |
| 78 int minSampleCnt = 3 ; | |
| 79 std::map<std::string, int> chrNameToId ; | |
| 80 std::map<int, std::string> chrIdToName ; | |
| 81 | |
| 82 FILE *fpGTFlist = NULL ; | |
| 83 FILE *fp = NULL ; | |
| 84 for ( i = 1 ; i < argc ; ++i ) | |
| 85 { | |
| 86 if ( !strcmp( argv[i], "--lg" ) ) | |
| 87 { | |
| 88 fpGTFlist = fopen( argv[i + 1], "r" ) ; | |
| 89 ++i ; | |
| 90 } | |
| 91 else if ( !strcmp( argv[i], "-d" ) ) | |
| 92 { | |
| 93 minAvgDepth = atof( argv[i + 1] ) ; | |
| 94 ++i ; | |
| 95 } | |
| 96 /*else if ( !strcmp( argv[i], "-n" ) ) | |
| 97 { | |
| 98 minSampleCnt = atoi( argv[i + 1] ) ; | |
| 99 ++i ; | |
| 100 }*/ | |
| 101 else | |
| 102 { | |
| 103 printf( "%s", usage ) ; | |
| 104 exit( 1 ) ; | |
| 105 } | |
| 106 } | |
| 107 if ( fpGTFlist == NULL ) | |
| 108 { | |
| 109 printf( "Must use --lg option to speicfy the list of GTF files.\n%s", usage ) ; | |
| 110 exit( 1 ) ; | |
| 111 } | |
| 112 | |
| 113 std::vector<struct _outputTranscript> transcripts ; | |
| 114 std::vector<struct _outputTranscript> outputTranscripts ; | |
| 115 | |
| 116 char buffer[4096] ; | |
| 117 char line[10000] ; | |
| 118 char chrom[50], tool[20], type[40], strand[3] ; | |
| 119 //char tid[50] ; | |
| 120 int start, end ; | |
| 121 std::vector<struct _pair32> tmpExons ; | |
| 122 int sampleCnt = 0 ; | |
| 123 int gid = -1 ; | |
| 124 int tid = -1 ; | |
| 125 int chrId = -1 ; | |
| 126 int chrIdUsed = 0 ; | |
| 127 char cStrand = '.' ; | |
| 128 char prefix[50] ; | |
| 129 double FPKM = 0, TPM = 0, cov = 0 ; | |
| 130 | |
| 131 while ( fgets( buffer, sizeof( buffer ), fpGTFlist ) != NULL ) | |
| 132 { | |
| 133 int len = strlen( buffer ) ; | |
| 134 if ( buffer[len - 1] == '\n' ) | |
| 135 { | |
| 136 buffer[len - 1] = '\0' ; | |
| 137 --len ; | |
| 138 } | |
| 139 fp = fopen( buffer, "r" ) ; | |
| 140 | |
| 141 | |
| 142 while ( fgets( line, sizeof( line ), fp ) != NULL ) | |
| 143 { | |
| 144 if ( line[0] == '#' ) | |
| 145 continue ; | |
| 146 | |
| 147 sscanf( line, "%s %s %s %d %d %s %s", chrom, tool, type, &start, &end, buffer, strand ) ; | |
| 148 | |
| 149 if ( strcmp( type, "exon" ) ) | |
| 150 { | |
| 151 if ( tmpExons.size() > 0 ) | |
| 152 { | |
| 153 struct _outputTranscript nt ; | |
| 154 nt.sampleId = sampleCnt ; | |
| 155 nt.chrId = chrId ; | |
| 156 nt.geneId = gid ; | |
| 157 nt.transcriptId = tid ; | |
| 158 nt.strand = cStrand ; | |
| 159 nt.ecnt = tmpExons.size() ; | |
| 160 nt.exons = new struct _pair32[nt.ecnt] ; | |
| 161 nt.FPKM = FPKM ; | |
| 162 nt.TPM = TPM ; | |
| 163 nt.cov = cov ; | |
| 164 for ( i = 0 ; i < nt.ecnt ; ++i ) | |
| 165 nt.exons[i] = tmpExons[i] ; | |
| 166 tmpExons.clear() ; | |
| 167 transcripts.push_back( nt ) ; | |
| 168 } | |
| 169 continue ; | |
| 170 } | |
| 171 | |
| 172 if ( chrNameToId.find( std::string( chrom ) ) == chrNameToId.end() ) | |
| 173 { | |
| 174 chrId = chrIdUsed ; | |
| 175 | |
| 176 std::string s( chrom ) ; | |
| 177 chrNameToId[s] = chrIdUsed ; | |
| 178 chrIdToName[ chrIdUsed ] = s ; | |
| 179 ++chrIdUsed ; | |
| 180 } | |
| 181 else | |
| 182 chrId = chrNameToId[ std::string( chrom ) ] ; | |
| 183 | |
| 184 GetGTFField( line, "gene_id", buffer ) ; | |
| 185 gid = GetTailNumber( buffer ) ; | |
| 186 | |
| 187 GetGTFField( line, "transcript_id", buffer ) ; | |
| 188 tid = GetTailNumber( buffer ) ; | |
| 189 | |
| 190 GetGTFField( line, "FPKM", buffer ) ; | |
| 191 FPKM = atof( buffer ) ; | |
| 192 | |
| 193 GetGTFField( line, "TPM", buffer ) ; | |
| 194 TPM = atof( buffer ) ; | |
| 195 | |
| 196 GetGTFField( line, "cov", buffer ) ; | |
| 197 cov = atof( buffer ) ; | |
| 198 | |
| 199 cStrand = strand[0] ; | |
| 200 | |
| 201 // Look for the user-defined prefix. | |
| 202 int len = strlen( buffer ) ; | |
| 203 j = 0 ; | |
| 204 for ( i = len - 1 ; i >= 0 ; --i ) | |
| 205 { | |
| 206 if ( buffer[i] == '.' ) | |
| 207 { | |
| 208 ++j ; | |
| 209 if ( j >= 2 ) | |
| 210 break ; | |
| 211 } | |
| 212 } | |
| 213 | |
| 214 for ( j = 0 ; j < i ; ++i ) | |
| 215 { | |
| 216 prefix[j] = buffer[j] ; | |
| 217 } | |
| 218 if ( i > 0 ) | |
| 219 { | |
| 220 prefix[j] = '.' ; | |
| 221 prefix[j + 1] = '\0' ; | |
| 222 } | |
| 223 else | |
| 224 prefix[0] = '\0' ; | |
| 225 | |
| 226 struct _pair32 ne ; | |
| 227 ne.a = start ; | |
| 228 ne.b = end ; | |
| 229 tmpExons.push_back( ne ) ; | |
| 230 } | |
| 231 if ( tmpExons.size() > 0 ) | |
| 232 { | |
| 233 struct _outputTranscript nt ; | |
| 234 nt.sampleId = sampleCnt ; | |
| 235 nt.chrId = chrId ; | |
| 236 nt.geneId = gid ; | |
| 237 nt.transcriptId = tid ; | |
| 238 nt.strand = cStrand ; | |
| 239 nt.ecnt = tmpExons.size() ; | |
| 240 nt.exons = new struct _pair32[nt.ecnt] ; | |
| 241 nt.FPKM = FPKM ; | |
| 242 nt.TPM = TPM ; | |
| 243 nt.cov = cov ; | |
| 244 for ( i = 0 ; i < nt.ecnt ; ++i ) | |
| 245 nt.exons[i] = tmpExons[i] ; | |
| 246 tmpExons.clear() ; | |
| 247 transcripts.push_back( nt ) ; | |
| 248 } | |
| 249 | |
| 250 ++sampleCnt ; | |
| 251 fclose( fp ) ; | |
| 252 } | |
| 253 fclose( fpGTFlist ) ; | |
| 254 | |
| 255 // Coalesce same transcripts | |
| 256 std::sort( transcripts.begin(), transcripts.end(), MultiThreadOutputTranscript::CompSortTranscripts ) ; | |
| 257 std::vector<int> sampleSupport ; | |
| 258 int size = transcripts.size() ; | |
| 259 if ( minSampleCnt > sampleCnt ) | |
| 260 minSampleCnt = sampleCnt ; | |
| 261 | |
| 262 for ( i = 0 ; i < size ; ) | |
| 263 { | |
| 264 int l ; | |
| 265 for ( j = i + 1 ; j < size ; ++j ) | |
| 266 { | |
| 267 if ( MultiThreadOutputTranscript::CompTranscripts( transcripts[i], transcripts[j] ) ) | |
| 268 break ; | |
| 269 } | |
| 270 // [i,j) are the same transcripts. | |
| 271 /*if ( j - i >= fraction * sampleCnt && j - i >= minSampleCnt ) | |
| 272 { | |
| 273 outputTranscripts.push_back( transcripts[i] ) ; | |
| 274 }*/ | |
| 275 | |
| 276 double sumFPKM = 0 ; | |
| 277 double sumTPM = 0 ; | |
| 278 double sumCov = 0 ; | |
| 279 for ( l = i ; l < j ; ++l ) | |
| 280 { | |
| 281 sumFPKM += transcripts[l].FPKM ; | |
| 282 sumTPM += transcripts[l].TPM ; | |
| 283 sumCov += transcripts[l].cov ; | |
| 284 } | |
| 285 | |
| 286 transcripts[k] = transcripts[i] ; | |
| 287 for ( l = i + 1 ; l < j ; ++l ) | |
| 288 delete[] transcripts[l].exons ; | |
| 289 | |
| 290 transcripts[k].FPKM = sumFPKM / ( j - i ) ; | |
| 291 transcripts[k].TPM = sumTPM / ( j - i ) ; | |
| 292 transcripts[k].cov = sumCov / ( j - i ) ; | |
| 293 | |
| 294 if ( ( j - i < int( fraction * sampleCnt ) || j - i < minSampleCnt ) && sumCov < minAvgDepth * sampleCnt ) | |
| 295 //if ( transcripts[k].score * ( j - i ) < 1 && ( j - i ) < sampleCnt * 0.5 ) | |
| 296 //if ( sumTPM < sampleCnt || sumFPKM < sampleCnt ) | |
| 297 //if ( sumCov < minAvgDepthn * sampleCnt ) | |
| 298 { | |
| 299 transcripts[k].FPKM = -1 ; | |
| 300 } | |
| 301 else | |
| 302 sampleSupport.push_back( j - i ) ; | |
| 303 ++k ; | |
| 304 i = j ; | |
| 305 } | |
| 306 transcripts.resize( k ) ; | |
| 307 | |
| 308 // The motivation to separate the coalscing and voting is to allow | |
| 309 // a gene with no txpt passing the vote to pick a representative. | |
| 310 size = k ; | |
| 311 for ( i = 0 ; i < size ; ) | |
| 312 { | |
| 313 int cnt = 0 ; | |
| 314 if ( transcripts[i].FPKM != -1 ) | |
| 315 ++cnt ; | |
| 316 | |
| 317 for ( j = i + 1 ; j < size ; ++j ) | |
| 318 { | |
| 319 if ( transcripts[i].geneId != transcripts[j].geneId ) | |
| 320 break ; | |
| 321 if ( transcripts[j].FPKM != -1 ) | |
| 322 ++cnt ; | |
| 323 } | |
| 324 int l ; | |
| 325 /*double *freq = new double[cnt] ; | |
| 326 for ( l = 0 ; l < cnt ; ++l ) | |
| 327 freq[l] = transcripts[l].FPKM / sampleCnt ; | |
| 328 qsort( freq, cnt, sizeof( double ), CompDouble ) ; | |
| 329 | |
| 330 for ( l = 0 ; l < cnt ; ++l ) | |
| 331 printf( "%lf\n", freq[l] ) ; | |
| 332 printf( "===========\n" ) ; | |
| 333 delete[] freq ;*/ | |
| 334 /*if ( cnt == 0 ) | |
| 335 { | |
| 336 int maxEcnt = -1 ; | |
| 337 int maxCnt = 0 ; | |
| 338 int maxtag ; | |
| 339 for ( l = i ; l < j ; ++l ) | |
| 340 if ( transcripts[l].ecnt > maxEcnt ) | |
| 341 maxEcnt = transcripts[l].ecnt ; | |
| 342 | |
| 343 if ( maxEcnt >= 2 ) | |
| 344 { | |
| 345 int maxSampleSupport = -1 ; | |
| 346 maxtag = i ; | |
| 347 for ( l = i ; l < j ; ++l ) | |
| 348 { | |
| 349 if ( transcripts[l].ecnt > 1 && transcripts[l].ecnt >= 2 ) | |
| 350 { | |
| 351 if ( sampleSupport[l] > maxSampleSupport ) | |
| 352 { | |
| 353 maxSampleSupport = sampleSupport[l] ; | |
| 354 maxtag = l ; | |
| 355 maxCnt = 1 ; | |
| 356 } | |
| 357 else if ( sampleSupport[l] == maxSampleSupport ) | |
| 358 ++maxCnt ; | |
| 359 } | |
| 360 } | |
| 361 | |
| 362 if ( maxSampleSupport >= 3 && maxSampleSupport >= ( fraction / 10 ) * sampleCnt && transcripts[maxtag].TPM > 0) | |
| 363 { | |
| 364 if ( maxCnt > 1 ) | |
| 365 { | |
| 366 int maxTPM = -1 ; | |
| 367 for ( l = i ; l < j ; ++l ) | |
| 368 { | |
| 369 if ( sampleSupport[l] != maxSampleSupport ) | |
| 370 continue ; | |
| 371 if ( transcripts[l].TPM > maxTPM ) | |
| 372 { | |
| 373 maxTPM = transcripts[l].TPM ; | |
| 374 maxtag = l ; | |
| 375 } | |
| 376 } | |
| 377 | |
| 378 } | |
| 379 transcripts[maxtag].FPKM = 0 ; | |
| 380 //fprintf( stderr, "recovered: %d %d\n", sampleSupport[ maxtag ], transcripts[ maxtag ].ecnt ) ; | |
| 381 outputTranscripts.push_back( transcripts[maxtag] ) ; | |
| 382 } | |
| 383 } | |
| 384 } | |
| 385 else*/ | |
| 386 { | |
| 387 for ( l = i ; l < j ; ++l ) | |
| 388 { | |
| 389 //if ( transcripts[l].FPKM >= fraction * sampleCnt && transcripts[l].FPKM >= minSampleCnt ) | |
| 390 if ( transcripts[l].FPKM != -1 ) | |
| 391 { | |
| 392 outputTranscripts.push_back( transcripts[l] ) ; | |
| 393 } | |
| 394 } | |
| 395 } | |
| 396 | |
| 397 i = j ; | |
| 398 } | |
| 399 | |
| 400 // Output | |
| 401 size = outputTranscripts.size() ; | |
| 402 //printf( "%d\n", size ) ; | |
| 403 int transcriptId = 0 ; | |
| 404 int prevGid = -1 ; | |
| 405 for ( i = 0 ; i < size ; ++i ) | |
| 406 { | |
| 407 struct _outputTranscript &t = outputTranscripts[i] ; | |
| 408 const char *chrom = chrIdToName[t.chrId].c_str() ; | |
| 409 /*if ( t.geneId != prevGid ) | |
| 410 transcriptId = 0 ; | |
| 411 else | |
| 412 ++transcriptId ;*/ | |
| 413 transcriptId = outputTranscripts[i].transcriptId ; | |
| 414 fprintf( stdout, "%s\tPsiCLASS\ttranscript\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; transcript_id \"%s%s.%d.%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"%.6lf\"; sample_cnt \"%d\";\n", | |
| 415 chrom, t.exons[0].a, t.exons[t.ecnt - 1].b, t.strand, | |
| 416 prefix, chrom, t.geneId, | |
| 417 prefix, chrom, t.geneId, transcriptId, t.FPKM, t.TPM, t.cov, sampleSupport[i] ) ; | |
| 418 for ( j = 0 ; j < t.ecnt ; ++j ) | |
| 419 { | |
| 420 fprintf( stdout, "%s\tPsiCLASS\texon\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; " | |
| 421 "transcript_id \"%s%s.%d.%d\"; exon_number \"%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"%.6lf\"; sample_cnt \"%d\";\n", | |
| 422 chrom, t.exons[j].a, t.exons[j].b, t.strand, | |
| 423 prefix, chrom, t.geneId, | |
| 424 prefix, chrom, t.geneId, transcriptId, | |
| 425 j + 1, t.FPKM, t.TPM, t.cov, | |
| 426 sampleSupport[i] ) ; | |
| 427 } | |
| 428 | |
| 429 prevGid = t.geneId ; | |
| 430 } | |
| 431 return 0 ; | |
| 432 } |
