0
|
1 // The program that vote to pick the trusted transcripts.
|
|
2 #include <stdio.h>
|
|
3 #include <stdlib.h>
|
|
4 #include <map>
|
|
5 #include <vector>
|
|
6 #include <algorithm>
|
|
7 #include <string.h>
|
|
8 #include <string>
|
|
9
|
|
10 #include "defs.h"
|
|
11 #include "TranscriptDecider.hpp"
|
|
12
|
|
13 char usage[] = "./transcript-vote [OPTIONS] > output.gtf:\n"
|
|
14 "Required:\n"
|
|
15 "\t--lg: path to the list of GTF files.\n"
|
|
16 "Optional:\n"
|
|
17 "\t-d FLOAT: threshold of average coverage depth across all the samples. (default: 1)\n"
|
|
18 //"\t-n INT: the number of samples a transcript showed up. (default: 3)\n"
|
|
19 ;
|
|
20
|
|
21
|
|
22 /*struct _transcript
|
|
23 {
|
|
24 int chrId ;
|
|
25 int geneId ;
|
|
26 char strand ;
|
|
27 struct _pair32 *exons ;
|
|
28 int ecnt ;
|
|
29 int sampleId ;
|
|
30 } ;*/
|
|
31
|
|
32 void GetGTFField( char *s, const char *field, char *ret )
|
|
33 {
|
|
34 char *p = strstr( s, field ) ;
|
|
35 if ( p == NULL )
|
|
36 return ;
|
|
37 for ( ; *p != ' ' ; ++p )
|
|
38 ;
|
|
39 p += 2 ; // add extra 1 to skip \"
|
|
40 sscanf( p, "%s", ret ) ;
|
|
41 //printf( "+%s %d\n", tid, strlen( tid ) ) ;
|
|
42 p = ret + strlen( ret ) ;
|
|
43 while ( *p != '\"' )
|
|
44 --p ;
|
|
45 *p = '\0' ;
|
|
46 }
|
|
47
|
|
48 int GetTailNumber( char *s )
|
|
49 {
|
|
50 int len = strlen( s ) ;
|
|
51 int ret = 0 ;
|
|
52 int i ;
|
|
53 int factor = 1 ;
|
|
54 for ( i = len - 1 ; i >= 0 && s[i] >= '0' && s[i] <= '9' ; --i, factor *= 10 )
|
|
55 {
|
|
56 ret += factor * ( s[i] - '0' ) ;
|
|
57 }
|
|
58 return ret ;
|
|
59 }
|
|
60
|
|
61 int CompDouble( const void *p1, const void *p2 )
|
|
62 {
|
|
63 double a = *(double *)p1 ;
|
|
64 double b = *(double *)p2 ;
|
|
65 if ( a > b )
|
|
66 return 1 ;
|
|
67 else if ( a < b )
|
|
68 return -1 ;
|
|
69 else
|
|
70 return 0 ;
|
|
71 }
|
|
72
|
|
73 int main( int argc, char *argv[] )
|
|
74 {
|
|
75 int i, j, k ;
|
|
76 double minAvgDepth = 1.0 ;
|
|
77 double fraction = 1.0 ;
|
|
78 int minSampleCnt = 3 ;
|
|
79 std::map<std::string, int> chrNameToId ;
|
|
80 std::map<int, std::string> chrIdToName ;
|
|
81
|
|
82 FILE *fpGTFlist = NULL ;
|
|
83 FILE *fp = NULL ;
|
|
84 for ( i = 1 ; i < argc ; ++i )
|
|
85 {
|
|
86 if ( !strcmp( argv[i], "--lg" ) )
|
|
87 {
|
|
88 fpGTFlist = fopen( argv[i + 1], "r" ) ;
|
|
89 ++i ;
|
|
90 }
|
|
91 else if ( !strcmp( argv[i], "-d" ) )
|
|
92 {
|
|
93 minAvgDepth = atof( argv[i + 1] ) ;
|
|
94 ++i ;
|
|
95 }
|
|
96 /*else if ( !strcmp( argv[i], "-n" ) )
|
|
97 {
|
|
98 minSampleCnt = atoi( argv[i + 1] ) ;
|
|
99 ++i ;
|
|
100 }*/
|
|
101 else
|
|
102 {
|
|
103 printf( "%s", usage ) ;
|
|
104 exit( 1 ) ;
|
|
105 }
|
|
106 }
|
|
107 if ( fpGTFlist == NULL )
|
|
108 {
|
|
109 printf( "Must use --lg option to speicfy the list of GTF files.\n%s", usage ) ;
|
|
110 exit( 1 ) ;
|
|
111 }
|
|
112
|
|
113 std::vector<struct _outputTranscript> transcripts ;
|
|
114 std::vector<struct _outputTranscript> outputTranscripts ;
|
|
115
|
|
116 char buffer[4096] ;
|
|
117 char line[10000] ;
|
|
118 char chrom[50], tool[20], type[40], strand[3] ;
|
|
119 //char tid[50] ;
|
|
120 int start, end ;
|
|
121 std::vector<struct _pair32> tmpExons ;
|
|
122 int sampleCnt = 0 ;
|
|
123 int gid = -1 ;
|
|
124 int tid = -1 ;
|
|
125 int chrId = -1 ;
|
|
126 int chrIdUsed = 0 ;
|
|
127 char cStrand = '.' ;
|
|
128 char prefix[50] ;
|
|
129 double FPKM = 0, TPM = 0, cov = 0 ;
|
|
130
|
|
131 while ( fgets( buffer, sizeof( buffer ), fpGTFlist ) != NULL )
|
|
132 {
|
|
133 int len = strlen( buffer ) ;
|
|
134 if ( buffer[len - 1] == '\n' )
|
|
135 {
|
|
136 buffer[len - 1] = '\0' ;
|
|
137 --len ;
|
|
138 }
|
|
139 fp = fopen( buffer, "r" ) ;
|
|
140
|
|
141
|
|
142 while ( fgets( line, sizeof( line ), fp ) != NULL )
|
|
143 {
|
|
144 if ( line[0] == '#' )
|
|
145 continue ;
|
|
146
|
|
147 sscanf( line, "%s %s %s %d %d %s %s", chrom, tool, type, &start, &end, buffer, strand ) ;
|
|
148
|
|
149 if ( strcmp( type, "exon" ) )
|
|
150 {
|
|
151 if ( tmpExons.size() > 0 )
|
|
152 {
|
|
153 struct _outputTranscript nt ;
|
|
154 nt.sampleId = sampleCnt ;
|
|
155 nt.chrId = chrId ;
|
|
156 nt.geneId = gid ;
|
|
157 nt.transcriptId = tid ;
|
|
158 nt.strand = cStrand ;
|
|
159 nt.ecnt = tmpExons.size() ;
|
|
160 nt.exons = new struct _pair32[nt.ecnt] ;
|
|
161 nt.FPKM = FPKM ;
|
|
162 nt.TPM = TPM ;
|
|
163 nt.cov = cov ;
|
|
164 for ( i = 0 ; i < nt.ecnt ; ++i )
|
|
165 nt.exons[i] = tmpExons[i] ;
|
|
166 tmpExons.clear() ;
|
|
167 transcripts.push_back( nt ) ;
|
|
168 }
|
|
169 continue ;
|
|
170 }
|
|
171
|
|
172 if ( chrNameToId.find( std::string( chrom ) ) == chrNameToId.end() )
|
|
173 {
|
|
174 chrId = chrIdUsed ;
|
|
175
|
|
176 std::string s( chrom ) ;
|
|
177 chrNameToId[s] = chrIdUsed ;
|
|
178 chrIdToName[ chrIdUsed ] = s ;
|
|
179 ++chrIdUsed ;
|
|
180 }
|
|
181 else
|
|
182 chrId = chrNameToId[ std::string( chrom ) ] ;
|
|
183
|
|
184 GetGTFField( line, "gene_id", buffer ) ;
|
|
185 gid = GetTailNumber( buffer ) ;
|
|
186
|
|
187 GetGTFField( line, "transcript_id", buffer ) ;
|
|
188 tid = GetTailNumber( buffer ) ;
|
|
189
|
|
190 GetGTFField( line, "FPKM", buffer ) ;
|
|
191 FPKM = atof( buffer ) ;
|
|
192
|
|
193 GetGTFField( line, "TPM", buffer ) ;
|
|
194 TPM = atof( buffer ) ;
|
|
195
|
|
196 GetGTFField( line, "cov", buffer ) ;
|
|
197 cov = atof( buffer ) ;
|
|
198
|
|
199 cStrand = strand[0] ;
|
|
200
|
|
201 // Look for the user-defined prefix.
|
|
202 int len = strlen( buffer ) ;
|
|
203 j = 0 ;
|
|
204 for ( i = len - 1 ; i >= 0 ; --i )
|
|
205 {
|
|
206 if ( buffer[i] == '.' )
|
|
207 {
|
|
208 ++j ;
|
|
209 if ( j >= 2 )
|
|
210 break ;
|
|
211 }
|
|
212 }
|
|
213
|
|
214 for ( j = 0 ; j < i ; ++i )
|
|
215 {
|
|
216 prefix[j] = buffer[j] ;
|
|
217 }
|
|
218 if ( i > 0 )
|
|
219 {
|
|
220 prefix[j] = '.' ;
|
|
221 prefix[j + 1] = '\0' ;
|
|
222 }
|
|
223 else
|
|
224 prefix[0] = '\0' ;
|
|
225
|
|
226 struct _pair32 ne ;
|
|
227 ne.a = start ;
|
|
228 ne.b = end ;
|
|
229 tmpExons.push_back( ne ) ;
|
|
230 }
|
|
231 if ( tmpExons.size() > 0 )
|
|
232 {
|
|
233 struct _outputTranscript nt ;
|
|
234 nt.sampleId = sampleCnt ;
|
|
235 nt.chrId = chrId ;
|
|
236 nt.geneId = gid ;
|
|
237 nt.transcriptId = tid ;
|
|
238 nt.strand = cStrand ;
|
|
239 nt.ecnt = tmpExons.size() ;
|
|
240 nt.exons = new struct _pair32[nt.ecnt] ;
|
|
241 nt.FPKM = FPKM ;
|
|
242 nt.TPM = TPM ;
|
|
243 nt.cov = cov ;
|
|
244 for ( i = 0 ; i < nt.ecnt ; ++i )
|
|
245 nt.exons[i] = tmpExons[i] ;
|
|
246 tmpExons.clear() ;
|
|
247 transcripts.push_back( nt ) ;
|
|
248 }
|
|
249
|
|
250 ++sampleCnt ;
|
|
251 fclose( fp ) ;
|
|
252 }
|
|
253 fclose( fpGTFlist ) ;
|
|
254
|
|
255 // Coalesce same transcripts
|
|
256 std::sort( transcripts.begin(), transcripts.end(), MultiThreadOutputTranscript::CompSortTranscripts ) ;
|
|
257 std::vector<int> sampleSupport ;
|
|
258 int size = transcripts.size() ;
|
|
259 if ( minSampleCnt > sampleCnt )
|
|
260 minSampleCnt = sampleCnt ;
|
|
261
|
|
262 for ( i = 0 ; i < size ; )
|
|
263 {
|
|
264 int l ;
|
|
265 for ( j = i + 1 ; j < size ; ++j )
|
|
266 {
|
|
267 if ( MultiThreadOutputTranscript::CompTranscripts( transcripts[i], transcripts[j] ) )
|
|
268 break ;
|
|
269 }
|
|
270 // [i,j) are the same transcripts.
|
|
271 /*if ( j - i >= fraction * sampleCnt && j - i >= minSampleCnt )
|
|
272 {
|
|
273 outputTranscripts.push_back( transcripts[i] ) ;
|
|
274 }*/
|
|
275
|
|
276 double sumFPKM = 0 ;
|
|
277 double sumTPM = 0 ;
|
|
278 double sumCov = 0 ;
|
|
279 for ( l = i ; l < j ; ++l )
|
|
280 {
|
|
281 sumFPKM += transcripts[l].FPKM ;
|
|
282 sumTPM += transcripts[l].TPM ;
|
|
283 sumCov += transcripts[l].cov ;
|
|
284 }
|
|
285
|
|
286 transcripts[k] = transcripts[i] ;
|
|
287 for ( l = i + 1 ; l < j ; ++l )
|
|
288 delete[] transcripts[l].exons ;
|
|
289
|
|
290 transcripts[k].FPKM = sumFPKM / ( j - i ) ;
|
|
291 transcripts[k].TPM = sumTPM / ( j - i ) ;
|
|
292 transcripts[k].cov = sumCov / ( j - i ) ;
|
|
293
|
|
294 if ( ( j - i < int( fraction * sampleCnt ) || j - i < minSampleCnt ) && sumCov < minAvgDepth * sampleCnt )
|
|
295 //if ( transcripts[k].score * ( j - i ) < 1 && ( j - i ) < sampleCnt * 0.5 )
|
|
296 //if ( sumTPM < sampleCnt || sumFPKM < sampleCnt )
|
|
297 //if ( sumCov < minAvgDepthn * sampleCnt )
|
|
298 {
|
|
299 transcripts[k].FPKM = -1 ;
|
|
300 }
|
|
301 else
|
|
302 sampleSupport.push_back( j - i ) ;
|
|
303 ++k ;
|
|
304 i = j ;
|
|
305 }
|
|
306 transcripts.resize( k ) ;
|
|
307
|
|
308 // The motivation to separate the coalscing and voting is to allow
|
|
309 // a gene with no txpt passing the vote to pick a representative.
|
|
310 size = k ;
|
|
311 for ( i = 0 ; i < size ; )
|
|
312 {
|
|
313 int cnt = 0 ;
|
|
314 if ( transcripts[i].FPKM != -1 )
|
|
315 ++cnt ;
|
|
316
|
|
317 for ( j = i + 1 ; j < size ; ++j )
|
|
318 {
|
|
319 if ( transcripts[i].geneId != transcripts[j].geneId )
|
|
320 break ;
|
|
321 if ( transcripts[j].FPKM != -1 )
|
|
322 ++cnt ;
|
|
323 }
|
|
324 int l ;
|
|
325 /*double *freq = new double[cnt] ;
|
|
326 for ( l = 0 ; l < cnt ; ++l )
|
|
327 freq[l] = transcripts[l].FPKM / sampleCnt ;
|
|
328 qsort( freq, cnt, sizeof( double ), CompDouble ) ;
|
|
329
|
|
330 for ( l = 0 ; l < cnt ; ++l )
|
|
331 printf( "%lf\n", freq[l] ) ;
|
|
332 printf( "===========\n" ) ;
|
|
333 delete[] freq ;*/
|
|
334 /*if ( cnt == 0 )
|
|
335 {
|
|
336 int maxEcnt = -1 ;
|
|
337 int maxCnt = 0 ;
|
|
338 int maxtag ;
|
|
339 for ( l = i ; l < j ; ++l )
|
|
340 if ( transcripts[l].ecnt > maxEcnt )
|
|
341 maxEcnt = transcripts[l].ecnt ;
|
|
342
|
|
343 if ( maxEcnt >= 2 )
|
|
344 {
|
|
345 int maxSampleSupport = -1 ;
|
|
346 maxtag = i ;
|
|
347 for ( l = i ; l < j ; ++l )
|
|
348 {
|
|
349 if ( transcripts[l].ecnt > 1 && transcripts[l].ecnt >= 2 )
|
|
350 {
|
|
351 if ( sampleSupport[l] > maxSampleSupport )
|
|
352 {
|
|
353 maxSampleSupport = sampleSupport[l] ;
|
|
354 maxtag = l ;
|
|
355 maxCnt = 1 ;
|
|
356 }
|
|
357 else if ( sampleSupport[l] == maxSampleSupport )
|
|
358 ++maxCnt ;
|
|
359 }
|
|
360 }
|
|
361
|
|
362 if ( maxSampleSupport >= 3 && maxSampleSupport >= ( fraction / 10 ) * sampleCnt && transcripts[maxtag].TPM > 0)
|
|
363 {
|
|
364 if ( maxCnt > 1 )
|
|
365 {
|
|
366 int maxTPM = -1 ;
|
|
367 for ( l = i ; l < j ; ++l )
|
|
368 {
|
|
369 if ( sampleSupport[l] != maxSampleSupport )
|
|
370 continue ;
|
|
371 if ( transcripts[l].TPM > maxTPM )
|
|
372 {
|
|
373 maxTPM = transcripts[l].TPM ;
|
|
374 maxtag = l ;
|
|
375 }
|
|
376 }
|
|
377
|
|
378 }
|
|
379 transcripts[maxtag].FPKM = 0 ;
|
|
380 //fprintf( stderr, "recovered: %d %d\n", sampleSupport[ maxtag ], transcripts[ maxtag ].ecnt ) ;
|
|
381 outputTranscripts.push_back( transcripts[maxtag] ) ;
|
|
382 }
|
|
383 }
|
|
384 }
|
|
385 else*/
|
|
386 {
|
|
387 for ( l = i ; l < j ; ++l )
|
|
388 {
|
|
389 //if ( transcripts[l].FPKM >= fraction * sampleCnt && transcripts[l].FPKM >= minSampleCnt )
|
|
390 if ( transcripts[l].FPKM != -1 )
|
|
391 {
|
|
392 outputTranscripts.push_back( transcripts[l] ) ;
|
|
393 }
|
|
394 }
|
|
395 }
|
|
396
|
|
397 i = j ;
|
|
398 }
|
|
399
|
|
400 // Output
|
|
401 size = outputTranscripts.size() ;
|
|
402 //printf( "%d\n", size ) ;
|
|
403 int transcriptId = 0 ;
|
|
404 int prevGid = -1 ;
|
|
405 for ( i = 0 ; i < size ; ++i )
|
|
406 {
|
|
407 struct _outputTranscript &t = outputTranscripts[i] ;
|
|
408 const char *chrom = chrIdToName[t.chrId].c_str() ;
|
|
409 /*if ( t.geneId != prevGid )
|
|
410 transcriptId = 0 ;
|
|
411 else
|
|
412 ++transcriptId ;*/
|
|
413 transcriptId = outputTranscripts[i].transcriptId ;
|
|
414 fprintf( stdout, "%s\tPsiCLASS\ttranscript\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; transcript_id \"%s%s.%d.%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"%.6lf\"; sample_cnt \"%d\";\n",
|
|
415 chrom, t.exons[0].a, t.exons[t.ecnt - 1].b, t.strand,
|
|
416 prefix, chrom, t.geneId,
|
|
417 prefix, chrom, t.geneId, transcriptId, t.FPKM, t.TPM, t.cov, sampleSupport[i] ) ;
|
|
418 for ( j = 0 ; j < t.ecnt ; ++j )
|
|
419 {
|
|
420 fprintf( stdout, "%s\tPsiCLASS\texon\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; "
|
|
421 "transcript_id \"%s%s.%d.%d\"; exon_number \"%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"%.6lf\"; sample_cnt \"%d\";\n",
|
|
422 chrom, t.exons[j].a, t.exons[j].b, t.strand,
|
|
423 prefix, chrom, t.geneId,
|
|
424 prefix, chrom, t.geneId, transcriptId,
|
|
425 j + 1, t.FPKM, t.TPM, t.cov,
|
|
426 sampleSupport[i] ) ;
|
|
427 }
|
|
428
|
|
429 prevGid = t.geneId ;
|
|
430 }
|
|
431 return 0 ;
|
|
432 }
|