comparison PsiCLASS-1.0.2/Vote.cpp @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:903fc43d6227
1 // The program that vote to pick the trusted transcripts.
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <map>
5 #include <vector>
6 #include <algorithm>
7 #include <string.h>
8 #include <string>
9
10 #include "defs.h"
11 #include "TranscriptDecider.hpp"
12
13 char usage[] = "./transcript-vote [OPTIONS] > output.gtf:\n"
14 "Required:\n"
15 "\t--lg: path to the list of GTF files.\n"
16 "Optional:\n"
17 "\t-d FLOAT: threshold of average coverage depth across all the samples. (default: 1)\n"
18 //"\t-n INT: the number of samples a transcript showed up. (default: 3)\n"
19 ;
20
21
22 /*struct _transcript
23 {
24 int chrId ;
25 int geneId ;
26 char strand ;
27 struct _pair32 *exons ;
28 int ecnt ;
29 int sampleId ;
30 } ;*/
31
32 void GetGTFField( char *s, const char *field, char *ret )
33 {
34 char *p = strstr( s, field ) ;
35 if ( p == NULL )
36 return ;
37 for ( ; *p != ' ' ; ++p )
38 ;
39 p += 2 ; // add extra 1 to skip \"
40 sscanf( p, "%s", ret ) ;
41 //printf( "+%s %d\n", tid, strlen( tid ) ) ;
42 p = ret + strlen( ret ) ;
43 while ( *p != '\"' )
44 --p ;
45 *p = '\0' ;
46 }
47
48 int GetTailNumber( char *s )
49 {
50 int len = strlen( s ) ;
51 int ret = 0 ;
52 int i ;
53 int factor = 1 ;
54 for ( i = len - 1 ; i >= 0 && s[i] >= '0' && s[i] <= '9' ; --i, factor *= 10 )
55 {
56 ret += factor * ( s[i] - '0' ) ;
57 }
58 return ret ;
59 }
60
61 int CompDouble( const void *p1, const void *p2 )
62 {
63 double a = *(double *)p1 ;
64 double b = *(double *)p2 ;
65 if ( a > b )
66 return 1 ;
67 else if ( a < b )
68 return -1 ;
69 else
70 return 0 ;
71 }
72
73 int main( int argc, char *argv[] )
74 {
75 int i, j, k ;
76 double minAvgDepth = 1.0 ;
77 double fraction = 1.0 ;
78 int minSampleCnt = 3 ;
79 std::map<std::string, int> chrNameToId ;
80 std::map<int, std::string> chrIdToName ;
81
82 FILE *fpGTFlist = NULL ;
83 FILE *fp = NULL ;
84 for ( i = 1 ; i < argc ; ++i )
85 {
86 if ( !strcmp( argv[i], "--lg" ) )
87 {
88 fpGTFlist = fopen( argv[i + 1], "r" ) ;
89 ++i ;
90 }
91 else if ( !strcmp( argv[i], "-d" ) )
92 {
93 minAvgDepth = atof( argv[i + 1] ) ;
94 ++i ;
95 }
96 /*else if ( !strcmp( argv[i], "-n" ) )
97 {
98 minSampleCnt = atoi( argv[i + 1] ) ;
99 ++i ;
100 }*/
101 else
102 {
103 printf( "%s", usage ) ;
104 exit( 1 ) ;
105 }
106 }
107 if ( fpGTFlist == NULL )
108 {
109 printf( "Must use --lg option to speicfy the list of GTF files.\n%s", usage ) ;
110 exit( 1 ) ;
111 }
112
113 std::vector<struct _outputTranscript> transcripts ;
114 std::vector<struct _outputTranscript> outputTranscripts ;
115
116 char buffer[4096] ;
117 char line[10000] ;
118 char chrom[50], tool[20], type[40], strand[3] ;
119 //char tid[50] ;
120 int start, end ;
121 std::vector<struct _pair32> tmpExons ;
122 int sampleCnt = 0 ;
123 int gid = -1 ;
124 int tid = -1 ;
125 int chrId = -1 ;
126 int chrIdUsed = 0 ;
127 char cStrand = '.' ;
128 char prefix[50] ;
129 double FPKM = 0, TPM = 0, cov = 0 ;
130
131 while ( fgets( buffer, sizeof( buffer ), fpGTFlist ) != NULL )
132 {
133 int len = strlen( buffer ) ;
134 if ( buffer[len - 1] == '\n' )
135 {
136 buffer[len - 1] = '\0' ;
137 --len ;
138 }
139 fp = fopen( buffer, "r" ) ;
140
141
142 while ( fgets( line, sizeof( line ), fp ) != NULL )
143 {
144 if ( line[0] == '#' )
145 continue ;
146
147 sscanf( line, "%s %s %s %d %d %s %s", chrom, tool, type, &start, &end, buffer, strand ) ;
148
149 if ( strcmp( type, "exon" ) )
150 {
151 if ( tmpExons.size() > 0 )
152 {
153 struct _outputTranscript nt ;
154 nt.sampleId = sampleCnt ;
155 nt.chrId = chrId ;
156 nt.geneId = gid ;
157 nt.transcriptId = tid ;
158 nt.strand = cStrand ;
159 nt.ecnt = tmpExons.size() ;
160 nt.exons = new struct _pair32[nt.ecnt] ;
161 nt.FPKM = FPKM ;
162 nt.TPM = TPM ;
163 nt.cov = cov ;
164 for ( i = 0 ; i < nt.ecnt ; ++i )
165 nt.exons[i] = tmpExons[i] ;
166 tmpExons.clear() ;
167 transcripts.push_back( nt ) ;
168 }
169 continue ;
170 }
171
172 if ( chrNameToId.find( std::string( chrom ) ) == chrNameToId.end() )
173 {
174 chrId = chrIdUsed ;
175
176 std::string s( chrom ) ;
177 chrNameToId[s] = chrIdUsed ;
178 chrIdToName[ chrIdUsed ] = s ;
179 ++chrIdUsed ;
180 }
181 else
182 chrId = chrNameToId[ std::string( chrom ) ] ;
183
184 GetGTFField( line, "gene_id", buffer ) ;
185 gid = GetTailNumber( buffer ) ;
186
187 GetGTFField( line, "transcript_id", buffer ) ;
188 tid = GetTailNumber( buffer ) ;
189
190 GetGTFField( line, "FPKM", buffer ) ;
191 FPKM = atof( buffer ) ;
192
193 GetGTFField( line, "TPM", buffer ) ;
194 TPM = atof( buffer ) ;
195
196 GetGTFField( line, "cov", buffer ) ;
197 cov = atof( buffer ) ;
198
199 cStrand = strand[0] ;
200
201 // Look for the user-defined prefix.
202 int len = strlen( buffer ) ;
203 j = 0 ;
204 for ( i = len - 1 ; i >= 0 ; --i )
205 {
206 if ( buffer[i] == '.' )
207 {
208 ++j ;
209 if ( j >= 2 )
210 break ;
211 }
212 }
213
214 for ( j = 0 ; j < i ; ++i )
215 {
216 prefix[j] = buffer[j] ;
217 }
218 if ( i > 0 )
219 {
220 prefix[j] = '.' ;
221 prefix[j + 1] = '\0' ;
222 }
223 else
224 prefix[0] = '\0' ;
225
226 struct _pair32 ne ;
227 ne.a = start ;
228 ne.b = end ;
229 tmpExons.push_back( ne ) ;
230 }
231 if ( tmpExons.size() > 0 )
232 {
233 struct _outputTranscript nt ;
234 nt.sampleId = sampleCnt ;
235 nt.chrId = chrId ;
236 nt.geneId = gid ;
237 nt.transcriptId = tid ;
238 nt.strand = cStrand ;
239 nt.ecnt = tmpExons.size() ;
240 nt.exons = new struct _pair32[nt.ecnt] ;
241 nt.FPKM = FPKM ;
242 nt.TPM = TPM ;
243 nt.cov = cov ;
244 for ( i = 0 ; i < nt.ecnt ; ++i )
245 nt.exons[i] = tmpExons[i] ;
246 tmpExons.clear() ;
247 transcripts.push_back( nt ) ;
248 }
249
250 ++sampleCnt ;
251 fclose( fp ) ;
252 }
253 fclose( fpGTFlist ) ;
254
255 // Coalesce same transcripts
256 std::sort( transcripts.begin(), transcripts.end(), MultiThreadOutputTranscript::CompSortTranscripts ) ;
257 std::vector<int> sampleSupport ;
258 int size = transcripts.size() ;
259 if ( minSampleCnt > sampleCnt )
260 minSampleCnt = sampleCnt ;
261
262 for ( i = 0 ; i < size ; )
263 {
264 int l ;
265 for ( j = i + 1 ; j < size ; ++j )
266 {
267 if ( MultiThreadOutputTranscript::CompTranscripts( transcripts[i], transcripts[j] ) )
268 break ;
269 }
270 // [i,j) are the same transcripts.
271 /*if ( j - i >= fraction * sampleCnt && j - i >= minSampleCnt )
272 {
273 outputTranscripts.push_back( transcripts[i] ) ;
274 }*/
275
276 double sumFPKM = 0 ;
277 double sumTPM = 0 ;
278 double sumCov = 0 ;
279 for ( l = i ; l < j ; ++l )
280 {
281 sumFPKM += transcripts[l].FPKM ;
282 sumTPM += transcripts[l].TPM ;
283 sumCov += transcripts[l].cov ;
284 }
285
286 transcripts[k] = transcripts[i] ;
287 for ( l = i + 1 ; l < j ; ++l )
288 delete[] transcripts[l].exons ;
289
290 transcripts[k].FPKM = sumFPKM / ( j - i ) ;
291 transcripts[k].TPM = sumTPM / ( j - i ) ;
292 transcripts[k].cov = sumCov / ( j - i ) ;
293
294 if ( ( j - i < int( fraction * sampleCnt ) || j - i < minSampleCnt ) && sumCov < minAvgDepth * sampleCnt )
295 //if ( transcripts[k].score * ( j - i ) < 1 && ( j - i ) < sampleCnt * 0.5 )
296 //if ( sumTPM < sampleCnt || sumFPKM < sampleCnt )
297 //if ( sumCov < minAvgDepthn * sampleCnt )
298 {
299 transcripts[k].FPKM = -1 ;
300 }
301 else
302 sampleSupport.push_back( j - i ) ;
303 ++k ;
304 i = j ;
305 }
306 transcripts.resize( k ) ;
307
308 // The motivation to separate the coalscing and voting is to allow
309 // a gene with no txpt passing the vote to pick a representative.
310 size = k ;
311 for ( i = 0 ; i < size ; )
312 {
313 int cnt = 0 ;
314 if ( transcripts[i].FPKM != -1 )
315 ++cnt ;
316
317 for ( j = i + 1 ; j < size ; ++j )
318 {
319 if ( transcripts[i].geneId != transcripts[j].geneId )
320 break ;
321 if ( transcripts[j].FPKM != -1 )
322 ++cnt ;
323 }
324 int l ;
325 /*double *freq = new double[cnt] ;
326 for ( l = 0 ; l < cnt ; ++l )
327 freq[l] = transcripts[l].FPKM / sampleCnt ;
328 qsort( freq, cnt, sizeof( double ), CompDouble ) ;
329
330 for ( l = 0 ; l < cnt ; ++l )
331 printf( "%lf\n", freq[l] ) ;
332 printf( "===========\n" ) ;
333 delete[] freq ;*/
334 /*if ( cnt == 0 )
335 {
336 int maxEcnt = -1 ;
337 int maxCnt = 0 ;
338 int maxtag ;
339 for ( l = i ; l < j ; ++l )
340 if ( transcripts[l].ecnt > maxEcnt )
341 maxEcnt = transcripts[l].ecnt ;
342
343 if ( maxEcnt >= 2 )
344 {
345 int maxSampleSupport = -1 ;
346 maxtag = i ;
347 for ( l = i ; l < j ; ++l )
348 {
349 if ( transcripts[l].ecnt > 1 && transcripts[l].ecnt >= 2 )
350 {
351 if ( sampleSupport[l] > maxSampleSupport )
352 {
353 maxSampleSupport = sampleSupport[l] ;
354 maxtag = l ;
355 maxCnt = 1 ;
356 }
357 else if ( sampleSupport[l] == maxSampleSupport )
358 ++maxCnt ;
359 }
360 }
361
362 if ( maxSampleSupport >= 3 && maxSampleSupport >= ( fraction / 10 ) * sampleCnt && transcripts[maxtag].TPM > 0)
363 {
364 if ( maxCnt > 1 )
365 {
366 int maxTPM = -1 ;
367 for ( l = i ; l < j ; ++l )
368 {
369 if ( sampleSupport[l] != maxSampleSupport )
370 continue ;
371 if ( transcripts[l].TPM > maxTPM )
372 {
373 maxTPM = transcripts[l].TPM ;
374 maxtag = l ;
375 }
376 }
377
378 }
379 transcripts[maxtag].FPKM = 0 ;
380 //fprintf( stderr, "recovered: %d %d\n", sampleSupport[ maxtag ], transcripts[ maxtag ].ecnt ) ;
381 outputTranscripts.push_back( transcripts[maxtag] ) ;
382 }
383 }
384 }
385 else*/
386 {
387 for ( l = i ; l < j ; ++l )
388 {
389 //if ( transcripts[l].FPKM >= fraction * sampleCnt && transcripts[l].FPKM >= minSampleCnt )
390 if ( transcripts[l].FPKM != -1 )
391 {
392 outputTranscripts.push_back( transcripts[l] ) ;
393 }
394 }
395 }
396
397 i = j ;
398 }
399
400 // Output
401 size = outputTranscripts.size() ;
402 //printf( "%d\n", size ) ;
403 int transcriptId = 0 ;
404 int prevGid = -1 ;
405 for ( i = 0 ; i < size ; ++i )
406 {
407 struct _outputTranscript &t = outputTranscripts[i] ;
408 const char *chrom = chrIdToName[t.chrId].c_str() ;
409 /*if ( t.geneId != prevGid )
410 transcriptId = 0 ;
411 else
412 ++transcriptId ;*/
413 transcriptId = outputTranscripts[i].transcriptId ;
414 fprintf( stdout, "%s\tPsiCLASS\ttranscript\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; transcript_id \"%s%s.%d.%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"%.6lf\"; sample_cnt \"%d\";\n",
415 chrom, t.exons[0].a, t.exons[t.ecnt - 1].b, t.strand,
416 prefix, chrom, t.geneId,
417 prefix, chrom, t.geneId, transcriptId, t.FPKM, t.TPM, t.cov, sampleSupport[i] ) ;
418 for ( j = 0 ; j < t.ecnt ; ++j )
419 {
420 fprintf( stdout, "%s\tPsiCLASS\texon\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; "
421 "transcript_id \"%s%s.%d.%d\"; exon_number \"%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"%.6lf\"; sample_cnt \"%d\";\n",
422 chrom, t.exons[j].a, t.exons[j].b, t.strand,
423 prefix, chrom, t.geneId,
424 prefix, chrom, t.geneId, transcriptId,
425 j + 1, t.FPKM, t.TPM, t.cov,
426 sampleSupport[i] ) ;
427 }
428
429 prevGid = t.geneId ;
430 }
431 return 0 ;
432 }