Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/Vote.cpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 // The program that vote to pick the trusted transcripts. | |
2 #include <stdio.h> | |
3 #include <stdlib.h> | |
4 #include <map> | |
5 #include <vector> | |
6 #include <algorithm> | |
7 #include <string.h> | |
8 #include <string> | |
9 | |
10 #include "defs.h" | |
11 #include "TranscriptDecider.hpp" | |
12 | |
13 char usage[] = "./transcript-vote [OPTIONS] > output.gtf:\n" | |
14 "Required:\n" | |
15 "\t--lg: path to the list of GTF files.\n" | |
16 "Optional:\n" | |
17 "\t-d FLOAT: threshold of average coverage depth across all the samples. (default: 1)\n" | |
18 //"\t-n INT: the number of samples a transcript showed up. (default: 3)\n" | |
19 ; | |
20 | |
21 | |
22 /*struct _transcript | |
23 { | |
24 int chrId ; | |
25 int geneId ; | |
26 char strand ; | |
27 struct _pair32 *exons ; | |
28 int ecnt ; | |
29 int sampleId ; | |
30 } ;*/ | |
31 | |
32 void GetGTFField( char *s, const char *field, char *ret ) | |
33 { | |
34 char *p = strstr( s, field ) ; | |
35 if ( p == NULL ) | |
36 return ; | |
37 for ( ; *p != ' ' ; ++p ) | |
38 ; | |
39 p += 2 ; // add extra 1 to skip \" | |
40 sscanf( p, "%s", ret ) ; | |
41 //printf( "+%s %d\n", tid, strlen( tid ) ) ; | |
42 p = ret + strlen( ret ) ; | |
43 while ( *p != '\"' ) | |
44 --p ; | |
45 *p = '\0' ; | |
46 } | |
47 | |
48 int GetTailNumber( char *s ) | |
49 { | |
50 int len = strlen( s ) ; | |
51 int ret = 0 ; | |
52 int i ; | |
53 int factor = 1 ; | |
54 for ( i = len - 1 ; i >= 0 && s[i] >= '0' && s[i] <= '9' ; --i, factor *= 10 ) | |
55 { | |
56 ret += factor * ( s[i] - '0' ) ; | |
57 } | |
58 return ret ; | |
59 } | |
60 | |
61 int CompDouble( const void *p1, const void *p2 ) | |
62 { | |
63 double a = *(double *)p1 ; | |
64 double b = *(double *)p2 ; | |
65 if ( a > b ) | |
66 return 1 ; | |
67 else if ( a < b ) | |
68 return -1 ; | |
69 else | |
70 return 0 ; | |
71 } | |
72 | |
73 int main( int argc, char *argv[] ) | |
74 { | |
75 int i, j, k ; | |
76 double minAvgDepth = 1.0 ; | |
77 double fraction = 1.0 ; | |
78 int minSampleCnt = 3 ; | |
79 std::map<std::string, int> chrNameToId ; | |
80 std::map<int, std::string> chrIdToName ; | |
81 | |
82 FILE *fpGTFlist = NULL ; | |
83 FILE *fp = NULL ; | |
84 for ( i = 1 ; i < argc ; ++i ) | |
85 { | |
86 if ( !strcmp( argv[i], "--lg" ) ) | |
87 { | |
88 fpGTFlist = fopen( argv[i + 1], "r" ) ; | |
89 ++i ; | |
90 } | |
91 else if ( !strcmp( argv[i], "-d" ) ) | |
92 { | |
93 minAvgDepth = atof( argv[i + 1] ) ; | |
94 ++i ; | |
95 } | |
96 /*else if ( !strcmp( argv[i], "-n" ) ) | |
97 { | |
98 minSampleCnt = atoi( argv[i + 1] ) ; | |
99 ++i ; | |
100 }*/ | |
101 else | |
102 { | |
103 printf( "%s", usage ) ; | |
104 exit( 1 ) ; | |
105 } | |
106 } | |
107 if ( fpGTFlist == NULL ) | |
108 { | |
109 printf( "Must use --lg option to speicfy the list of GTF files.\n%s", usage ) ; | |
110 exit( 1 ) ; | |
111 } | |
112 | |
113 std::vector<struct _outputTranscript> transcripts ; | |
114 std::vector<struct _outputTranscript> outputTranscripts ; | |
115 | |
116 char buffer[4096] ; | |
117 char line[10000] ; | |
118 char chrom[50], tool[20], type[40], strand[3] ; | |
119 //char tid[50] ; | |
120 int start, end ; | |
121 std::vector<struct _pair32> tmpExons ; | |
122 int sampleCnt = 0 ; | |
123 int gid = -1 ; | |
124 int tid = -1 ; | |
125 int chrId = -1 ; | |
126 int chrIdUsed = 0 ; | |
127 char cStrand = '.' ; | |
128 char prefix[50] ; | |
129 double FPKM = 0, TPM = 0, cov = 0 ; | |
130 | |
131 while ( fgets( buffer, sizeof( buffer ), fpGTFlist ) != NULL ) | |
132 { | |
133 int len = strlen( buffer ) ; | |
134 if ( buffer[len - 1] == '\n' ) | |
135 { | |
136 buffer[len - 1] = '\0' ; | |
137 --len ; | |
138 } | |
139 fp = fopen( buffer, "r" ) ; | |
140 | |
141 | |
142 while ( fgets( line, sizeof( line ), fp ) != NULL ) | |
143 { | |
144 if ( line[0] == '#' ) | |
145 continue ; | |
146 | |
147 sscanf( line, "%s %s %s %d %d %s %s", chrom, tool, type, &start, &end, buffer, strand ) ; | |
148 | |
149 if ( strcmp( type, "exon" ) ) | |
150 { | |
151 if ( tmpExons.size() > 0 ) | |
152 { | |
153 struct _outputTranscript nt ; | |
154 nt.sampleId = sampleCnt ; | |
155 nt.chrId = chrId ; | |
156 nt.geneId = gid ; | |
157 nt.transcriptId = tid ; | |
158 nt.strand = cStrand ; | |
159 nt.ecnt = tmpExons.size() ; | |
160 nt.exons = new struct _pair32[nt.ecnt] ; | |
161 nt.FPKM = FPKM ; | |
162 nt.TPM = TPM ; | |
163 nt.cov = cov ; | |
164 for ( i = 0 ; i < nt.ecnt ; ++i ) | |
165 nt.exons[i] = tmpExons[i] ; | |
166 tmpExons.clear() ; | |
167 transcripts.push_back( nt ) ; | |
168 } | |
169 continue ; | |
170 } | |
171 | |
172 if ( chrNameToId.find( std::string( chrom ) ) == chrNameToId.end() ) | |
173 { | |
174 chrId = chrIdUsed ; | |
175 | |
176 std::string s( chrom ) ; | |
177 chrNameToId[s] = chrIdUsed ; | |
178 chrIdToName[ chrIdUsed ] = s ; | |
179 ++chrIdUsed ; | |
180 } | |
181 else | |
182 chrId = chrNameToId[ std::string( chrom ) ] ; | |
183 | |
184 GetGTFField( line, "gene_id", buffer ) ; | |
185 gid = GetTailNumber( buffer ) ; | |
186 | |
187 GetGTFField( line, "transcript_id", buffer ) ; | |
188 tid = GetTailNumber( buffer ) ; | |
189 | |
190 GetGTFField( line, "FPKM", buffer ) ; | |
191 FPKM = atof( buffer ) ; | |
192 | |
193 GetGTFField( line, "TPM", buffer ) ; | |
194 TPM = atof( buffer ) ; | |
195 | |
196 GetGTFField( line, "cov", buffer ) ; | |
197 cov = atof( buffer ) ; | |
198 | |
199 cStrand = strand[0] ; | |
200 | |
201 // Look for the user-defined prefix. | |
202 int len = strlen( buffer ) ; | |
203 j = 0 ; | |
204 for ( i = len - 1 ; i >= 0 ; --i ) | |
205 { | |
206 if ( buffer[i] == '.' ) | |
207 { | |
208 ++j ; | |
209 if ( j >= 2 ) | |
210 break ; | |
211 } | |
212 } | |
213 | |
214 for ( j = 0 ; j < i ; ++i ) | |
215 { | |
216 prefix[j] = buffer[j] ; | |
217 } | |
218 if ( i > 0 ) | |
219 { | |
220 prefix[j] = '.' ; | |
221 prefix[j + 1] = '\0' ; | |
222 } | |
223 else | |
224 prefix[0] = '\0' ; | |
225 | |
226 struct _pair32 ne ; | |
227 ne.a = start ; | |
228 ne.b = end ; | |
229 tmpExons.push_back( ne ) ; | |
230 } | |
231 if ( tmpExons.size() > 0 ) | |
232 { | |
233 struct _outputTranscript nt ; | |
234 nt.sampleId = sampleCnt ; | |
235 nt.chrId = chrId ; | |
236 nt.geneId = gid ; | |
237 nt.transcriptId = tid ; | |
238 nt.strand = cStrand ; | |
239 nt.ecnt = tmpExons.size() ; | |
240 nt.exons = new struct _pair32[nt.ecnt] ; | |
241 nt.FPKM = FPKM ; | |
242 nt.TPM = TPM ; | |
243 nt.cov = cov ; | |
244 for ( i = 0 ; i < nt.ecnt ; ++i ) | |
245 nt.exons[i] = tmpExons[i] ; | |
246 tmpExons.clear() ; | |
247 transcripts.push_back( nt ) ; | |
248 } | |
249 | |
250 ++sampleCnt ; | |
251 fclose( fp ) ; | |
252 } | |
253 fclose( fpGTFlist ) ; | |
254 | |
255 // Coalesce same transcripts | |
256 std::sort( transcripts.begin(), transcripts.end(), MultiThreadOutputTranscript::CompSortTranscripts ) ; | |
257 std::vector<int> sampleSupport ; | |
258 int size = transcripts.size() ; | |
259 if ( minSampleCnt > sampleCnt ) | |
260 minSampleCnt = sampleCnt ; | |
261 | |
262 for ( i = 0 ; i < size ; ) | |
263 { | |
264 int l ; | |
265 for ( j = i + 1 ; j < size ; ++j ) | |
266 { | |
267 if ( MultiThreadOutputTranscript::CompTranscripts( transcripts[i], transcripts[j] ) ) | |
268 break ; | |
269 } | |
270 // [i,j) are the same transcripts. | |
271 /*if ( j - i >= fraction * sampleCnt && j - i >= minSampleCnt ) | |
272 { | |
273 outputTranscripts.push_back( transcripts[i] ) ; | |
274 }*/ | |
275 | |
276 double sumFPKM = 0 ; | |
277 double sumTPM = 0 ; | |
278 double sumCov = 0 ; | |
279 for ( l = i ; l < j ; ++l ) | |
280 { | |
281 sumFPKM += transcripts[l].FPKM ; | |
282 sumTPM += transcripts[l].TPM ; | |
283 sumCov += transcripts[l].cov ; | |
284 } | |
285 | |
286 transcripts[k] = transcripts[i] ; | |
287 for ( l = i + 1 ; l < j ; ++l ) | |
288 delete[] transcripts[l].exons ; | |
289 | |
290 transcripts[k].FPKM = sumFPKM / ( j - i ) ; | |
291 transcripts[k].TPM = sumTPM / ( j - i ) ; | |
292 transcripts[k].cov = sumCov / ( j - i ) ; | |
293 | |
294 if ( ( j - i < int( fraction * sampleCnt ) || j - i < minSampleCnt ) && sumCov < minAvgDepth * sampleCnt ) | |
295 //if ( transcripts[k].score * ( j - i ) < 1 && ( j - i ) < sampleCnt * 0.5 ) | |
296 //if ( sumTPM < sampleCnt || sumFPKM < sampleCnt ) | |
297 //if ( sumCov < minAvgDepthn * sampleCnt ) | |
298 { | |
299 transcripts[k].FPKM = -1 ; | |
300 } | |
301 else | |
302 sampleSupport.push_back( j - i ) ; | |
303 ++k ; | |
304 i = j ; | |
305 } | |
306 transcripts.resize( k ) ; | |
307 | |
308 // The motivation to separate the coalscing and voting is to allow | |
309 // a gene with no txpt passing the vote to pick a representative. | |
310 size = k ; | |
311 for ( i = 0 ; i < size ; ) | |
312 { | |
313 int cnt = 0 ; | |
314 if ( transcripts[i].FPKM != -1 ) | |
315 ++cnt ; | |
316 | |
317 for ( j = i + 1 ; j < size ; ++j ) | |
318 { | |
319 if ( transcripts[i].geneId != transcripts[j].geneId ) | |
320 break ; | |
321 if ( transcripts[j].FPKM != -1 ) | |
322 ++cnt ; | |
323 } | |
324 int l ; | |
325 /*double *freq = new double[cnt] ; | |
326 for ( l = 0 ; l < cnt ; ++l ) | |
327 freq[l] = transcripts[l].FPKM / sampleCnt ; | |
328 qsort( freq, cnt, sizeof( double ), CompDouble ) ; | |
329 | |
330 for ( l = 0 ; l < cnt ; ++l ) | |
331 printf( "%lf\n", freq[l] ) ; | |
332 printf( "===========\n" ) ; | |
333 delete[] freq ;*/ | |
334 /*if ( cnt == 0 ) | |
335 { | |
336 int maxEcnt = -1 ; | |
337 int maxCnt = 0 ; | |
338 int maxtag ; | |
339 for ( l = i ; l < j ; ++l ) | |
340 if ( transcripts[l].ecnt > maxEcnt ) | |
341 maxEcnt = transcripts[l].ecnt ; | |
342 | |
343 if ( maxEcnt >= 2 ) | |
344 { | |
345 int maxSampleSupport = -1 ; | |
346 maxtag = i ; | |
347 for ( l = i ; l < j ; ++l ) | |
348 { | |
349 if ( transcripts[l].ecnt > 1 && transcripts[l].ecnt >= 2 ) | |
350 { | |
351 if ( sampleSupport[l] > maxSampleSupport ) | |
352 { | |
353 maxSampleSupport = sampleSupport[l] ; | |
354 maxtag = l ; | |
355 maxCnt = 1 ; | |
356 } | |
357 else if ( sampleSupport[l] == maxSampleSupport ) | |
358 ++maxCnt ; | |
359 } | |
360 } | |
361 | |
362 if ( maxSampleSupport >= 3 && maxSampleSupport >= ( fraction / 10 ) * sampleCnt && transcripts[maxtag].TPM > 0) | |
363 { | |
364 if ( maxCnt > 1 ) | |
365 { | |
366 int maxTPM = -1 ; | |
367 for ( l = i ; l < j ; ++l ) | |
368 { | |
369 if ( sampleSupport[l] != maxSampleSupport ) | |
370 continue ; | |
371 if ( transcripts[l].TPM > maxTPM ) | |
372 { | |
373 maxTPM = transcripts[l].TPM ; | |
374 maxtag = l ; | |
375 } | |
376 } | |
377 | |
378 } | |
379 transcripts[maxtag].FPKM = 0 ; | |
380 //fprintf( stderr, "recovered: %d %d\n", sampleSupport[ maxtag ], transcripts[ maxtag ].ecnt ) ; | |
381 outputTranscripts.push_back( transcripts[maxtag] ) ; | |
382 } | |
383 } | |
384 } | |
385 else*/ | |
386 { | |
387 for ( l = i ; l < j ; ++l ) | |
388 { | |
389 //if ( transcripts[l].FPKM >= fraction * sampleCnt && transcripts[l].FPKM >= minSampleCnt ) | |
390 if ( transcripts[l].FPKM != -1 ) | |
391 { | |
392 outputTranscripts.push_back( transcripts[l] ) ; | |
393 } | |
394 } | |
395 } | |
396 | |
397 i = j ; | |
398 } | |
399 | |
400 // Output | |
401 size = outputTranscripts.size() ; | |
402 //printf( "%d\n", size ) ; | |
403 int transcriptId = 0 ; | |
404 int prevGid = -1 ; | |
405 for ( i = 0 ; i < size ; ++i ) | |
406 { | |
407 struct _outputTranscript &t = outputTranscripts[i] ; | |
408 const char *chrom = chrIdToName[t.chrId].c_str() ; | |
409 /*if ( t.geneId != prevGid ) | |
410 transcriptId = 0 ; | |
411 else | |
412 ++transcriptId ;*/ | |
413 transcriptId = outputTranscripts[i].transcriptId ; | |
414 fprintf( stdout, "%s\tPsiCLASS\ttranscript\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; transcript_id \"%s%s.%d.%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"%.6lf\"; sample_cnt \"%d\";\n", | |
415 chrom, t.exons[0].a, t.exons[t.ecnt - 1].b, t.strand, | |
416 prefix, chrom, t.geneId, | |
417 prefix, chrom, t.geneId, transcriptId, t.FPKM, t.TPM, t.cov, sampleSupport[i] ) ; | |
418 for ( j = 0 ; j < t.ecnt ; ++j ) | |
419 { | |
420 fprintf( stdout, "%s\tPsiCLASS\texon\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; " | |
421 "transcript_id \"%s%s.%d.%d\"; exon_number \"%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"%.6lf\"; sample_cnt \"%d\";\n", | |
422 chrom, t.exons[j].a, t.exons[j].b, t.strand, | |
423 prefix, chrom, t.geneId, | |
424 prefix, chrom, t.geneId, transcriptId, | |
425 j + 1, t.FPKM, t.TPM, t.cov, | |
426 sampleSupport[i] ) ; | |
427 } | |
428 | |
429 prevGid = t.geneId ; | |
430 } | |
431 return 0 ; | |
432 } |