Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/grader.cpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 // Compare the reference and the predictions | |
2 // Format: ./a.out ref.gtf prediction.gtf | |
3 #include <stdio.h> | |
4 #include <string.h> | |
5 #include <stdlib.h> | |
6 | |
7 #define MAX_TXPT 2000000 | |
8 | |
9 #define DEBUG 0 | |
10 | |
11 bool flagMultiExonOnly = false ; | |
12 bool flagValidChr = true ; | |
13 int relaxWidth = 20 ; | |
14 | |
15 struct _pair | |
16 { | |
17 int a, b ; | |
18 } ; | |
19 | |
20 struct _transcript | |
21 { | |
22 char tid[50] ; | |
23 char chrom[50] ; | |
24 char strand ; | |
25 struct _pair *exons ; | |
26 int ecnt ; | |
27 } ; | |
28 | |
29 struct _info | |
30 { | |
31 double coverage ; | |
32 int byId ; // It gets this coverage by comparing with byId. | |
33 } ; | |
34 | |
35 struct _intron | |
36 { | |
37 char chrom[50] ; | |
38 int start, end ; | |
39 int tindex ; // The index to the corresponding transcripts | |
40 } ; | |
41 | |
42 struct _exon | |
43 { | |
44 char chrom[50] ; | |
45 int start, end ; | |
46 //int tindex ; | |
47 int soft ; // left-bit: left side is soft, right-bit: right side is soft | |
48 bool matched ; | |
49 } ; | |
50 | |
51 int TranscriptComp( const void *p1, const void *p2 ) | |
52 { | |
53 const struct _transcript *a = (struct _transcript *)p1 ; | |
54 const struct _transcript *b = ( struct _transcript *)p2 ; | |
55 int tmp = strcmp( a->chrom, b->chrom ) ; | |
56 if ( tmp != 0 ) | |
57 return tmp ; | |
58 return a->exons[0].a - b->exons[0].a ; | |
59 } | |
60 | |
61 int TranscriptComp_ByIntron( const void *p1, const void *p2 ) | |
62 { | |
63 const struct _transcript *a = (struct _transcript *)p1 ; | |
64 const struct _transcript *b = ( struct _transcript *)p2 ; | |
65 if ( a->strand != b->strand ) | |
66 return a->strand - b->strand ; | |
67 int tmp = strcmp( a->chrom, b->chrom ) ; | |
68 if ( tmp != 0 ) | |
69 return tmp ; | |
70 if ( a->ecnt != b->ecnt ) | |
71 return a->ecnt - b->ecnt ; | |
72 int i ; | |
73 for ( i = 0 ; i < a->ecnt - 1 ; ++i ) | |
74 { | |
75 if ( a->exons[i].b != b->exons[i].b ) | |
76 return a->exons[i].b - b->exons[i].b ; | |
77 if ( a->exons[i + 1].a != b->exons[i + 1].a ) | |
78 return a->exons[i + 1].a - b->exons[i + 1].a ; | |
79 } | |
80 | |
81 return 0 ; //strcmp( a->tid, b->tid ) ; | |
82 } | |
83 | |
84 bool validChrom( char *chrom ) | |
85 { | |
86 if ( !flagValidChr ) | |
87 return true ; | |
88 if ( chrom[0] != 'c' || chrom[1] != 'h' || chrom[2] != 'r' ) | |
89 return false ; | |
90 // only consider chr1-22,x,y,z | |
91 if ( chrom[3]=='x' || chrom[3]=='X' | |
92 || chrom[3] == 'y' || chrom[3] == 'Y' | |
93 || chrom[3] == 'm' || chrom[3] == 'M' | |
94 || ( chrom[3] >= '3' && chrom[3] <= '9' ) ) | |
95 { | |
96 if ( chrom[4] == '\0' ) | |
97 return true ; | |
98 else | |
99 return false ; | |
100 } | |
101 if ( chrom[3] == '1' ) | |
102 { | |
103 if ( chrom[4] == '\0' ) | |
104 return true ; | |
105 if ( chrom[4] >= '0' && chrom[4] <= '9' && chrom[5] == '\0' ) | |
106 return true ; | |
107 return false ; | |
108 } | |
109 if ( chrom[3] == '2' ) | |
110 { | |
111 if ( chrom[4] == '\0' ) | |
112 return true ; | |
113 if ( chrom[4] >= '0' && chrom[4] <= '2' && chrom[5] == '\0' ) | |
114 return true ; | |
115 return false ; | |
116 } | |
117 return false ; | |
118 } | |
119 | |
120 void ReverseExonList( struct _pair *exons, int ecnt ) | |
121 { | |
122 if ( ecnt < 2 || ( exons[0].a < exons[1].a ) ) | |
123 return ; | |
124 int i, j ; | |
125 struct _pair tmp ; | |
126 | |
127 for ( i = 0, j = ecnt - 1 ; i < j ; ++i, --j ) | |
128 { | |
129 tmp = exons[i] ; | |
130 exons[i] = exons[j] ; | |
131 exons[j] = tmp ; | |
132 } | |
133 } | |
134 | |
135 /** | |
136 Merge exons that are next to each other. Showed up in scripture. | |
137 */ | |
138 int CleanExonList( struct _pair *exons, int ecnt ) | |
139 { | |
140 int i, k ; | |
141 for ( i = 0 ; i < ecnt - 1 ; ++i ) | |
142 if ( exons[i + 1].a - exons[i].b - 1 < 20 ) | |
143 { | |
144 exons[i + 1].a = exons[i].a ; | |
145 exons[i].a = -1 ; | |
146 } | |
147 k = 0 ; | |
148 for ( i = 0 ; i < ecnt ; ++i ) | |
149 { | |
150 if ( exons[i].a == -1 ) | |
151 continue ; | |
152 exons[k] = exons[i] ; | |
153 ++k ; | |
154 } | |
155 return k ; | |
156 } | |
157 | |
158 /** | |
159 Remove the duplicated intron chain in the list | |
160 */ | |
161 int RemoveDuplicateIntronChain( struct _transcript *t, int tcnt ) | |
162 { | |
163 int i, j, k ; | |
164 | |
165 qsort( t, tcnt, sizeof( *t ), TranscriptComp_ByIntron ) ; | |
166 for ( i = 1 ; i < tcnt ; ++i ) | |
167 { | |
168 k = i - 1 ; | |
169 while ( t[k].ecnt == -1 ) | |
170 --k ; | |
171 | |
172 if ( t[i].ecnt != t[k].ecnt || strcmp( t[i].chrom, t[k].chrom ) || t[i].strand != t[k].strand ) | |
173 continue ; | |
174 for ( j = 0 ; j < t[i].ecnt - 1 ; ++j ) | |
175 if ( t[i].exons[j].b != t[k].exons[j].b || | |
176 t[i].exons[j + 1].a != t[k].exons[j + 1].a ) | |
177 break ; | |
178 if ( j >= t[i].ecnt - 1 && t[i].ecnt != 1 ) | |
179 t[i].ecnt = -1 ; | |
180 if ( t[i].ecnt == 1 && t[i].exons[0].a == t[k].exons[0].a && | |
181 t[i].exons[0].b == t[k].exons[0].b ) | |
182 t[i].ecnt = -1 ; | |
183 /*if ( t[i].ecnt == -1 ) | |
184 { | |
185 printf( "%s <- %s\n", t[i].tid, t[k].tid ) ; | |
186 }*/ | |
187 } | |
188 | |
189 k = 0 ; | |
190 for ( i = 0 ; i < tcnt ; ++i ) | |
191 { | |
192 if ( t[i].ecnt == -1 ) | |
193 { | |
194 free( t[i].exons ) ; | |
195 continue ; | |
196 } | |
197 t[k] = t[i] ; | |
198 ++k ; | |
199 } | |
200 tcnt = k ; | |
201 return tcnt ; | |
202 } | |
203 | |
204 | |
205 double CompareTranscripts( const struct _transcript &ref, const struct _transcript &pred ) | |
206 { | |
207 int i, j ; | |
208 struct _pair *refExons = ref.exons ; | |
209 int refECnt = ref.ecnt ; | |
210 struct _pair *predExons = pred.exons ; | |
211 int predECnt = pred.ecnt ; | |
212 | |
213 // Prediction must be a "subset" of the reference | |
214 if ( refECnt < predECnt ) | |
215 return -1 ; | |
216 | |
217 // single exon case | |
218 if ( refECnt == 1 ) | |
219 { | |
220 if ( refExons[0].b < predExons[0].a || refExons[0].a > predExons[0].b ) | |
221 return -1 ; | |
222 else | |
223 return 1 ; | |
224 } | |
225 | |
226 if ( predECnt == 1 ) | |
227 { | |
228 for ( i = 0 ; i < refECnt ; ++i ) | |
229 { | |
230 if ( refExons[i].b < predExons[0].a || refExons[i].a > predExons[0].b ) | |
231 continue ; | |
232 else | |
233 { | |
234 /*if ( i == 0 && predExons[0].a >= refExons[0].a - relaxWidth ) | |
235 return 0 ; | |
236 else if ( i == refECnt - 1 && predExons[0].b <= refExons[i].b + relaxWidth ) | |
237 return 0 ; | |
238 else if ( i > 0 && i < refECnt - 1 && predExons[0].a >= refExons[i].a - relaxWidth && | |
239 predExons[0].b <= refExons[i].b + relaxWidth ) | |
240 return 0 ; | |
241 | |
242 return -1 ;*/ | |
243 return 0 ; | |
244 } | |
245 } | |
246 return -1 ; | |
247 } | |
248 | |
249 if ( predECnt > 0 && ref.strand != pred.strand ) | |
250 return -1 ; | |
251 | |
252 // Test the intron chain | |
253 for ( i = 0 ; i < refECnt - 1 ; ++i ) | |
254 if ( refExons[i].b == predExons[0].b ) | |
255 break ; | |
256 if ( i >= refECnt - 1 ) | |
257 return -1 ; | |
258 //if ( i != 0 && predExons[0].a < refExons[i].a - relaxWidth ) | |
259 // return -1 ; | |
260 | |
261 for ( j = 0 ; i < refECnt - 1 && j < predECnt - 1 ; ++i, ++j ) | |
262 { | |
263 if ( refExons[i].b != predExons[j].b || refExons[i + 1].a != predExons[j + 1].a ) | |
264 break ; | |
265 } | |
266 if ( j >= predECnt - 1 ) | |
267 { | |
268 /*if ( i == refECnt - 1 || | |
269 predExons[ predECnt - 1 ].b <= refExons[i].b + relaxWidth ) | |
270 return (double)( predECnt - 1 ) / (double)( refECnt - 1 ) ; | |
271 else | |
272 return -1 ;*/ | |
273 return (double)( predECnt - 1 ) / (double)( refECnt - 1 ) ; | |
274 } | |
275 else | |
276 return -1 ; | |
277 } | |
278 | |
279 bool WithinRange( int a, int b, int w = 10 ) | |
280 { | |
281 if ( a - b >= -w && a - b <= w ) | |
282 return true ; | |
283 return false ; | |
284 } | |
285 | |
286 int GetIntrons( struct _transcript *transcripts, int tcnt, struct _intron *introns ) | |
287 { | |
288 int i, j, k, tag ; | |
289 int cnt = 0 ; | |
290 for ( i = 0 ; i < tcnt ; ++i ) | |
291 { | |
292 for ( j = 0 ; j < transcripts[i].ecnt - 1 ; ++j ) | |
293 { | |
294 bool flag = false ; | |
295 | |
296 tag = 0 ; | |
297 for ( k = cnt - 1 ; k >= 0 ; --k ) | |
298 { | |
299 if ( strcmp( introns[k].chrom, transcripts[i].chrom ) ) | |
300 { | |
301 //printf( "hi\n" ) ; | |
302 tag = k + 1 ; | |
303 break ; | |
304 } | |
305 if ( introns[k].start < transcripts[i].exons[j].b ) | |
306 { | |
307 tag = k + 1 ; | |
308 break ; | |
309 } | |
310 else if ( introns[k].start == transcripts[i].exons[j].b && | |
311 introns[k].end == transcripts[i].exons[j + 1].a ) | |
312 { | |
313 flag = true ; | |
314 break ; | |
315 } | |
316 } | |
317 | |
318 if ( !flag ) | |
319 { | |
320 for ( k = cnt ; k > tag ; --k ) | |
321 introns[k] = introns[k - 1] ; | |
322 strcpy( introns[k].chrom, transcripts[i].chrom ) ; | |
323 introns[k].start = transcripts[i].exons[j].b ; | |
324 introns[k].end = transcripts[i].exons[j + 1].a ; | |
325 introns[k].tindex = i ; | |
326 ++cnt ; | |
327 } | |
328 } | |
329 } | |
330 return cnt ; | |
331 } | |
332 | |
333 int GetExons( struct _transcript *transcripts, int tcnt, struct _exon *exons ) | |
334 { | |
335 int i, j, k, tag ; | |
336 int cnt = 0 ; | |
337 for ( i = 0 ; i < tcnt ; ++i ) | |
338 { | |
339 for ( j = 0 ; j < transcripts[i].ecnt ; ++j ) | |
340 { | |
341 bool flag = false ; | |
342 int soft = 0 ; | |
343 if ( j == 0 ) | |
344 soft = soft | 2 ; | |
345 if ( j == transcripts[i].ecnt - 1 ) | |
346 soft = soft | 1 ; | |
347 | |
348 tag = 0 ; | |
349 for ( k = cnt - 1 ; k >= 0 ; --k ) | |
350 { | |
351 if ( strcmp( exons[k].chrom, transcripts[i].chrom ) ) | |
352 { | |
353 //printf( "hi\n" ) ; | |
354 tag = k + 1 ; | |
355 break ; | |
356 } | |
357 | |
358 if ( exons[k].start < transcripts[i].exons[j].a ) | |
359 { | |
360 tag = k + 1 ; | |
361 break ; | |
362 } | |
363 else if ( exons[k].start == transcripts[i].exons[j].a && | |
364 exons[k].end == transcripts[i].exons[j].b && | |
365 exons[k].soft == soft ) | |
366 { | |
367 flag = true ; | |
368 break ; | |
369 } | |
370 } | |
371 | |
372 if ( !flag ) | |
373 { | |
374 for ( k = cnt ; k > tag ; --k ) | |
375 exons[k] = exons[k - 1] ; | |
376 strcpy( exons[k].chrom, transcripts[i].chrom ) ; | |
377 exons[k].start = transcripts[i].exons[j].a ; | |
378 exons[k].end = transcripts[i].exons[j].b ; | |
379 //exons[k].tindex = i ; | |
380 exons[k].soft = soft ; | |
381 exons[k].matched = false ; | |
382 ++cnt ; | |
383 } | |
384 } | |
385 } | |
386 return cnt ; | |
387 } | |
388 | |
389 //chr1 HAVANA transcript 320162 324461 . + . gene_id "ENSG00000237094.6"; transcript_id "ENST00000423728.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP4-669L17.10"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP4-669L17.10-002"; level 2; havana_gene "OTTHUMG00000156968.4"; havana_transcript "OTTHUMT00000346879.1"; | |
390 int main( int argc, char *argv[] ) | |
391 { | |
392 FILE *fp ; | |
393 struct _transcript *ref, *pred ; | |
394 struct _info *refInfo, *predInfo ; | |
395 | |
396 int refCnt, predCnt ; | |
397 char line[10000], filename[10000] ; | |
398 struct _pair tmpExons[10000] ; | |
399 int tmpECnt = 0 ; | |
400 | |
401 char chrom[50], tool[20], type[40], strand[3] ; | |
402 char tid[50] ; | |
403 char buffer[50] ; | |
404 int start, end ; | |
405 | |
406 int i, j, k, tag ; | |
407 | |
408 if ( argc == 1 ) | |
409 { | |
410 printf( "Compare the reference transcripts and predicted transcripts.\n" | |
411 "Format: ./grader ref.gtf prediction.gtf\n" ) ; | |
412 exit( 1 ) ; | |
413 } | |
414 // Parse the arguments | |
415 for ( i = 3 ; i < argc ; ++i ) | |
416 { | |
417 if ( !strcmp( argv[i], "-M" ) ) | |
418 flagMultiExonOnly = true ; | |
419 else if ( !strcmp( argv[i], "-ac" ) ) | |
420 flagValidChr = false ; | |
421 else | |
422 { | |
423 printf( "Unknown argument: %s\n", argv[i] ) ; | |
424 exit( 1 ) ; | |
425 } | |
426 } | |
427 | |
428 /*======================================================================================== | |
429 Stage 1: Read the transcripts from the reference and prediction's GTF files. | |
430 ========================================================================================*/ | |
431 fp = NULL ; | |
432 fp = fopen( argv[1], "r" ) ; | |
433 if ( fp == NULL ) | |
434 { | |
435 printf( "Could not open file %s.\n", argv[1] ) ; | |
436 exit( 1 ) ; | |
437 } | |
438 refCnt = 0 ; | |
439 ref = ( struct _transcript *)malloc( MAX_TXPT * sizeof( struct _transcript ) ) ; | |
440 | |
441 strcpy( ref[0].tid, "tmp_-1" ) ; | |
442 while ( fgets( line, sizeof( line ), fp ) != NULL ) | |
443 { | |
444 if ( line[0] == '#' ) | |
445 continue ; | |
446 | |
447 sscanf( line, "%s %s %s %d %d %s %s", chrom, tool, type, &start, &end, buffer, strand ) ; | |
448 | |
449 if ( strcmp( type, "exon" ) || !validChrom( chrom ) ) | |
450 continue ; | |
451 | |
452 char *p = strstr( line, "transcript_id" ) ; | |
453 for ( ; *p != ' ' ; ++p ) | |
454 ; | |
455 p += 2 ; | |
456 sscanf( p, "%s", tid ) ; | |
457 //printf( "+%s %d\n", tid, strlen( tid ) ) ; | |
458 p = tid + strlen( tid ) ; | |
459 while ( *p != '\"' ) | |
460 --p ; | |
461 *p = '\0' ; | |
462 //printf( "%s\n", tid ) ; | |
463 if ( strcmp( tid, ref[ refCnt ].tid ) && tmpECnt ) | |
464 { | |
465 ReverseExonList( tmpExons, tmpECnt ) ; | |
466 tmpECnt = CleanExonList( tmpExons, tmpECnt ) ; | |
467 if ( tmpECnt > 1 || !flagMultiExonOnly ) | |
468 { | |
469 ref[ refCnt ].ecnt = tmpECnt ; | |
470 ref[ refCnt ].exons = ( struct _pair * )malloc( sizeof( struct _pair ) * tmpECnt ) ; | |
471 memcpy( ref[ refCnt ].exons, tmpExons, sizeof( tmpExons[0] ) * tmpECnt ) ; | |
472 ++refCnt ; | |
473 } | |
474 tmpECnt = 0 ; | |
475 } | |
476 | |
477 tmpExons[ tmpECnt ].a = start ; | |
478 tmpExons[ tmpECnt ].b = end ; | |
479 ++tmpECnt ; | |
480 | |
481 ref[ refCnt ].strand = strand[0] ; | |
482 strcpy( ref[ refCnt ].chrom, chrom ) ; | |
483 strcpy( ref[ refCnt ].tid, tid ) ; | |
484 } | |
485 if ( tmpECnt != 0 ) | |
486 { | |
487 ReverseExonList( tmpExons, tmpECnt ) ; | |
488 tmpECnt = CleanExonList( tmpExons, tmpECnt ) ; | |
489 if ( tmpECnt > 1 || !flagMultiExonOnly ) | |
490 { | |
491 ref[ refCnt ].ecnt = tmpECnt ; | |
492 ref[ refCnt ].exons = ( struct _pair * )malloc( sizeof( struct _pair ) * tmpECnt ) ; | |
493 memcpy( ref[ refCnt ].exons, tmpExons, sizeof( tmpExons[0] ) * tmpECnt ) ; | |
494 ++refCnt ; | |
495 } | |
496 tmpECnt = 0 ; | |
497 } | |
498 fclose( fp ) ; | |
499 | |
500 fp = NULL ; | |
501 fp = fopen( argv[2], "r" ) ; | |
502 if ( fp == NULL ) | |
503 { | |
504 printf( "Could not open file %s.\n", argv[2] ) ; | |
505 exit( 1 ) ; | |
506 } | |
507 predCnt = 0 ; | |
508 pred = ( struct _transcript *)malloc( MAX_TXPT * sizeof( struct _transcript ) ) ; | |
509 | |
510 strcpy( pred[0].tid, "tmp_-1" ) ; | |
511 tmpECnt = 0 ; | |
512 while ( fgets( line, sizeof( line ), fp ) != NULL ) | |
513 { | |
514 if ( line[0] == '#' ) | |
515 continue ; | |
516 sscanf( line, "%s %s %s %d %d %s %s", chrom, tool, type, &start, &end, buffer, strand ) ; | |
517 | |
518 if ( strcmp( type, "exon" ) || !validChrom( chrom ) ) | |
519 continue ; | |
520 | |
521 char *p = strstr( line, "transcript_id" ) ; | |
522 //char *p = strstr( line, "gene_name" ) ; | |
523 for ( ; *p != ' ' ; ++p ) | |
524 ; | |
525 p += 2 ; | |
526 sscanf( p, "%s", tid ) ; | |
527 //tid[ strlen( tid ) - 2 ] = '\0' ; | |
528 p = tid + strlen( tid ) ; | |
529 while ( *p != '\"' ) | |
530 --p ; | |
531 *p = '\0' ; | |
532 | |
533 if ( strcmp( tid, pred[ predCnt ].tid ) && tmpECnt ) | |
534 { | |
535 ReverseExonList( tmpExons, tmpECnt ) ; | |
536 tmpECnt = CleanExonList( tmpExons, tmpECnt ) ; | |
537 if ( tmpECnt > 1 || !flagMultiExonOnly ) | |
538 { | |
539 pred[ predCnt ].ecnt = tmpECnt ; | |
540 pred[ predCnt ].exons = ( struct _pair * )malloc( sizeof( struct _pair ) * tmpECnt ) ; | |
541 memcpy( pred[ predCnt ].exons, tmpExons, sizeof( tmpExons[0] ) * tmpECnt ) ; | |
542 ++predCnt ; | |
543 } | |
544 tmpECnt = 0 ; | |
545 } | |
546 | |
547 tmpExons[ tmpECnt ].a = start ; | |
548 tmpExons[ tmpECnt ].b = end ; | |
549 ++tmpECnt ; | |
550 | |
551 pred[ predCnt ].strand = strand[0] ; | |
552 strcpy( pred[ predCnt ].chrom, chrom ) ; | |
553 strcpy( pred[ predCnt ].tid, tid ) ; | |
554 } | |
555 if ( tmpECnt > 0 ) | |
556 { | |
557 ReverseExonList( tmpExons, tmpECnt ) ; | |
558 tmpECnt = CleanExonList( tmpExons, tmpECnt ) ; | |
559 if ( tmpECnt > 1 || !flagMultiExonOnly ) | |
560 { | |
561 pred[ predCnt ].ecnt = tmpECnt ; | |
562 pred[ predCnt ].exons = ( struct _pair * )malloc( sizeof( struct _pair ) * tmpECnt ) ; | |
563 memcpy( pred[ predCnt ].exons, tmpExons, sizeof( tmpExons[0] ) * tmpECnt ) ; | |
564 ++predCnt ; | |
565 } | |
566 tmpECnt = 0 ; | |
567 } | |
568 | |
569 refCnt = RemoveDuplicateIntronChain( ref, refCnt ) ; | |
570 //printf( "predCnt = %d\n", predCnt ) ; | |
571 predCnt = RemoveDuplicateIntronChain( pred, predCnt ) ; | |
572 //printf( "predCnt = %d\n", predCnt ) ; | |
573 qsort( ref, refCnt, sizeof( ref[0] ), TranscriptComp ) ; | |
574 qsort( pred, predCnt, sizeof( pred[0] ), TranscriptComp ) ; | |
575 | |
576 | |
577 /*======================================================================================== | |
578 Stage 2: Compute the recall and precision. | |
579 ========================================================================================*/ | |
580 refInfo = ( struct _info * )malloc( refCnt * sizeof( *refInfo ) ) ; | |
581 predInfo = ( struct _info * )malloc( predCnt * sizeof( *predInfo ) ) ; | |
582 for ( i = 0 ; i < refCnt ; ++i ) | |
583 { | |
584 refInfo[i].coverage = -1 ; | |
585 refInfo[i].byId = -1 ; | |
586 #if DEBUG | |
587 printf( "=%d %s %s %d\n", i, ref[i].chrom, ref[i].tid, ref[i].ecnt ) ; | |
588 for ( j = 0 ; j < ref[i].ecnt ; ++j ) | |
589 printf( "%d %d\n", ref[i].exons[j].a, ref[i].exons[j].b ) ; | |
590 #endif | |
591 | |
592 } | |
593 for ( i = 0 ; i < predCnt ; ++i ) | |
594 { | |
595 predInfo[i].coverage = -1 ; | |
596 predInfo[i].byId = -1 ; | |
597 #if DEBUG | |
598 printf( "#%d %s %s %d\n", i, pred[i].chrom, pred[i].tid, pred[i].ecnt ) ; | |
599 for ( j = 0 ; j < pred[i].ecnt ; ++j ) | |
600 printf( "%d %d\n", pred[i].exons[j].a, pred[i].exons[j].b ) ; | |
601 #endif | |
602 } | |
603 tag = 0 ; | |
604 for ( i = 0 ; i < predCnt ; ++i ) | |
605 { | |
606 while ( tag < refCnt && | |
607 ( ( strcmp( ref[tag].chrom, pred[i].chrom ) < 0 ) || | |
608 ( !strcmp( ref[tag].chrom, pred[i].chrom ) && | |
609 ref[tag].exons[ ref[tag].ecnt - 1 ].b < pred[i].exons[0].a - relaxWidth ) ) ) | |
610 ++tag ; | |
611 | |
612 // if ( i == 16474 ) | |
613 // printf( "%d %d %d", i, tag, refCnt ) ; | |
614 for ( j = tag ; j < refCnt && ref[j].exons[0].a <= pred[i].exons[ pred[i].ecnt - 1 ].b + relaxWidth && !strcmp( ref[j].chrom, pred[i].chrom ); ++j ) | |
615 { | |
616 /*if ( !strcmp( pred[i].tid, "CUFF.4256.1" ) ) | |
617 { | |
618 printf( "%s ? %s\n", pred[i].tid, ref[j].tid ) ; | |
619 }*/ | |
620 // if ( i == 16474 ) | |
621 // { | |
622 // printf( "%d %d\n", i, j ) ; | |
623 // } | |
624 double coverage = CompareTranscripts( ref[j], pred[i] ) ; | |
625 if ( coverage > refInfo[j].coverage ) | |
626 { | |
627 refInfo[j].coverage = coverage ; | |
628 refInfo[j].byId = i ; | |
629 } | |
630 if ( coverage > predInfo[i].coverage ) | |
631 { | |
632 predInfo[i].coverage = coverage ; | |
633 predInfo[i].byId = j ; | |
634 } | |
635 } | |
636 } | |
637 | |
638 /*======================================================================================== | |
639 Stage 3: Dump the information into output files. | |
640 ========================================================================================*/ | |
641 //sprintf( filename, "grader.%s.recall", argv[1] ) ; | |
642 sprintf( filename, "grader.recall" ) ; | |
643 fp = fopen( filename, "w" ) ; | |
644 for ( i = 0 ; i < refCnt ; ++i ) | |
645 { | |
646 fprintf( fp, "%s\t%d\t%lf\t%s\n", ref[i].tid, ref[i].ecnt, refInfo[i].coverage, | |
647 refInfo[i].byId == -1 ? "-" : pred[ refInfo[i].byId ].tid ) ; | |
648 } | |
649 fclose( fp ) ; | |
650 | |
651 //sprintf( filename, "grader.%s.precision", argv[2] ) ; | |
652 sprintf( filename, "grader.precision" ) ; | |
653 fp = fopen( filename, "w" ) ; | |
654 for ( i = 0 ; i < predCnt ; ++i ) | |
655 { | |
656 fprintf( fp, "%s\t%d\t%lf\t%s\n", pred[i].tid, pred[i].ecnt, predInfo[i].coverage, | |
657 predInfo[i].byId == -1 ? "-" : ref[predInfo[i].byId].tid ) ; | |
658 } | |
659 fclose( fp ) ; | |
660 | |
661 // print the summary to the stdout | |
662 const int binCnt = 20 ; | |
663 int bins[binCnt + 1] ; | |
664 memset( bins, 0, sizeof( bins ) ) ; | |
665 k = 0 ; | |
666 for ( i = 0 ; i < refCnt ; ++i ) | |
667 { | |
668 if ( refInfo[i].coverage == -1 ) | |
669 continue ; | |
670 ++bins[ (int)( refInfo[i].coverage * binCnt ) ] ; | |
671 ++k ; | |
672 } | |
673 for ( i = 0 ; i <= binCnt ; ++i ) | |
674 { | |
675 printf( "recall %lf %lf %d/%d\n", (double)i / binCnt, (double)k / refCnt, k, refCnt ) ; | |
676 k -= bins[i] ; | |
677 } | |
678 | |
679 memset( bins, 0, sizeof( bins ) ) ; | |
680 k = 0 ; | |
681 for ( i = 0 ; i < predCnt ; ++i ) | |
682 { | |
683 if ( predInfo[i].coverage == -1 ) | |
684 continue ; | |
685 ++bins[ (int)( predInfo[i].coverage * binCnt ) ] ; | |
686 ++k ; | |
687 } | |
688 for ( i = 0 ; i <= binCnt ; ++i ) | |
689 { | |
690 printf( "precision %lf %lf %d/%d\n", (double)i / binCnt, (double)k / predCnt, k, predCnt ) ; | |
691 k -= bins[i] ; | |
692 } | |
693 | |
694 /*======================================================================================== | |
695 Stage 4: Evaluations for introns. | |
696 ========================================================================================*/ | |
697 struct _intron *refIntrons = ( struct _intron * )malloc( sizeof( struct _intron ) * MAX_TXPT ) ; | |
698 int riCnt = GetIntrons( ref, refCnt, refIntrons ) ; | |
699 struct _intron *predIntrons = ( struct _intron * )malloc( sizeof( struct _intron ) * MAX_TXPT ) ; | |
700 int piCnt = GetIntrons( pred, predCnt, predIntrons ) ; | |
701 int matchedIntron = 0 ; | |
702 /*for ( i = 0 ; i < riCnt ; ++i ) | |
703 { | |
704 printf( "%d %s %d %d\n", i, refIntrons[i].chrom, refIntrons[i].start, refIntrons[i].end ) ; | |
705 }*/ | |
706 | |
707 fp = fopen( "grader.intron", "w" ) ; | |
708 tag = 0 ; | |
709 for ( i = 0 ; i < piCnt ; ++i ) | |
710 { | |
711 bool flag = false ; | |
712 while ( 1 ) | |
713 { | |
714 if ( tag >= riCnt ) | |
715 break ; | |
716 int tmp = strcmp( refIntrons[tag].chrom, predIntrons[i].chrom ) ; | |
717 if ( tmp < 0 || ( tmp == 0 && refIntrons[tag].start < predIntrons[i].start ) ) | |
718 ++tag ; | |
719 else | |
720 break ; | |
721 } | |
722 //printf( "%d (%s %d %d) %d=>", i, predIntrons[i].chrom, predIntrons[i].start, predIntrons[i].end, tag ) ; | |
723 /*if ( predIntrons[i].start == 1613457 ) | |
724 printf( "%d %d\n", tag, riCnt ) ;*/ | |
725 for ( j = tag ; j < riCnt ; ++j ) | |
726 { | |
727 if ( strcmp( refIntrons[j].chrom, predIntrons[i].chrom ) > 0 || | |
728 refIntrons[j].start > predIntrons[i].start /*+ 10*/ ) | |
729 break ; | |
730 if ( refIntrons[j].start == predIntrons[i].start && | |
731 refIntrons[j].end == predIntrons[i].end ) | |
732 //if ( WithinRange( refIntrons[j].start, predIntrons[i].start ) && | |
733 // WithinRange( refIntrons[j].end, predIntrons[i].end ) ) | |
734 { | |
735 ++matchedIntron ; | |
736 flag = true ; | |
737 break ; | |
738 } | |
739 } | |
740 //printf( "%d\n", j ) ; | |
741 if ( flag ) | |
742 fprintf( fp, "Y\t%s\t%d\t%d\t%s\n", predIntrons[i].chrom, predIntrons[i].start, predIntrons[i].end, pred[ predIntrons[i].tindex ].tid ) ; | |
743 else | |
744 fprintf( fp, "N\t%s\t%d\t%d\t%s\n", predIntrons[i].chrom, predIntrons[i].start, predIntrons[i].end, pred[ predIntrons[i].tindex ].tid ) ; | |
745 } | |
746 fclose( fp ) ; | |
747 printf( "\n" ) ; | |
748 printf( "Intron evaluation\n") ; | |
749 //printf( "\tmatched\t%d\n", matchedIntron ) ; | |
750 printf( "\trecall\t%lf\t%d/%d\n", matchedIntron / (double)riCnt, matchedIntron, riCnt ) ; | |
751 printf( "\tprecision\t%lf\t%d/%d\n", matchedIntron / (double)piCnt, matchedIntron, piCnt ) ; | |
752 | |
753 /*======================================================================================== | |
754 Stage 4: Evaluations for exons. | |
755 ========================================================================================*/ | |
756 struct _exon *refExons = ( struct _exon * )malloc( sizeof( struct _exon ) * MAX_TXPT ) ; | |
757 int reCnt = GetExons( ref, refCnt, refExons ) ; | |
758 struct _exon *predExons = ( struct _exon * )malloc( sizeof( struct _exon ) * MAX_TXPT ) ; | |
759 int peCnt = GetExons( pred, predCnt, predExons ) ; | |
760 int matchedExons = 0, matchedInternalExons = 0 ; | |
761 int refInternalExons = 0, predInternalExons = 0 ; | |
762 | |
763 for ( i = 0 ; i < reCnt ; ++i ) | |
764 { | |
765 //printf( "%d %d\n", refExons[i].start, refExons[i].end ) ; | |
766 if ( refExons[i].soft == 0 ) | |
767 ++refInternalExons ; | |
768 //if ( refExons[i].soft == 3 ) | |
769 // printf( "hi\n" ) ; | |
770 } | |
771 for ( i = 0 ; i < peCnt ; ++i ) | |
772 if ( predExons[i].soft == 0 ) | |
773 ++predInternalExons ; | |
774 tag = 0 ; | |
775 for ( i = 0 ; i < peCnt ; ++i ) | |
776 { | |
777 bool flag = false ; | |
778 while ( 1 ) | |
779 { | |
780 if ( tag >= reCnt ) | |
781 break ; | |
782 int tmp = strcmp( refExons[tag].chrom, predExons[i].chrom ) ; | |
783 if ( tmp < 0 || ( tmp == 0 && refExons[tag].end < predExons[i].start ) ) | |
784 ++tag ; | |
785 else | |
786 break ; | |
787 } | |
788 for ( j = tag ; j < reCnt ; ++j ) | |
789 { | |
790 if ( strcmp( refExons[j].chrom, predExons[i].chrom ) > 0 || | |
791 refExons[j].start > predExons[i].end /*+ 10*/ ) | |
792 break ; | |
793 if ( refExons[j].soft != predExons[i].soft | |
794 || refExons[j].end < predExons[i].start ) | |
795 continue ; | |
796 | |
797 if ( refExons[j].start == predExons[i].start && | |
798 refExons[j].end == predExons[i].end && predExons[i].soft == 0 ) | |
799 //if ( WithinRange( refIntrons[j].start, predIntrons[i].start ) && | |
800 // WithinRange( refIntrons[j].end, predIntrons[i].end ) ) | |
801 { | |
802 refExons[j].matched = true ; | |
803 predExons[i].matched = true ; | |
804 ++matchedInternalExons ; | |
805 flag = true ; | |
806 break ; | |
807 } | |
808 else if ( ( refExons[j].start == predExons[i].start || ( predExons[i].soft & 2 ) ) | |
809 && ( refExons[j].end == predExons[i].end || (predExons[i].soft & 1 ) ) ) | |
810 { | |
811 refExons[j].matched = true ; | |
812 predExons[i].matched = true ; | |
813 flag = true ; | |
814 //break ; | |
815 } | |
816 } | |
817 //printf( "%d\n", j ) ; | |
818 } | |
819 | |
820 printf( "\n" ) ; | |
821 printf( "Exon evaluation\n" ) ; | |
822 //printf( "\tmatched\t%d\n", matchedExons ) ; | |
823 matchedExons = 0 ; | |
824 for ( i = 0 ; i < reCnt ; ++i ) | |
825 if ( refExons[i].matched ) | |
826 ++matchedExons ; | |
827 printf( "\trecall\t%lf\t%d/%d\n", (double)matchedExons / reCnt, matchedExons, reCnt ) ; | |
828 matchedExons = 0 ; | |
829 for ( i = 0 ; i < peCnt ; ++i ) | |
830 if ( predExons[i].matched ) | |
831 ++matchedExons ; | |
832 printf( "\tprecision\t%lf\t%d/%d\n", (double)matchedExons / peCnt, matchedExons, peCnt ) ; | |
833 | |
834 printf( "\n" ) ; | |
835 printf( "Internal exon evaluation\n" ) ; | |
836 //printf( "\tmatched\t%d\n", matchedInternalExons ) ; | |
837 printf( "\trecall\t%lf\t%d/%d\n", (double)matchedInternalExons / refInternalExons, matchedInternalExons, refInternalExons ) ; | |
838 printf( "\tprecision\t%lf\t%d/%d\n", (double)matchedInternalExons / predInternalExons, matchedInternalExons, predInternalExons ) ; | |
839 return 0 ; | |
840 } |