comparison PsiCLASS-1.0.2/grader.cpp @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:903fc43d6227
1 // Compare the reference and the predictions
2 // Format: ./a.out ref.gtf prediction.gtf
3 #include <stdio.h>
4 #include <string.h>
5 #include <stdlib.h>
6
7 #define MAX_TXPT 2000000
8
9 #define DEBUG 0
10
11 bool flagMultiExonOnly = false ;
12 bool flagValidChr = true ;
13 int relaxWidth = 20 ;
14
15 struct _pair
16 {
17 int a, b ;
18 } ;
19
20 struct _transcript
21 {
22 char tid[50] ;
23 char chrom[50] ;
24 char strand ;
25 struct _pair *exons ;
26 int ecnt ;
27 } ;
28
29 struct _info
30 {
31 double coverage ;
32 int byId ; // It gets this coverage by comparing with byId.
33 } ;
34
35 struct _intron
36 {
37 char chrom[50] ;
38 int start, end ;
39 int tindex ; // The index to the corresponding transcripts
40 } ;
41
42 struct _exon
43 {
44 char chrom[50] ;
45 int start, end ;
46 //int tindex ;
47 int soft ; // left-bit: left side is soft, right-bit: right side is soft
48 bool matched ;
49 } ;
50
51 int TranscriptComp( const void *p1, const void *p2 )
52 {
53 const struct _transcript *a = (struct _transcript *)p1 ;
54 const struct _transcript *b = ( struct _transcript *)p2 ;
55 int tmp = strcmp( a->chrom, b->chrom ) ;
56 if ( tmp != 0 )
57 return tmp ;
58 return a->exons[0].a - b->exons[0].a ;
59 }
60
61 int TranscriptComp_ByIntron( const void *p1, const void *p2 )
62 {
63 const struct _transcript *a = (struct _transcript *)p1 ;
64 const struct _transcript *b = ( struct _transcript *)p2 ;
65 if ( a->strand != b->strand )
66 return a->strand - b->strand ;
67 int tmp = strcmp( a->chrom, b->chrom ) ;
68 if ( tmp != 0 )
69 return tmp ;
70 if ( a->ecnt != b->ecnt )
71 return a->ecnt - b->ecnt ;
72 int i ;
73 for ( i = 0 ; i < a->ecnt - 1 ; ++i )
74 {
75 if ( a->exons[i].b != b->exons[i].b )
76 return a->exons[i].b - b->exons[i].b ;
77 if ( a->exons[i + 1].a != b->exons[i + 1].a )
78 return a->exons[i + 1].a - b->exons[i + 1].a ;
79 }
80
81 return 0 ; //strcmp( a->tid, b->tid ) ;
82 }
83
84 bool validChrom( char *chrom )
85 {
86 if ( !flagValidChr )
87 return true ;
88 if ( chrom[0] != 'c' || chrom[1] != 'h' || chrom[2] != 'r' )
89 return false ;
90 // only consider chr1-22,x,y,z
91 if ( chrom[3]=='x' || chrom[3]=='X'
92 || chrom[3] == 'y' || chrom[3] == 'Y'
93 || chrom[3] == 'm' || chrom[3] == 'M'
94 || ( chrom[3] >= '3' && chrom[3] <= '9' ) )
95 {
96 if ( chrom[4] == '\0' )
97 return true ;
98 else
99 return false ;
100 }
101 if ( chrom[3] == '1' )
102 {
103 if ( chrom[4] == '\0' )
104 return true ;
105 if ( chrom[4] >= '0' && chrom[4] <= '9' && chrom[5] == '\0' )
106 return true ;
107 return false ;
108 }
109 if ( chrom[3] == '2' )
110 {
111 if ( chrom[4] == '\0' )
112 return true ;
113 if ( chrom[4] >= '0' && chrom[4] <= '2' && chrom[5] == '\0' )
114 return true ;
115 return false ;
116 }
117 return false ;
118 }
119
120 void ReverseExonList( struct _pair *exons, int ecnt )
121 {
122 if ( ecnt < 2 || ( exons[0].a < exons[1].a ) )
123 return ;
124 int i, j ;
125 struct _pair tmp ;
126
127 for ( i = 0, j = ecnt - 1 ; i < j ; ++i, --j )
128 {
129 tmp = exons[i] ;
130 exons[i] = exons[j] ;
131 exons[j] = tmp ;
132 }
133 }
134
135 /**
136 Merge exons that are next to each other. Showed up in scripture.
137 */
138 int CleanExonList( struct _pair *exons, int ecnt )
139 {
140 int i, k ;
141 for ( i = 0 ; i < ecnt - 1 ; ++i )
142 if ( exons[i + 1].a - exons[i].b - 1 < 20 )
143 {
144 exons[i + 1].a = exons[i].a ;
145 exons[i].a = -1 ;
146 }
147 k = 0 ;
148 for ( i = 0 ; i < ecnt ; ++i )
149 {
150 if ( exons[i].a == -1 )
151 continue ;
152 exons[k] = exons[i] ;
153 ++k ;
154 }
155 return k ;
156 }
157
158 /**
159 Remove the duplicated intron chain in the list
160 */
161 int RemoveDuplicateIntronChain( struct _transcript *t, int tcnt )
162 {
163 int i, j, k ;
164
165 qsort( t, tcnt, sizeof( *t ), TranscriptComp_ByIntron ) ;
166 for ( i = 1 ; i < tcnt ; ++i )
167 {
168 k = i - 1 ;
169 while ( t[k].ecnt == -1 )
170 --k ;
171
172 if ( t[i].ecnt != t[k].ecnt || strcmp( t[i].chrom, t[k].chrom ) || t[i].strand != t[k].strand )
173 continue ;
174 for ( j = 0 ; j < t[i].ecnt - 1 ; ++j )
175 if ( t[i].exons[j].b != t[k].exons[j].b ||
176 t[i].exons[j + 1].a != t[k].exons[j + 1].a )
177 break ;
178 if ( j >= t[i].ecnt - 1 && t[i].ecnt != 1 )
179 t[i].ecnt = -1 ;
180 if ( t[i].ecnt == 1 && t[i].exons[0].a == t[k].exons[0].a &&
181 t[i].exons[0].b == t[k].exons[0].b )
182 t[i].ecnt = -1 ;
183 /*if ( t[i].ecnt == -1 )
184 {
185 printf( "%s <- %s\n", t[i].tid, t[k].tid ) ;
186 }*/
187 }
188
189 k = 0 ;
190 for ( i = 0 ; i < tcnt ; ++i )
191 {
192 if ( t[i].ecnt == -1 )
193 {
194 free( t[i].exons ) ;
195 continue ;
196 }
197 t[k] = t[i] ;
198 ++k ;
199 }
200 tcnt = k ;
201 return tcnt ;
202 }
203
204
205 double CompareTranscripts( const struct _transcript &ref, const struct _transcript &pred )
206 {
207 int i, j ;
208 struct _pair *refExons = ref.exons ;
209 int refECnt = ref.ecnt ;
210 struct _pair *predExons = pred.exons ;
211 int predECnt = pred.ecnt ;
212
213 // Prediction must be a "subset" of the reference
214 if ( refECnt < predECnt )
215 return -1 ;
216
217 // single exon case
218 if ( refECnt == 1 )
219 {
220 if ( refExons[0].b < predExons[0].a || refExons[0].a > predExons[0].b )
221 return -1 ;
222 else
223 return 1 ;
224 }
225
226 if ( predECnt == 1 )
227 {
228 for ( i = 0 ; i < refECnt ; ++i )
229 {
230 if ( refExons[i].b < predExons[0].a || refExons[i].a > predExons[0].b )
231 continue ;
232 else
233 {
234 /*if ( i == 0 && predExons[0].a >= refExons[0].a - relaxWidth )
235 return 0 ;
236 else if ( i == refECnt - 1 && predExons[0].b <= refExons[i].b + relaxWidth )
237 return 0 ;
238 else if ( i > 0 && i < refECnt - 1 && predExons[0].a >= refExons[i].a - relaxWidth &&
239 predExons[0].b <= refExons[i].b + relaxWidth )
240 return 0 ;
241
242 return -1 ;*/
243 return 0 ;
244 }
245 }
246 return -1 ;
247 }
248
249 if ( predECnt > 0 && ref.strand != pred.strand )
250 return -1 ;
251
252 // Test the intron chain
253 for ( i = 0 ; i < refECnt - 1 ; ++i )
254 if ( refExons[i].b == predExons[0].b )
255 break ;
256 if ( i >= refECnt - 1 )
257 return -1 ;
258 //if ( i != 0 && predExons[0].a < refExons[i].a - relaxWidth )
259 // return -1 ;
260
261 for ( j = 0 ; i < refECnt - 1 && j < predECnt - 1 ; ++i, ++j )
262 {
263 if ( refExons[i].b != predExons[j].b || refExons[i + 1].a != predExons[j + 1].a )
264 break ;
265 }
266 if ( j >= predECnt - 1 )
267 {
268 /*if ( i == refECnt - 1 ||
269 predExons[ predECnt - 1 ].b <= refExons[i].b + relaxWidth )
270 return (double)( predECnt - 1 ) / (double)( refECnt - 1 ) ;
271 else
272 return -1 ;*/
273 return (double)( predECnt - 1 ) / (double)( refECnt - 1 ) ;
274 }
275 else
276 return -1 ;
277 }
278
279 bool WithinRange( int a, int b, int w = 10 )
280 {
281 if ( a - b >= -w && a - b <= w )
282 return true ;
283 return false ;
284 }
285
286 int GetIntrons( struct _transcript *transcripts, int tcnt, struct _intron *introns )
287 {
288 int i, j, k, tag ;
289 int cnt = 0 ;
290 for ( i = 0 ; i < tcnt ; ++i )
291 {
292 for ( j = 0 ; j < transcripts[i].ecnt - 1 ; ++j )
293 {
294 bool flag = false ;
295
296 tag = 0 ;
297 for ( k = cnt - 1 ; k >= 0 ; --k )
298 {
299 if ( strcmp( introns[k].chrom, transcripts[i].chrom ) )
300 {
301 //printf( "hi\n" ) ;
302 tag = k + 1 ;
303 break ;
304 }
305 if ( introns[k].start < transcripts[i].exons[j].b )
306 {
307 tag = k + 1 ;
308 break ;
309 }
310 else if ( introns[k].start == transcripts[i].exons[j].b &&
311 introns[k].end == transcripts[i].exons[j + 1].a )
312 {
313 flag = true ;
314 break ;
315 }
316 }
317
318 if ( !flag )
319 {
320 for ( k = cnt ; k > tag ; --k )
321 introns[k] = introns[k - 1] ;
322 strcpy( introns[k].chrom, transcripts[i].chrom ) ;
323 introns[k].start = transcripts[i].exons[j].b ;
324 introns[k].end = transcripts[i].exons[j + 1].a ;
325 introns[k].tindex = i ;
326 ++cnt ;
327 }
328 }
329 }
330 return cnt ;
331 }
332
333 int GetExons( struct _transcript *transcripts, int tcnt, struct _exon *exons )
334 {
335 int i, j, k, tag ;
336 int cnt = 0 ;
337 for ( i = 0 ; i < tcnt ; ++i )
338 {
339 for ( j = 0 ; j < transcripts[i].ecnt ; ++j )
340 {
341 bool flag = false ;
342 int soft = 0 ;
343 if ( j == 0 )
344 soft = soft | 2 ;
345 if ( j == transcripts[i].ecnt - 1 )
346 soft = soft | 1 ;
347
348 tag = 0 ;
349 for ( k = cnt - 1 ; k >= 0 ; --k )
350 {
351 if ( strcmp( exons[k].chrom, transcripts[i].chrom ) )
352 {
353 //printf( "hi\n" ) ;
354 tag = k + 1 ;
355 break ;
356 }
357
358 if ( exons[k].start < transcripts[i].exons[j].a )
359 {
360 tag = k + 1 ;
361 break ;
362 }
363 else if ( exons[k].start == transcripts[i].exons[j].a &&
364 exons[k].end == transcripts[i].exons[j].b &&
365 exons[k].soft == soft )
366 {
367 flag = true ;
368 break ;
369 }
370 }
371
372 if ( !flag )
373 {
374 for ( k = cnt ; k > tag ; --k )
375 exons[k] = exons[k - 1] ;
376 strcpy( exons[k].chrom, transcripts[i].chrom ) ;
377 exons[k].start = transcripts[i].exons[j].a ;
378 exons[k].end = transcripts[i].exons[j].b ;
379 //exons[k].tindex = i ;
380 exons[k].soft = soft ;
381 exons[k].matched = false ;
382 ++cnt ;
383 }
384 }
385 }
386 return cnt ;
387 }
388
389 //chr1 HAVANA transcript 320162 324461 . + . gene_id "ENSG00000237094.6"; transcript_id "ENST00000423728.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP4-669L17.10"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP4-669L17.10-002"; level 2; havana_gene "OTTHUMG00000156968.4"; havana_transcript "OTTHUMT00000346879.1";
390 int main( int argc, char *argv[] )
391 {
392 FILE *fp ;
393 struct _transcript *ref, *pred ;
394 struct _info *refInfo, *predInfo ;
395
396 int refCnt, predCnt ;
397 char line[10000], filename[10000] ;
398 struct _pair tmpExons[10000] ;
399 int tmpECnt = 0 ;
400
401 char chrom[50], tool[20], type[40], strand[3] ;
402 char tid[50] ;
403 char buffer[50] ;
404 int start, end ;
405
406 int i, j, k, tag ;
407
408 if ( argc == 1 )
409 {
410 printf( "Compare the reference transcripts and predicted transcripts.\n"
411 "Format: ./grader ref.gtf prediction.gtf\n" ) ;
412 exit( 1 ) ;
413 }
414 // Parse the arguments
415 for ( i = 3 ; i < argc ; ++i )
416 {
417 if ( !strcmp( argv[i], "-M" ) )
418 flagMultiExonOnly = true ;
419 else if ( !strcmp( argv[i], "-ac" ) )
420 flagValidChr = false ;
421 else
422 {
423 printf( "Unknown argument: %s\n", argv[i] ) ;
424 exit( 1 ) ;
425 }
426 }
427
428 /*========================================================================================
429 Stage 1: Read the transcripts from the reference and prediction's GTF files.
430 ========================================================================================*/
431 fp = NULL ;
432 fp = fopen( argv[1], "r" ) ;
433 if ( fp == NULL )
434 {
435 printf( "Could not open file %s.\n", argv[1] ) ;
436 exit( 1 ) ;
437 }
438 refCnt = 0 ;
439 ref = ( struct _transcript *)malloc( MAX_TXPT * sizeof( struct _transcript ) ) ;
440
441 strcpy( ref[0].tid, "tmp_-1" ) ;
442 while ( fgets( line, sizeof( line ), fp ) != NULL )
443 {
444 if ( line[0] == '#' )
445 continue ;
446
447 sscanf( line, "%s %s %s %d %d %s %s", chrom, tool, type, &start, &end, buffer, strand ) ;
448
449 if ( strcmp( type, "exon" ) || !validChrom( chrom ) )
450 continue ;
451
452 char *p = strstr( line, "transcript_id" ) ;
453 for ( ; *p != ' ' ; ++p )
454 ;
455 p += 2 ;
456 sscanf( p, "%s", tid ) ;
457 //printf( "+%s %d\n", tid, strlen( tid ) ) ;
458 p = tid + strlen( tid ) ;
459 while ( *p != '\"' )
460 --p ;
461 *p = '\0' ;
462 //printf( "%s\n", tid ) ;
463 if ( strcmp( tid, ref[ refCnt ].tid ) && tmpECnt )
464 {
465 ReverseExonList( tmpExons, tmpECnt ) ;
466 tmpECnt = CleanExonList( tmpExons, tmpECnt ) ;
467 if ( tmpECnt > 1 || !flagMultiExonOnly )
468 {
469 ref[ refCnt ].ecnt = tmpECnt ;
470 ref[ refCnt ].exons = ( struct _pair * )malloc( sizeof( struct _pair ) * tmpECnt ) ;
471 memcpy( ref[ refCnt ].exons, tmpExons, sizeof( tmpExons[0] ) * tmpECnt ) ;
472 ++refCnt ;
473 }
474 tmpECnt = 0 ;
475 }
476
477 tmpExons[ tmpECnt ].a = start ;
478 tmpExons[ tmpECnt ].b = end ;
479 ++tmpECnt ;
480
481 ref[ refCnt ].strand = strand[0] ;
482 strcpy( ref[ refCnt ].chrom, chrom ) ;
483 strcpy( ref[ refCnt ].tid, tid ) ;
484 }
485 if ( tmpECnt != 0 )
486 {
487 ReverseExonList( tmpExons, tmpECnt ) ;
488 tmpECnt = CleanExonList( tmpExons, tmpECnt ) ;
489 if ( tmpECnt > 1 || !flagMultiExonOnly )
490 {
491 ref[ refCnt ].ecnt = tmpECnt ;
492 ref[ refCnt ].exons = ( struct _pair * )malloc( sizeof( struct _pair ) * tmpECnt ) ;
493 memcpy( ref[ refCnt ].exons, tmpExons, sizeof( tmpExons[0] ) * tmpECnt ) ;
494 ++refCnt ;
495 }
496 tmpECnt = 0 ;
497 }
498 fclose( fp ) ;
499
500 fp = NULL ;
501 fp = fopen( argv[2], "r" ) ;
502 if ( fp == NULL )
503 {
504 printf( "Could not open file %s.\n", argv[2] ) ;
505 exit( 1 ) ;
506 }
507 predCnt = 0 ;
508 pred = ( struct _transcript *)malloc( MAX_TXPT * sizeof( struct _transcript ) ) ;
509
510 strcpy( pred[0].tid, "tmp_-1" ) ;
511 tmpECnt = 0 ;
512 while ( fgets( line, sizeof( line ), fp ) != NULL )
513 {
514 if ( line[0] == '#' )
515 continue ;
516 sscanf( line, "%s %s %s %d %d %s %s", chrom, tool, type, &start, &end, buffer, strand ) ;
517
518 if ( strcmp( type, "exon" ) || !validChrom( chrom ) )
519 continue ;
520
521 char *p = strstr( line, "transcript_id" ) ;
522 //char *p = strstr( line, "gene_name" ) ;
523 for ( ; *p != ' ' ; ++p )
524 ;
525 p += 2 ;
526 sscanf( p, "%s", tid ) ;
527 //tid[ strlen( tid ) - 2 ] = '\0' ;
528 p = tid + strlen( tid ) ;
529 while ( *p != '\"' )
530 --p ;
531 *p = '\0' ;
532
533 if ( strcmp( tid, pred[ predCnt ].tid ) && tmpECnt )
534 {
535 ReverseExonList( tmpExons, tmpECnt ) ;
536 tmpECnt = CleanExonList( tmpExons, tmpECnt ) ;
537 if ( tmpECnt > 1 || !flagMultiExonOnly )
538 {
539 pred[ predCnt ].ecnt = tmpECnt ;
540 pred[ predCnt ].exons = ( struct _pair * )malloc( sizeof( struct _pair ) * tmpECnt ) ;
541 memcpy( pred[ predCnt ].exons, tmpExons, sizeof( tmpExons[0] ) * tmpECnt ) ;
542 ++predCnt ;
543 }
544 tmpECnt = 0 ;
545 }
546
547 tmpExons[ tmpECnt ].a = start ;
548 tmpExons[ tmpECnt ].b = end ;
549 ++tmpECnt ;
550
551 pred[ predCnt ].strand = strand[0] ;
552 strcpy( pred[ predCnt ].chrom, chrom ) ;
553 strcpy( pred[ predCnt ].tid, tid ) ;
554 }
555 if ( tmpECnt > 0 )
556 {
557 ReverseExonList( tmpExons, tmpECnt ) ;
558 tmpECnt = CleanExonList( tmpExons, tmpECnt ) ;
559 if ( tmpECnt > 1 || !flagMultiExonOnly )
560 {
561 pred[ predCnt ].ecnt = tmpECnt ;
562 pred[ predCnt ].exons = ( struct _pair * )malloc( sizeof( struct _pair ) * tmpECnt ) ;
563 memcpy( pred[ predCnt ].exons, tmpExons, sizeof( tmpExons[0] ) * tmpECnt ) ;
564 ++predCnt ;
565 }
566 tmpECnt = 0 ;
567 }
568
569 refCnt = RemoveDuplicateIntronChain( ref, refCnt ) ;
570 //printf( "predCnt = %d\n", predCnt ) ;
571 predCnt = RemoveDuplicateIntronChain( pred, predCnt ) ;
572 //printf( "predCnt = %d\n", predCnt ) ;
573 qsort( ref, refCnt, sizeof( ref[0] ), TranscriptComp ) ;
574 qsort( pred, predCnt, sizeof( pred[0] ), TranscriptComp ) ;
575
576
577 /*========================================================================================
578 Stage 2: Compute the recall and precision.
579 ========================================================================================*/
580 refInfo = ( struct _info * )malloc( refCnt * sizeof( *refInfo ) ) ;
581 predInfo = ( struct _info * )malloc( predCnt * sizeof( *predInfo ) ) ;
582 for ( i = 0 ; i < refCnt ; ++i )
583 {
584 refInfo[i].coverage = -1 ;
585 refInfo[i].byId = -1 ;
586 #if DEBUG
587 printf( "=%d %s %s %d\n", i, ref[i].chrom, ref[i].tid, ref[i].ecnt ) ;
588 for ( j = 0 ; j < ref[i].ecnt ; ++j )
589 printf( "%d %d\n", ref[i].exons[j].a, ref[i].exons[j].b ) ;
590 #endif
591
592 }
593 for ( i = 0 ; i < predCnt ; ++i )
594 {
595 predInfo[i].coverage = -1 ;
596 predInfo[i].byId = -1 ;
597 #if DEBUG
598 printf( "#%d %s %s %d\n", i, pred[i].chrom, pred[i].tid, pred[i].ecnt ) ;
599 for ( j = 0 ; j < pred[i].ecnt ; ++j )
600 printf( "%d %d\n", pred[i].exons[j].a, pred[i].exons[j].b ) ;
601 #endif
602 }
603 tag = 0 ;
604 for ( i = 0 ; i < predCnt ; ++i )
605 {
606 while ( tag < refCnt &&
607 ( ( strcmp( ref[tag].chrom, pred[i].chrom ) < 0 ) ||
608 ( !strcmp( ref[tag].chrom, pred[i].chrom ) &&
609 ref[tag].exons[ ref[tag].ecnt - 1 ].b < pred[i].exons[0].a - relaxWidth ) ) )
610 ++tag ;
611
612 // if ( i == 16474 )
613 // printf( "%d %d %d", i, tag, refCnt ) ;
614 for ( j = tag ; j < refCnt && ref[j].exons[0].a <= pred[i].exons[ pred[i].ecnt - 1 ].b + relaxWidth && !strcmp( ref[j].chrom, pred[i].chrom ); ++j )
615 {
616 /*if ( !strcmp( pred[i].tid, "CUFF.4256.1" ) )
617 {
618 printf( "%s ? %s\n", pred[i].tid, ref[j].tid ) ;
619 }*/
620 // if ( i == 16474 )
621 // {
622 // printf( "%d %d\n", i, j ) ;
623 // }
624 double coverage = CompareTranscripts( ref[j], pred[i] ) ;
625 if ( coverage > refInfo[j].coverage )
626 {
627 refInfo[j].coverage = coverage ;
628 refInfo[j].byId = i ;
629 }
630 if ( coverage > predInfo[i].coverage )
631 {
632 predInfo[i].coverage = coverage ;
633 predInfo[i].byId = j ;
634 }
635 }
636 }
637
638 /*========================================================================================
639 Stage 3: Dump the information into output files.
640 ========================================================================================*/
641 //sprintf( filename, "grader.%s.recall", argv[1] ) ;
642 sprintf( filename, "grader.recall" ) ;
643 fp = fopen( filename, "w" ) ;
644 for ( i = 0 ; i < refCnt ; ++i )
645 {
646 fprintf( fp, "%s\t%d\t%lf\t%s\n", ref[i].tid, ref[i].ecnt, refInfo[i].coverage,
647 refInfo[i].byId == -1 ? "-" : pred[ refInfo[i].byId ].tid ) ;
648 }
649 fclose( fp ) ;
650
651 //sprintf( filename, "grader.%s.precision", argv[2] ) ;
652 sprintf( filename, "grader.precision" ) ;
653 fp = fopen( filename, "w" ) ;
654 for ( i = 0 ; i < predCnt ; ++i )
655 {
656 fprintf( fp, "%s\t%d\t%lf\t%s\n", pred[i].tid, pred[i].ecnt, predInfo[i].coverage,
657 predInfo[i].byId == -1 ? "-" : ref[predInfo[i].byId].tid ) ;
658 }
659 fclose( fp ) ;
660
661 // print the summary to the stdout
662 const int binCnt = 20 ;
663 int bins[binCnt + 1] ;
664 memset( bins, 0, sizeof( bins ) ) ;
665 k = 0 ;
666 for ( i = 0 ; i < refCnt ; ++i )
667 {
668 if ( refInfo[i].coverage == -1 )
669 continue ;
670 ++bins[ (int)( refInfo[i].coverage * binCnt ) ] ;
671 ++k ;
672 }
673 for ( i = 0 ; i <= binCnt ; ++i )
674 {
675 printf( "recall %lf %lf %d/%d\n", (double)i / binCnt, (double)k / refCnt, k, refCnt ) ;
676 k -= bins[i] ;
677 }
678
679 memset( bins, 0, sizeof( bins ) ) ;
680 k = 0 ;
681 for ( i = 0 ; i < predCnt ; ++i )
682 {
683 if ( predInfo[i].coverage == -1 )
684 continue ;
685 ++bins[ (int)( predInfo[i].coverage * binCnt ) ] ;
686 ++k ;
687 }
688 for ( i = 0 ; i <= binCnt ; ++i )
689 {
690 printf( "precision %lf %lf %d/%d\n", (double)i / binCnt, (double)k / predCnt, k, predCnt ) ;
691 k -= bins[i] ;
692 }
693
694 /*========================================================================================
695 Stage 4: Evaluations for introns.
696 ========================================================================================*/
697 struct _intron *refIntrons = ( struct _intron * )malloc( sizeof( struct _intron ) * MAX_TXPT ) ;
698 int riCnt = GetIntrons( ref, refCnt, refIntrons ) ;
699 struct _intron *predIntrons = ( struct _intron * )malloc( sizeof( struct _intron ) * MAX_TXPT ) ;
700 int piCnt = GetIntrons( pred, predCnt, predIntrons ) ;
701 int matchedIntron = 0 ;
702 /*for ( i = 0 ; i < riCnt ; ++i )
703 {
704 printf( "%d %s %d %d\n", i, refIntrons[i].chrom, refIntrons[i].start, refIntrons[i].end ) ;
705 }*/
706
707 fp = fopen( "grader.intron", "w" ) ;
708 tag = 0 ;
709 for ( i = 0 ; i < piCnt ; ++i )
710 {
711 bool flag = false ;
712 while ( 1 )
713 {
714 if ( tag >= riCnt )
715 break ;
716 int tmp = strcmp( refIntrons[tag].chrom, predIntrons[i].chrom ) ;
717 if ( tmp < 0 || ( tmp == 0 && refIntrons[tag].start < predIntrons[i].start ) )
718 ++tag ;
719 else
720 break ;
721 }
722 //printf( "%d (%s %d %d) %d=>", i, predIntrons[i].chrom, predIntrons[i].start, predIntrons[i].end, tag ) ;
723 /*if ( predIntrons[i].start == 1613457 )
724 printf( "%d %d\n", tag, riCnt ) ;*/
725 for ( j = tag ; j < riCnt ; ++j )
726 {
727 if ( strcmp( refIntrons[j].chrom, predIntrons[i].chrom ) > 0 ||
728 refIntrons[j].start > predIntrons[i].start /*+ 10*/ )
729 break ;
730 if ( refIntrons[j].start == predIntrons[i].start &&
731 refIntrons[j].end == predIntrons[i].end )
732 //if ( WithinRange( refIntrons[j].start, predIntrons[i].start ) &&
733 // WithinRange( refIntrons[j].end, predIntrons[i].end ) )
734 {
735 ++matchedIntron ;
736 flag = true ;
737 break ;
738 }
739 }
740 //printf( "%d\n", j ) ;
741 if ( flag )
742 fprintf( fp, "Y\t%s\t%d\t%d\t%s\n", predIntrons[i].chrom, predIntrons[i].start, predIntrons[i].end, pred[ predIntrons[i].tindex ].tid ) ;
743 else
744 fprintf( fp, "N\t%s\t%d\t%d\t%s\n", predIntrons[i].chrom, predIntrons[i].start, predIntrons[i].end, pred[ predIntrons[i].tindex ].tid ) ;
745 }
746 fclose( fp ) ;
747 printf( "\n" ) ;
748 printf( "Intron evaluation\n") ;
749 //printf( "\tmatched\t%d\n", matchedIntron ) ;
750 printf( "\trecall\t%lf\t%d/%d\n", matchedIntron / (double)riCnt, matchedIntron, riCnt ) ;
751 printf( "\tprecision\t%lf\t%d/%d\n", matchedIntron / (double)piCnt, matchedIntron, piCnt ) ;
752
753 /*========================================================================================
754 Stage 4: Evaluations for exons.
755 ========================================================================================*/
756 struct _exon *refExons = ( struct _exon * )malloc( sizeof( struct _exon ) * MAX_TXPT ) ;
757 int reCnt = GetExons( ref, refCnt, refExons ) ;
758 struct _exon *predExons = ( struct _exon * )malloc( sizeof( struct _exon ) * MAX_TXPT ) ;
759 int peCnt = GetExons( pred, predCnt, predExons ) ;
760 int matchedExons = 0, matchedInternalExons = 0 ;
761 int refInternalExons = 0, predInternalExons = 0 ;
762
763 for ( i = 0 ; i < reCnt ; ++i )
764 {
765 //printf( "%d %d\n", refExons[i].start, refExons[i].end ) ;
766 if ( refExons[i].soft == 0 )
767 ++refInternalExons ;
768 //if ( refExons[i].soft == 3 )
769 // printf( "hi\n" ) ;
770 }
771 for ( i = 0 ; i < peCnt ; ++i )
772 if ( predExons[i].soft == 0 )
773 ++predInternalExons ;
774 tag = 0 ;
775 for ( i = 0 ; i < peCnt ; ++i )
776 {
777 bool flag = false ;
778 while ( 1 )
779 {
780 if ( tag >= reCnt )
781 break ;
782 int tmp = strcmp( refExons[tag].chrom, predExons[i].chrom ) ;
783 if ( tmp < 0 || ( tmp == 0 && refExons[tag].end < predExons[i].start ) )
784 ++tag ;
785 else
786 break ;
787 }
788 for ( j = tag ; j < reCnt ; ++j )
789 {
790 if ( strcmp( refExons[j].chrom, predExons[i].chrom ) > 0 ||
791 refExons[j].start > predExons[i].end /*+ 10*/ )
792 break ;
793 if ( refExons[j].soft != predExons[i].soft
794 || refExons[j].end < predExons[i].start )
795 continue ;
796
797 if ( refExons[j].start == predExons[i].start &&
798 refExons[j].end == predExons[i].end && predExons[i].soft == 0 )
799 //if ( WithinRange( refIntrons[j].start, predIntrons[i].start ) &&
800 // WithinRange( refIntrons[j].end, predIntrons[i].end ) )
801 {
802 refExons[j].matched = true ;
803 predExons[i].matched = true ;
804 ++matchedInternalExons ;
805 flag = true ;
806 break ;
807 }
808 else if ( ( refExons[j].start == predExons[i].start || ( predExons[i].soft & 2 ) )
809 && ( refExons[j].end == predExons[i].end || (predExons[i].soft & 1 ) ) )
810 {
811 refExons[j].matched = true ;
812 predExons[i].matched = true ;
813 flag = true ;
814 //break ;
815 }
816 }
817 //printf( "%d\n", j ) ;
818 }
819
820 printf( "\n" ) ;
821 printf( "Exon evaluation\n" ) ;
822 //printf( "\tmatched\t%d\n", matchedExons ) ;
823 matchedExons = 0 ;
824 for ( i = 0 ; i < reCnt ; ++i )
825 if ( refExons[i].matched )
826 ++matchedExons ;
827 printf( "\trecall\t%lf\t%d/%d\n", (double)matchedExons / reCnt, matchedExons, reCnt ) ;
828 matchedExons = 0 ;
829 for ( i = 0 ; i < peCnt ; ++i )
830 if ( predExons[i].matched )
831 ++matchedExons ;
832 printf( "\tprecision\t%lf\t%d/%d\n", (double)matchedExons / peCnt, matchedExons, peCnt ) ;
833
834 printf( "\n" ) ;
835 printf( "Internal exon evaluation\n" ) ;
836 //printf( "\tmatched\t%d\n", matchedInternalExons ) ;
837 printf( "\trecall\t%lf\t%d/%d\n", (double)matchedInternalExons / refInternalExons, matchedInternalExons, refInternalExons ) ;
838 printf( "\tprecision\t%lf\t%d/%d\n", (double)matchedInternalExons / predInternalExons, matchedInternalExons, predInternalExons ) ;
839 return 0 ;
840 }