0
|
1 // Compare the reference and the predictions
|
|
2 // Format: ./a.out ref.gtf prediction.gtf
|
|
3 #include <stdio.h>
|
|
4 #include <string.h>
|
|
5 #include <stdlib.h>
|
|
6
|
|
7 #define MAX_TXPT 2000000
|
|
8
|
|
9 #define DEBUG 0
|
|
10
|
|
11 bool flagMultiExonOnly = false ;
|
|
12 bool flagValidChr = true ;
|
|
13 int relaxWidth = 20 ;
|
|
14
|
|
15 struct _pair
|
|
16 {
|
|
17 int a, b ;
|
|
18 } ;
|
|
19
|
|
20 struct _transcript
|
|
21 {
|
|
22 char tid[50] ;
|
|
23 char chrom[50] ;
|
|
24 char strand ;
|
|
25 struct _pair *exons ;
|
|
26 int ecnt ;
|
|
27 } ;
|
|
28
|
|
29 struct _info
|
|
30 {
|
|
31 double coverage ;
|
|
32 int byId ; // It gets this coverage by comparing with byId.
|
|
33 } ;
|
|
34
|
|
35 struct _intron
|
|
36 {
|
|
37 char chrom[50] ;
|
|
38 int start, end ;
|
|
39 int tindex ; // The index to the corresponding transcripts
|
|
40 } ;
|
|
41
|
|
42 struct _exon
|
|
43 {
|
|
44 char chrom[50] ;
|
|
45 int start, end ;
|
|
46 //int tindex ;
|
|
47 int soft ; // left-bit: left side is soft, right-bit: right side is soft
|
|
48 bool matched ;
|
|
49 } ;
|
|
50
|
|
51 int TranscriptComp( const void *p1, const void *p2 )
|
|
52 {
|
|
53 const struct _transcript *a = (struct _transcript *)p1 ;
|
|
54 const struct _transcript *b = ( struct _transcript *)p2 ;
|
|
55 int tmp = strcmp( a->chrom, b->chrom ) ;
|
|
56 if ( tmp != 0 )
|
|
57 return tmp ;
|
|
58 return a->exons[0].a - b->exons[0].a ;
|
|
59 }
|
|
60
|
|
61 int TranscriptComp_ByIntron( const void *p1, const void *p2 )
|
|
62 {
|
|
63 const struct _transcript *a = (struct _transcript *)p1 ;
|
|
64 const struct _transcript *b = ( struct _transcript *)p2 ;
|
|
65 if ( a->strand != b->strand )
|
|
66 return a->strand - b->strand ;
|
|
67 int tmp = strcmp( a->chrom, b->chrom ) ;
|
|
68 if ( tmp != 0 )
|
|
69 return tmp ;
|
|
70 if ( a->ecnt != b->ecnt )
|
|
71 return a->ecnt - b->ecnt ;
|
|
72 int i ;
|
|
73 for ( i = 0 ; i < a->ecnt - 1 ; ++i )
|
|
74 {
|
|
75 if ( a->exons[i].b != b->exons[i].b )
|
|
76 return a->exons[i].b - b->exons[i].b ;
|
|
77 if ( a->exons[i + 1].a != b->exons[i + 1].a )
|
|
78 return a->exons[i + 1].a - b->exons[i + 1].a ;
|
|
79 }
|
|
80
|
|
81 return 0 ; //strcmp( a->tid, b->tid ) ;
|
|
82 }
|
|
83
|
|
84 bool validChrom( char *chrom )
|
|
85 {
|
|
86 if ( !flagValidChr )
|
|
87 return true ;
|
|
88 if ( chrom[0] != 'c' || chrom[1] != 'h' || chrom[2] != 'r' )
|
|
89 return false ;
|
|
90 // only consider chr1-22,x,y,z
|
|
91 if ( chrom[3]=='x' || chrom[3]=='X'
|
|
92 || chrom[3] == 'y' || chrom[3] == 'Y'
|
|
93 || chrom[3] == 'm' || chrom[3] == 'M'
|
|
94 || ( chrom[3] >= '3' && chrom[3] <= '9' ) )
|
|
95 {
|
|
96 if ( chrom[4] == '\0' )
|
|
97 return true ;
|
|
98 else
|
|
99 return false ;
|
|
100 }
|
|
101 if ( chrom[3] == '1' )
|
|
102 {
|
|
103 if ( chrom[4] == '\0' )
|
|
104 return true ;
|
|
105 if ( chrom[4] >= '0' && chrom[4] <= '9' && chrom[5] == '\0' )
|
|
106 return true ;
|
|
107 return false ;
|
|
108 }
|
|
109 if ( chrom[3] == '2' )
|
|
110 {
|
|
111 if ( chrom[4] == '\0' )
|
|
112 return true ;
|
|
113 if ( chrom[4] >= '0' && chrom[4] <= '2' && chrom[5] == '\0' )
|
|
114 return true ;
|
|
115 return false ;
|
|
116 }
|
|
117 return false ;
|
|
118 }
|
|
119
|
|
120 void ReverseExonList( struct _pair *exons, int ecnt )
|
|
121 {
|
|
122 if ( ecnt < 2 || ( exons[0].a < exons[1].a ) )
|
|
123 return ;
|
|
124 int i, j ;
|
|
125 struct _pair tmp ;
|
|
126
|
|
127 for ( i = 0, j = ecnt - 1 ; i < j ; ++i, --j )
|
|
128 {
|
|
129 tmp = exons[i] ;
|
|
130 exons[i] = exons[j] ;
|
|
131 exons[j] = tmp ;
|
|
132 }
|
|
133 }
|
|
134
|
|
135 /**
|
|
136 Merge exons that are next to each other. Showed up in scripture.
|
|
137 */
|
|
138 int CleanExonList( struct _pair *exons, int ecnt )
|
|
139 {
|
|
140 int i, k ;
|
|
141 for ( i = 0 ; i < ecnt - 1 ; ++i )
|
|
142 if ( exons[i + 1].a - exons[i].b - 1 < 20 )
|
|
143 {
|
|
144 exons[i + 1].a = exons[i].a ;
|
|
145 exons[i].a = -1 ;
|
|
146 }
|
|
147 k = 0 ;
|
|
148 for ( i = 0 ; i < ecnt ; ++i )
|
|
149 {
|
|
150 if ( exons[i].a == -1 )
|
|
151 continue ;
|
|
152 exons[k] = exons[i] ;
|
|
153 ++k ;
|
|
154 }
|
|
155 return k ;
|
|
156 }
|
|
157
|
|
158 /**
|
|
159 Remove the duplicated intron chain in the list
|
|
160 */
|
|
161 int RemoveDuplicateIntronChain( struct _transcript *t, int tcnt )
|
|
162 {
|
|
163 int i, j, k ;
|
|
164
|
|
165 qsort( t, tcnt, sizeof( *t ), TranscriptComp_ByIntron ) ;
|
|
166 for ( i = 1 ; i < tcnt ; ++i )
|
|
167 {
|
|
168 k = i - 1 ;
|
|
169 while ( t[k].ecnt == -1 )
|
|
170 --k ;
|
|
171
|
|
172 if ( t[i].ecnt != t[k].ecnt || strcmp( t[i].chrom, t[k].chrom ) || t[i].strand != t[k].strand )
|
|
173 continue ;
|
|
174 for ( j = 0 ; j < t[i].ecnt - 1 ; ++j )
|
|
175 if ( t[i].exons[j].b != t[k].exons[j].b ||
|
|
176 t[i].exons[j + 1].a != t[k].exons[j + 1].a )
|
|
177 break ;
|
|
178 if ( j >= t[i].ecnt - 1 && t[i].ecnt != 1 )
|
|
179 t[i].ecnt = -1 ;
|
|
180 if ( t[i].ecnt == 1 && t[i].exons[0].a == t[k].exons[0].a &&
|
|
181 t[i].exons[0].b == t[k].exons[0].b )
|
|
182 t[i].ecnt = -1 ;
|
|
183 /*if ( t[i].ecnt == -1 )
|
|
184 {
|
|
185 printf( "%s <- %s\n", t[i].tid, t[k].tid ) ;
|
|
186 }*/
|
|
187 }
|
|
188
|
|
189 k = 0 ;
|
|
190 for ( i = 0 ; i < tcnt ; ++i )
|
|
191 {
|
|
192 if ( t[i].ecnt == -1 )
|
|
193 {
|
|
194 free( t[i].exons ) ;
|
|
195 continue ;
|
|
196 }
|
|
197 t[k] = t[i] ;
|
|
198 ++k ;
|
|
199 }
|
|
200 tcnt = k ;
|
|
201 return tcnt ;
|
|
202 }
|
|
203
|
|
204
|
|
205 double CompareTranscripts( const struct _transcript &ref, const struct _transcript &pred )
|
|
206 {
|
|
207 int i, j ;
|
|
208 struct _pair *refExons = ref.exons ;
|
|
209 int refECnt = ref.ecnt ;
|
|
210 struct _pair *predExons = pred.exons ;
|
|
211 int predECnt = pred.ecnt ;
|
|
212
|
|
213 // Prediction must be a "subset" of the reference
|
|
214 if ( refECnt < predECnt )
|
|
215 return -1 ;
|
|
216
|
|
217 // single exon case
|
|
218 if ( refECnt == 1 )
|
|
219 {
|
|
220 if ( refExons[0].b < predExons[0].a || refExons[0].a > predExons[0].b )
|
|
221 return -1 ;
|
|
222 else
|
|
223 return 1 ;
|
|
224 }
|
|
225
|
|
226 if ( predECnt == 1 )
|
|
227 {
|
|
228 for ( i = 0 ; i < refECnt ; ++i )
|
|
229 {
|
|
230 if ( refExons[i].b < predExons[0].a || refExons[i].a > predExons[0].b )
|
|
231 continue ;
|
|
232 else
|
|
233 {
|
|
234 /*if ( i == 0 && predExons[0].a >= refExons[0].a - relaxWidth )
|
|
235 return 0 ;
|
|
236 else if ( i == refECnt - 1 && predExons[0].b <= refExons[i].b + relaxWidth )
|
|
237 return 0 ;
|
|
238 else if ( i > 0 && i < refECnt - 1 && predExons[0].a >= refExons[i].a - relaxWidth &&
|
|
239 predExons[0].b <= refExons[i].b + relaxWidth )
|
|
240 return 0 ;
|
|
241
|
|
242 return -1 ;*/
|
|
243 return 0 ;
|
|
244 }
|
|
245 }
|
|
246 return -1 ;
|
|
247 }
|
|
248
|
|
249 if ( predECnt > 0 && ref.strand != pred.strand )
|
|
250 return -1 ;
|
|
251
|
|
252 // Test the intron chain
|
|
253 for ( i = 0 ; i < refECnt - 1 ; ++i )
|
|
254 if ( refExons[i].b == predExons[0].b )
|
|
255 break ;
|
|
256 if ( i >= refECnt - 1 )
|
|
257 return -1 ;
|
|
258 //if ( i != 0 && predExons[0].a < refExons[i].a - relaxWidth )
|
|
259 // return -1 ;
|
|
260
|
|
261 for ( j = 0 ; i < refECnt - 1 && j < predECnt - 1 ; ++i, ++j )
|
|
262 {
|
|
263 if ( refExons[i].b != predExons[j].b || refExons[i + 1].a != predExons[j + 1].a )
|
|
264 break ;
|
|
265 }
|
|
266 if ( j >= predECnt - 1 )
|
|
267 {
|
|
268 /*if ( i == refECnt - 1 ||
|
|
269 predExons[ predECnt - 1 ].b <= refExons[i].b + relaxWidth )
|
|
270 return (double)( predECnt - 1 ) / (double)( refECnt - 1 ) ;
|
|
271 else
|
|
272 return -1 ;*/
|
|
273 return (double)( predECnt - 1 ) / (double)( refECnt - 1 ) ;
|
|
274 }
|
|
275 else
|
|
276 return -1 ;
|
|
277 }
|
|
278
|
|
279 bool WithinRange( int a, int b, int w = 10 )
|
|
280 {
|
|
281 if ( a - b >= -w && a - b <= w )
|
|
282 return true ;
|
|
283 return false ;
|
|
284 }
|
|
285
|
|
286 int GetIntrons( struct _transcript *transcripts, int tcnt, struct _intron *introns )
|
|
287 {
|
|
288 int i, j, k, tag ;
|
|
289 int cnt = 0 ;
|
|
290 for ( i = 0 ; i < tcnt ; ++i )
|
|
291 {
|
|
292 for ( j = 0 ; j < transcripts[i].ecnt - 1 ; ++j )
|
|
293 {
|
|
294 bool flag = false ;
|
|
295
|
|
296 tag = 0 ;
|
|
297 for ( k = cnt - 1 ; k >= 0 ; --k )
|
|
298 {
|
|
299 if ( strcmp( introns[k].chrom, transcripts[i].chrom ) )
|
|
300 {
|
|
301 //printf( "hi\n" ) ;
|
|
302 tag = k + 1 ;
|
|
303 break ;
|
|
304 }
|
|
305 if ( introns[k].start < transcripts[i].exons[j].b )
|
|
306 {
|
|
307 tag = k + 1 ;
|
|
308 break ;
|
|
309 }
|
|
310 else if ( introns[k].start == transcripts[i].exons[j].b &&
|
|
311 introns[k].end == transcripts[i].exons[j + 1].a )
|
|
312 {
|
|
313 flag = true ;
|
|
314 break ;
|
|
315 }
|
|
316 }
|
|
317
|
|
318 if ( !flag )
|
|
319 {
|
|
320 for ( k = cnt ; k > tag ; --k )
|
|
321 introns[k] = introns[k - 1] ;
|
|
322 strcpy( introns[k].chrom, transcripts[i].chrom ) ;
|
|
323 introns[k].start = transcripts[i].exons[j].b ;
|
|
324 introns[k].end = transcripts[i].exons[j + 1].a ;
|
|
325 introns[k].tindex = i ;
|
|
326 ++cnt ;
|
|
327 }
|
|
328 }
|
|
329 }
|
|
330 return cnt ;
|
|
331 }
|
|
332
|
|
333 int GetExons( struct _transcript *transcripts, int tcnt, struct _exon *exons )
|
|
334 {
|
|
335 int i, j, k, tag ;
|
|
336 int cnt = 0 ;
|
|
337 for ( i = 0 ; i < tcnt ; ++i )
|
|
338 {
|
|
339 for ( j = 0 ; j < transcripts[i].ecnt ; ++j )
|
|
340 {
|
|
341 bool flag = false ;
|
|
342 int soft = 0 ;
|
|
343 if ( j == 0 )
|
|
344 soft = soft | 2 ;
|
|
345 if ( j == transcripts[i].ecnt - 1 )
|
|
346 soft = soft | 1 ;
|
|
347
|
|
348 tag = 0 ;
|
|
349 for ( k = cnt - 1 ; k >= 0 ; --k )
|
|
350 {
|
|
351 if ( strcmp( exons[k].chrom, transcripts[i].chrom ) )
|
|
352 {
|
|
353 //printf( "hi\n" ) ;
|
|
354 tag = k + 1 ;
|
|
355 break ;
|
|
356 }
|
|
357
|
|
358 if ( exons[k].start < transcripts[i].exons[j].a )
|
|
359 {
|
|
360 tag = k + 1 ;
|
|
361 break ;
|
|
362 }
|
|
363 else if ( exons[k].start == transcripts[i].exons[j].a &&
|
|
364 exons[k].end == transcripts[i].exons[j].b &&
|
|
365 exons[k].soft == soft )
|
|
366 {
|
|
367 flag = true ;
|
|
368 break ;
|
|
369 }
|
|
370 }
|
|
371
|
|
372 if ( !flag )
|
|
373 {
|
|
374 for ( k = cnt ; k > tag ; --k )
|
|
375 exons[k] = exons[k - 1] ;
|
|
376 strcpy( exons[k].chrom, transcripts[i].chrom ) ;
|
|
377 exons[k].start = transcripts[i].exons[j].a ;
|
|
378 exons[k].end = transcripts[i].exons[j].b ;
|
|
379 //exons[k].tindex = i ;
|
|
380 exons[k].soft = soft ;
|
|
381 exons[k].matched = false ;
|
|
382 ++cnt ;
|
|
383 }
|
|
384 }
|
|
385 }
|
|
386 return cnt ;
|
|
387 }
|
|
388
|
|
389 //chr1 HAVANA transcript 320162 324461 . + . gene_id "ENSG00000237094.6"; transcript_id "ENST00000423728.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP4-669L17.10"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP4-669L17.10-002"; level 2; havana_gene "OTTHUMG00000156968.4"; havana_transcript "OTTHUMT00000346879.1";
|
|
390 int main( int argc, char *argv[] )
|
|
391 {
|
|
392 FILE *fp ;
|
|
393 struct _transcript *ref, *pred ;
|
|
394 struct _info *refInfo, *predInfo ;
|
|
395
|
|
396 int refCnt, predCnt ;
|
|
397 char line[10000], filename[10000] ;
|
|
398 struct _pair tmpExons[10000] ;
|
|
399 int tmpECnt = 0 ;
|
|
400
|
|
401 char chrom[50], tool[20], type[40], strand[3] ;
|
|
402 char tid[50] ;
|
|
403 char buffer[50] ;
|
|
404 int start, end ;
|
|
405
|
|
406 int i, j, k, tag ;
|
|
407
|
|
408 if ( argc == 1 )
|
|
409 {
|
|
410 printf( "Compare the reference transcripts and predicted transcripts.\n"
|
|
411 "Format: ./grader ref.gtf prediction.gtf\n" ) ;
|
|
412 exit( 1 ) ;
|
|
413 }
|
|
414 // Parse the arguments
|
|
415 for ( i = 3 ; i < argc ; ++i )
|
|
416 {
|
|
417 if ( !strcmp( argv[i], "-M" ) )
|
|
418 flagMultiExonOnly = true ;
|
|
419 else if ( !strcmp( argv[i], "-ac" ) )
|
|
420 flagValidChr = false ;
|
|
421 else
|
|
422 {
|
|
423 printf( "Unknown argument: %s\n", argv[i] ) ;
|
|
424 exit( 1 ) ;
|
|
425 }
|
|
426 }
|
|
427
|
|
428 /*========================================================================================
|
|
429 Stage 1: Read the transcripts from the reference and prediction's GTF files.
|
|
430 ========================================================================================*/
|
|
431 fp = NULL ;
|
|
432 fp = fopen( argv[1], "r" ) ;
|
|
433 if ( fp == NULL )
|
|
434 {
|
|
435 printf( "Could not open file %s.\n", argv[1] ) ;
|
|
436 exit( 1 ) ;
|
|
437 }
|
|
438 refCnt = 0 ;
|
|
439 ref = ( struct _transcript *)malloc( MAX_TXPT * sizeof( struct _transcript ) ) ;
|
|
440
|
|
441 strcpy( ref[0].tid, "tmp_-1" ) ;
|
|
442 while ( fgets( line, sizeof( line ), fp ) != NULL )
|
|
443 {
|
|
444 if ( line[0] == '#' )
|
|
445 continue ;
|
|
446
|
|
447 sscanf( line, "%s %s %s %d %d %s %s", chrom, tool, type, &start, &end, buffer, strand ) ;
|
|
448
|
|
449 if ( strcmp( type, "exon" ) || !validChrom( chrom ) )
|
|
450 continue ;
|
|
451
|
|
452 char *p = strstr( line, "transcript_id" ) ;
|
|
453 for ( ; *p != ' ' ; ++p )
|
|
454 ;
|
|
455 p += 2 ;
|
|
456 sscanf( p, "%s", tid ) ;
|
|
457 //printf( "+%s %d\n", tid, strlen( tid ) ) ;
|
|
458 p = tid + strlen( tid ) ;
|
|
459 while ( *p != '\"' )
|
|
460 --p ;
|
|
461 *p = '\0' ;
|
|
462 //printf( "%s\n", tid ) ;
|
|
463 if ( strcmp( tid, ref[ refCnt ].tid ) && tmpECnt )
|
|
464 {
|
|
465 ReverseExonList( tmpExons, tmpECnt ) ;
|
|
466 tmpECnt = CleanExonList( tmpExons, tmpECnt ) ;
|
|
467 if ( tmpECnt > 1 || !flagMultiExonOnly )
|
|
468 {
|
|
469 ref[ refCnt ].ecnt = tmpECnt ;
|
|
470 ref[ refCnt ].exons = ( struct _pair * )malloc( sizeof( struct _pair ) * tmpECnt ) ;
|
|
471 memcpy( ref[ refCnt ].exons, tmpExons, sizeof( tmpExons[0] ) * tmpECnt ) ;
|
|
472 ++refCnt ;
|
|
473 }
|
|
474 tmpECnt = 0 ;
|
|
475 }
|
|
476
|
|
477 tmpExons[ tmpECnt ].a = start ;
|
|
478 tmpExons[ tmpECnt ].b = end ;
|
|
479 ++tmpECnt ;
|
|
480
|
|
481 ref[ refCnt ].strand = strand[0] ;
|
|
482 strcpy( ref[ refCnt ].chrom, chrom ) ;
|
|
483 strcpy( ref[ refCnt ].tid, tid ) ;
|
|
484 }
|
|
485 if ( tmpECnt != 0 )
|
|
486 {
|
|
487 ReverseExonList( tmpExons, tmpECnt ) ;
|
|
488 tmpECnt = CleanExonList( tmpExons, tmpECnt ) ;
|
|
489 if ( tmpECnt > 1 || !flagMultiExonOnly )
|
|
490 {
|
|
491 ref[ refCnt ].ecnt = tmpECnt ;
|
|
492 ref[ refCnt ].exons = ( struct _pair * )malloc( sizeof( struct _pair ) * tmpECnt ) ;
|
|
493 memcpy( ref[ refCnt ].exons, tmpExons, sizeof( tmpExons[0] ) * tmpECnt ) ;
|
|
494 ++refCnt ;
|
|
495 }
|
|
496 tmpECnt = 0 ;
|
|
497 }
|
|
498 fclose( fp ) ;
|
|
499
|
|
500 fp = NULL ;
|
|
501 fp = fopen( argv[2], "r" ) ;
|
|
502 if ( fp == NULL )
|
|
503 {
|
|
504 printf( "Could not open file %s.\n", argv[2] ) ;
|
|
505 exit( 1 ) ;
|
|
506 }
|
|
507 predCnt = 0 ;
|
|
508 pred = ( struct _transcript *)malloc( MAX_TXPT * sizeof( struct _transcript ) ) ;
|
|
509
|
|
510 strcpy( pred[0].tid, "tmp_-1" ) ;
|
|
511 tmpECnt = 0 ;
|
|
512 while ( fgets( line, sizeof( line ), fp ) != NULL )
|
|
513 {
|
|
514 if ( line[0] == '#' )
|
|
515 continue ;
|
|
516 sscanf( line, "%s %s %s %d %d %s %s", chrom, tool, type, &start, &end, buffer, strand ) ;
|
|
517
|
|
518 if ( strcmp( type, "exon" ) || !validChrom( chrom ) )
|
|
519 continue ;
|
|
520
|
|
521 char *p = strstr( line, "transcript_id" ) ;
|
|
522 //char *p = strstr( line, "gene_name" ) ;
|
|
523 for ( ; *p != ' ' ; ++p )
|
|
524 ;
|
|
525 p += 2 ;
|
|
526 sscanf( p, "%s", tid ) ;
|
|
527 //tid[ strlen( tid ) - 2 ] = '\0' ;
|
|
528 p = tid + strlen( tid ) ;
|
|
529 while ( *p != '\"' )
|
|
530 --p ;
|
|
531 *p = '\0' ;
|
|
532
|
|
533 if ( strcmp( tid, pred[ predCnt ].tid ) && tmpECnt )
|
|
534 {
|
|
535 ReverseExonList( tmpExons, tmpECnt ) ;
|
|
536 tmpECnt = CleanExonList( tmpExons, tmpECnt ) ;
|
|
537 if ( tmpECnt > 1 || !flagMultiExonOnly )
|
|
538 {
|
|
539 pred[ predCnt ].ecnt = tmpECnt ;
|
|
540 pred[ predCnt ].exons = ( struct _pair * )malloc( sizeof( struct _pair ) * tmpECnt ) ;
|
|
541 memcpy( pred[ predCnt ].exons, tmpExons, sizeof( tmpExons[0] ) * tmpECnt ) ;
|
|
542 ++predCnt ;
|
|
543 }
|
|
544 tmpECnt = 0 ;
|
|
545 }
|
|
546
|
|
547 tmpExons[ tmpECnt ].a = start ;
|
|
548 tmpExons[ tmpECnt ].b = end ;
|
|
549 ++tmpECnt ;
|
|
550
|
|
551 pred[ predCnt ].strand = strand[0] ;
|
|
552 strcpy( pred[ predCnt ].chrom, chrom ) ;
|
|
553 strcpy( pred[ predCnt ].tid, tid ) ;
|
|
554 }
|
|
555 if ( tmpECnt > 0 )
|
|
556 {
|
|
557 ReverseExonList( tmpExons, tmpECnt ) ;
|
|
558 tmpECnt = CleanExonList( tmpExons, tmpECnt ) ;
|
|
559 if ( tmpECnt > 1 || !flagMultiExonOnly )
|
|
560 {
|
|
561 pred[ predCnt ].ecnt = tmpECnt ;
|
|
562 pred[ predCnt ].exons = ( struct _pair * )malloc( sizeof( struct _pair ) * tmpECnt ) ;
|
|
563 memcpy( pred[ predCnt ].exons, tmpExons, sizeof( tmpExons[0] ) * tmpECnt ) ;
|
|
564 ++predCnt ;
|
|
565 }
|
|
566 tmpECnt = 0 ;
|
|
567 }
|
|
568
|
|
569 refCnt = RemoveDuplicateIntronChain( ref, refCnt ) ;
|
|
570 //printf( "predCnt = %d\n", predCnt ) ;
|
|
571 predCnt = RemoveDuplicateIntronChain( pred, predCnt ) ;
|
|
572 //printf( "predCnt = %d\n", predCnt ) ;
|
|
573 qsort( ref, refCnt, sizeof( ref[0] ), TranscriptComp ) ;
|
|
574 qsort( pred, predCnt, sizeof( pred[0] ), TranscriptComp ) ;
|
|
575
|
|
576
|
|
577 /*========================================================================================
|
|
578 Stage 2: Compute the recall and precision.
|
|
579 ========================================================================================*/
|
|
580 refInfo = ( struct _info * )malloc( refCnt * sizeof( *refInfo ) ) ;
|
|
581 predInfo = ( struct _info * )malloc( predCnt * sizeof( *predInfo ) ) ;
|
|
582 for ( i = 0 ; i < refCnt ; ++i )
|
|
583 {
|
|
584 refInfo[i].coverage = -1 ;
|
|
585 refInfo[i].byId = -1 ;
|
|
586 #if DEBUG
|
|
587 printf( "=%d %s %s %d\n", i, ref[i].chrom, ref[i].tid, ref[i].ecnt ) ;
|
|
588 for ( j = 0 ; j < ref[i].ecnt ; ++j )
|
|
589 printf( "%d %d\n", ref[i].exons[j].a, ref[i].exons[j].b ) ;
|
|
590 #endif
|
|
591
|
|
592 }
|
|
593 for ( i = 0 ; i < predCnt ; ++i )
|
|
594 {
|
|
595 predInfo[i].coverage = -1 ;
|
|
596 predInfo[i].byId = -1 ;
|
|
597 #if DEBUG
|
|
598 printf( "#%d %s %s %d\n", i, pred[i].chrom, pred[i].tid, pred[i].ecnt ) ;
|
|
599 for ( j = 0 ; j < pred[i].ecnt ; ++j )
|
|
600 printf( "%d %d\n", pred[i].exons[j].a, pred[i].exons[j].b ) ;
|
|
601 #endif
|
|
602 }
|
|
603 tag = 0 ;
|
|
604 for ( i = 0 ; i < predCnt ; ++i )
|
|
605 {
|
|
606 while ( tag < refCnt &&
|
|
607 ( ( strcmp( ref[tag].chrom, pred[i].chrom ) < 0 ) ||
|
|
608 ( !strcmp( ref[tag].chrom, pred[i].chrom ) &&
|
|
609 ref[tag].exons[ ref[tag].ecnt - 1 ].b < pred[i].exons[0].a - relaxWidth ) ) )
|
|
610 ++tag ;
|
|
611
|
|
612 // if ( i == 16474 )
|
|
613 // printf( "%d %d %d", i, tag, refCnt ) ;
|
|
614 for ( j = tag ; j < refCnt && ref[j].exons[0].a <= pred[i].exons[ pred[i].ecnt - 1 ].b + relaxWidth && !strcmp( ref[j].chrom, pred[i].chrom ); ++j )
|
|
615 {
|
|
616 /*if ( !strcmp( pred[i].tid, "CUFF.4256.1" ) )
|
|
617 {
|
|
618 printf( "%s ? %s\n", pred[i].tid, ref[j].tid ) ;
|
|
619 }*/
|
|
620 // if ( i == 16474 )
|
|
621 // {
|
|
622 // printf( "%d %d\n", i, j ) ;
|
|
623 // }
|
|
624 double coverage = CompareTranscripts( ref[j], pred[i] ) ;
|
|
625 if ( coverage > refInfo[j].coverage )
|
|
626 {
|
|
627 refInfo[j].coverage = coverage ;
|
|
628 refInfo[j].byId = i ;
|
|
629 }
|
|
630 if ( coverage > predInfo[i].coverage )
|
|
631 {
|
|
632 predInfo[i].coverage = coverage ;
|
|
633 predInfo[i].byId = j ;
|
|
634 }
|
|
635 }
|
|
636 }
|
|
637
|
|
638 /*========================================================================================
|
|
639 Stage 3: Dump the information into output files.
|
|
640 ========================================================================================*/
|
|
641 //sprintf( filename, "grader.%s.recall", argv[1] ) ;
|
|
642 sprintf( filename, "grader.recall" ) ;
|
|
643 fp = fopen( filename, "w" ) ;
|
|
644 for ( i = 0 ; i < refCnt ; ++i )
|
|
645 {
|
|
646 fprintf( fp, "%s\t%d\t%lf\t%s\n", ref[i].tid, ref[i].ecnt, refInfo[i].coverage,
|
|
647 refInfo[i].byId == -1 ? "-" : pred[ refInfo[i].byId ].tid ) ;
|
|
648 }
|
|
649 fclose( fp ) ;
|
|
650
|
|
651 //sprintf( filename, "grader.%s.precision", argv[2] ) ;
|
|
652 sprintf( filename, "grader.precision" ) ;
|
|
653 fp = fopen( filename, "w" ) ;
|
|
654 for ( i = 0 ; i < predCnt ; ++i )
|
|
655 {
|
|
656 fprintf( fp, "%s\t%d\t%lf\t%s\n", pred[i].tid, pred[i].ecnt, predInfo[i].coverage,
|
|
657 predInfo[i].byId == -1 ? "-" : ref[predInfo[i].byId].tid ) ;
|
|
658 }
|
|
659 fclose( fp ) ;
|
|
660
|
|
661 // print the summary to the stdout
|
|
662 const int binCnt = 20 ;
|
|
663 int bins[binCnt + 1] ;
|
|
664 memset( bins, 0, sizeof( bins ) ) ;
|
|
665 k = 0 ;
|
|
666 for ( i = 0 ; i < refCnt ; ++i )
|
|
667 {
|
|
668 if ( refInfo[i].coverage == -1 )
|
|
669 continue ;
|
|
670 ++bins[ (int)( refInfo[i].coverage * binCnt ) ] ;
|
|
671 ++k ;
|
|
672 }
|
|
673 for ( i = 0 ; i <= binCnt ; ++i )
|
|
674 {
|
|
675 printf( "recall %lf %lf %d/%d\n", (double)i / binCnt, (double)k / refCnt, k, refCnt ) ;
|
|
676 k -= bins[i] ;
|
|
677 }
|
|
678
|
|
679 memset( bins, 0, sizeof( bins ) ) ;
|
|
680 k = 0 ;
|
|
681 for ( i = 0 ; i < predCnt ; ++i )
|
|
682 {
|
|
683 if ( predInfo[i].coverage == -1 )
|
|
684 continue ;
|
|
685 ++bins[ (int)( predInfo[i].coverage * binCnt ) ] ;
|
|
686 ++k ;
|
|
687 }
|
|
688 for ( i = 0 ; i <= binCnt ; ++i )
|
|
689 {
|
|
690 printf( "precision %lf %lf %d/%d\n", (double)i / binCnt, (double)k / predCnt, k, predCnt ) ;
|
|
691 k -= bins[i] ;
|
|
692 }
|
|
693
|
|
694 /*========================================================================================
|
|
695 Stage 4: Evaluations for introns.
|
|
696 ========================================================================================*/
|
|
697 struct _intron *refIntrons = ( struct _intron * )malloc( sizeof( struct _intron ) * MAX_TXPT ) ;
|
|
698 int riCnt = GetIntrons( ref, refCnt, refIntrons ) ;
|
|
699 struct _intron *predIntrons = ( struct _intron * )malloc( sizeof( struct _intron ) * MAX_TXPT ) ;
|
|
700 int piCnt = GetIntrons( pred, predCnt, predIntrons ) ;
|
|
701 int matchedIntron = 0 ;
|
|
702 /*for ( i = 0 ; i < riCnt ; ++i )
|
|
703 {
|
|
704 printf( "%d %s %d %d\n", i, refIntrons[i].chrom, refIntrons[i].start, refIntrons[i].end ) ;
|
|
705 }*/
|
|
706
|
|
707 fp = fopen( "grader.intron", "w" ) ;
|
|
708 tag = 0 ;
|
|
709 for ( i = 0 ; i < piCnt ; ++i )
|
|
710 {
|
|
711 bool flag = false ;
|
|
712 while ( 1 )
|
|
713 {
|
|
714 if ( tag >= riCnt )
|
|
715 break ;
|
|
716 int tmp = strcmp( refIntrons[tag].chrom, predIntrons[i].chrom ) ;
|
|
717 if ( tmp < 0 || ( tmp == 0 && refIntrons[tag].start < predIntrons[i].start ) )
|
|
718 ++tag ;
|
|
719 else
|
|
720 break ;
|
|
721 }
|
|
722 //printf( "%d (%s %d %d) %d=>", i, predIntrons[i].chrom, predIntrons[i].start, predIntrons[i].end, tag ) ;
|
|
723 /*if ( predIntrons[i].start == 1613457 )
|
|
724 printf( "%d %d\n", tag, riCnt ) ;*/
|
|
725 for ( j = tag ; j < riCnt ; ++j )
|
|
726 {
|
|
727 if ( strcmp( refIntrons[j].chrom, predIntrons[i].chrom ) > 0 ||
|
|
728 refIntrons[j].start > predIntrons[i].start /*+ 10*/ )
|
|
729 break ;
|
|
730 if ( refIntrons[j].start == predIntrons[i].start &&
|
|
731 refIntrons[j].end == predIntrons[i].end )
|
|
732 //if ( WithinRange( refIntrons[j].start, predIntrons[i].start ) &&
|
|
733 // WithinRange( refIntrons[j].end, predIntrons[i].end ) )
|
|
734 {
|
|
735 ++matchedIntron ;
|
|
736 flag = true ;
|
|
737 break ;
|
|
738 }
|
|
739 }
|
|
740 //printf( "%d\n", j ) ;
|
|
741 if ( flag )
|
|
742 fprintf( fp, "Y\t%s\t%d\t%d\t%s\n", predIntrons[i].chrom, predIntrons[i].start, predIntrons[i].end, pred[ predIntrons[i].tindex ].tid ) ;
|
|
743 else
|
|
744 fprintf( fp, "N\t%s\t%d\t%d\t%s\n", predIntrons[i].chrom, predIntrons[i].start, predIntrons[i].end, pred[ predIntrons[i].tindex ].tid ) ;
|
|
745 }
|
|
746 fclose( fp ) ;
|
|
747 printf( "\n" ) ;
|
|
748 printf( "Intron evaluation\n") ;
|
|
749 //printf( "\tmatched\t%d\n", matchedIntron ) ;
|
|
750 printf( "\trecall\t%lf\t%d/%d\n", matchedIntron / (double)riCnt, matchedIntron, riCnt ) ;
|
|
751 printf( "\tprecision\t%lf\t%d/%d\n", matchedIntron / (double)piCnt, matchedIntron, piCnt ) ;
|
|
752
|
|
753 /*========================================================================================
|
|
754 Stage 4: Evaluations for exons.
|
|
755 ========================================================================================*/
|
|
756 struct _exon *refExons = ( struct _exon * )malloc( sizeof( struct _exon ) * MAX_TXPT ) ;
|
|
757 int reCnt = GetExons( ref, refCnt, refExons ) ;
|
|
758 struct _exon *predExons = ( struct _exon * )malloc( sizeof( struct _exon ) * MAX_TXPT ) ;
|
|
759 int peCnt = GetExons( pred, predCnt, predExons ) ;
|
|
760 int matchedExons = 0, matchedInternalExons = 0 ;
|
|
761 int refInternalExons = 0, predInternalExons = 0 ;
|
|
762
|
|
763 for ( i = 0 ; i < reCnt ; ++i )
|
|
764 {
|
|
765 //printf( "%d %d\n", refExons[i].start, refExons[i].end ) ;
|
|
766 if ( refExons[i].soft == 0 )
|
|
767 ++refInternalExons ;
|
|
768 //if ( refExons[i].soft == 3 )
|
|
769 // printf( "hi\n" ) ;
|
|
770 }
|
|
771 for ( i = 0 ; i < peCnt ; ++i )
|
|
772 if ( predExons[i].soft == 0 )
|
|
773 ++predInternalExons ;
|
|
774 tag = 0 ;
|
|
775 for ( i = 0 ; i < peCnt ; ++i )
|
|
776 {
|
|
777 bool flag = false ;
|
|
778 while ( 1 )
|
|
779 {
|
|
780 if ( tag >= reCnt )
|
|
781 break ;
|
|
782 int tmp = strcmp( refExons[tag].chrom, predExons[i].chrom ) ;
|
|
783 if ( tmp < 0 || ( tmp == 0 && refExons[tag].end < predExons[i].start ) )
|
|
784 ++tag ;
|
|
785 else
|
|
786 break ;
|
|
787 }
|
|
788 for ( j = tag ; j < reCnt ; ++j )
|
|
789 {
|
|
790 if ( strcmp( refExons[j].chrom, predExons[i].chrom ) > 0 ||
|
|
791 refExons[j].start > predExons[i].end /*+ 10*/ )
|
|
792 break ;
|
|
793 if ( refExons[j].soft != predExons[i].soft
|
|
794 || refExons[j].end < predExons[i].start )
|
|
795 continue ;
|
|
796
|
|
797 if ( refExons[j].start == predExons[i].start &&
|
|
798 refExons[j].end == predExons[i].end && predExons[i].soft == 0 )
|
|
799 //if ( WithinRange( refIntrons[j].start, predIntrons[i].start ) &&
|
|
800 // WithinRange( refIntrons[j].end, predIntrons[i].end ) )
|
|
801 {
|
|
802 refExons[j].matched = true ;
|
|
803 predExons[i].matched = true ;
|
|
804 ++matchedInternalExons ;
|
|
805 flag = true ;
|
|
806 break ;
|
|
807 }
|
|
808 else if ( ( refExons[j].start == predExons[i].start || ( predExons[i].soft & 2 ) )
|
|
809 && ( refExons[j].end == predExons[i].end || (predExons[i].soft & 1 ) ) )
|
|
810 {
|
|
811 refExons[j].matched = true ;
|
|
812 predExons[i].matched = true ;
|
|
813 flag = true ;
|
|
814 //break ;
|
|
815 }
|
|
816 }
|
|
817 //printf( "%d\n", j ) ;
|
|
818 }
|
|
819
|
|
820 printf( "\n" ) ;
|
|
821 printf( "Exon evaluation\n" ) ;
|
|
822 //printf( "\tmatched\t%d\n", matchedExons ) ;
|
|
823 matchedExons = 0 ;
|
|
824 for ( i = 0 ; i < reCnt ; ++i )
|
|
825 if ( refExons[i].matched )
|
|
826 ++matchedExons ;
|
|
827 printf( "\trecall\t%lf\t%d/%d\n", (double)matchedExons / reCnt, matchedExons, reCnt ) ;
|
|
828 matchedExons = 0 ;
|
|
829 for ( i = 0 ; i < peCnt ; ++i )
|
|
830 if ( predExons[i].matched )
|
|
831 ++matchedExons ;
|
|
832 printf( "\tprecision\t%lf\t%d/%d\n", (double)matchedExons / peCnt, matchedExons, peCnt ) ;
|
|
833
|
|
834 printf( "\n" ) ;
|
|
835 printf( "Internal exon evaluation\n" ) ;
|
|
836 //printf( "\tmatched\t%d\n", matchedInternalExons ) ;
|
|
837 printf( "\trecall\t%lf\t%d/%d\n", (double)matchedInternalExons / refInternalExons, matchedInternalExons, refInternalExons ) ;
|
|
838 printf( "\tprecision\t%lf\t%d/%d\n", (double)matchedInternalExons / predInternalExons, matchedInternalExons, predInternalExons ) ;
|
|
839 return 0 ;
|
|
840 }
|