Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/grader.cpp @ 0:903fc43d6227 draft default tip
Uploaded
| author | lsong10 | 
|---|---|
| date | Fri, 26 Mar 2021 16:52:45 +0000 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| -1:000000000000 | 0:903fc43d6227 | 
|---|---|
| 1 // Compare the reference and the predictions | |
| 2 // Format: ./a.out ref.gtf prediction.gtf | |
| 3 #include <stdio.h> | |
| 4 #include <string.h> | |
| 5 #include <stdlib.h> | |
| 6 | |
| 7 #define MAX_TXPT 2000000 | |
| 8 | |
| 9 #define DEBUG 0 | |
| 10 | |
| 11 bool flagMultiExonOnly = false ; | |
| 12 bool flagValidChr = true ; | |
| 13 int relaxWidth = 20 ; | |
| 14 | |
| 15 struct _pair | |
| 16 { | |
| 17 int a, b ; | |
| 18 } ; | |
| 19 | |
| 20 struct _transcript | |
| 21 { | |
| 22 char tid[50] ; | |
| 23 char chrom[50] ; | |
| 24 char strand ; | |
| 25 struct _pair *exons ; | |
| 26 int ecnt ; | |
| 27 } ; | |
| 28 | |
| 29 struct _info | |
| 30 { | |
| 31 double coverage ; | |
| 32 int byId ; // It gets this coverage by comparing with byId. | |
| 33 } ; | |
| 34 | |
| 35 struct _intron | |
| 36 { | |
| 37 char chrom[50] ; | |
| 38 int start, end ; | |
| 39 int tindex ; // The index to the corresponding transcripts | |
| 40 } ; | |
| 41 | |
| 42 struct _exon | |
| 43 { | |
| 44 char chrom[50] ; | |
| 45 int start, end ; | |
| 46 //int tindex ; | |
| 47 int soft ; // left-bit: left side is soft, right-bit: right side is soft | |
| 48 bool matched ; | |
| 49 } ; | |
| 50 | |
| 51 int TranscriptComp( const void *p1, const void *p2 ) | |
| 52 { | |
| 53 const struct _transcript *a = (struct _transcript *)p1 ; | |
| 54 const struct _transcript *b = ( struct _transcript *)p2 ; | |
| 55 int tmp = strcmp( a->chrom, b->chrom ) ; | |
| 56 if ( tmp != 0 ) | |
| 57 return tmp ; | |
| 58 return a->exons[0].a - b->exons[0].a ; | |
| 59 } | |
| 60 | |
| 61 int TranscriptComp_ByIntron( const void *p1, const void *p2 ) | |
| 62 { | |
| 63 const struct _transcript *a = (struct _transcript *)p1 ; | |
| 64 const struct _transcript *b = ( struct _transcript *)p2 ; | |
| 65 if ( a->strand != b->strand ) | |
| 66 return a->strand - b->strand ; | |
| 67 int tmp = strcmp( a->chrom, b->chrom ) ; | |
| 68 if ( tmp != 0 ) | |
| 69 return tmp ; | |
| 70 if ( a->ecnt != b->ecnt ) | |
| 71 return a->ecnt - b->ecnt ; | |
| 72 int i ; | |
| 73 for ( i = 0 ; i < a->ecnt - 1 ; ++i ) | |
| 74 { | |
| 75 if ( a->exons[i].b != b->exons[i].b ) | |
| 76 return a->exons[i].b - b->exons[i].b ; | |
| 77 if ( a->exons[i + 1].a != b->exons[i + 1].a ) | |
| 78 return a->exons[i + 1].a - b->exons[i + 1].a ; | |
| 79 } | |
| 80 | |
| 81 return 0 ; //strcmp( a->tid, b->tid ) ; | |
| 82 } | |
| 83 | |
| 84 bool validChrom( char *chrom ) | |
| 85 { | |
| 86 if ( !flagValidChr ) | |
| 87 return true ; | |
| 88 if ( chrom[0] != 'c' || chrom[1] != 'h' || chrom[2] != 'r' ) | |
| 89 return false ; | |
| 90 // only consider chr1-22,x,y,z | |
| 91 if ( chrom[3]=='x' || chrom[3]=='X' | |
| 92 || chrom[3] == 'y' || chrom[3] == 'Y' | |
| 93 || chrom[3] == 'm' || chrom[3] == 'M' | |
| 94 || ( chrom[3] >= '3' && chrom[3] <= '9' ) ) | |
| 95 { | |
| 96 if ( chrom[4] == '\0' ) | |
| 97 return true ; | |
| 98 else | |
| 99 return false ; | |
| 100 } | |
| 101 if ( chrom[3] == '1' ) | |
| 102 { | |
| 103 if ( chrom[4] == '\0' ) | |
| 104 return true ; | |
| 105 if ( chrom[4] >= '0' && chrom[4] <= '9' && chrom[5] == '\0' ) | |
| 106 return true ; | |
| 107 return false ; | |
| 108 } | |
| 109 if ( chrom[3] == '2' ) | |
| 110 { | |
| 111 if ( chrom[4] == '\0' ) | |
| 112 return true ; | |
| 113 if ( chrom[4] >= '0' && chrom[4] <= '2' && chrom[5] == '\0' ) | |
| 114 return true ; | |
| 115 return false ; | |
| 116 } | |
| 117 return false ; | |
| 118 } | |
| 119 | |
| 120 void ReverseExonList( struct _pair *exons, int ecnt ) | |
| 121 { | |
| 122 if ( ecnt < 2 || ( exons[0].a < exons[1].a ) ) | |
| 123 return ; | |
| 124 int i, j ; | |
| 125 struct _pair tmp ; | |
| 126 | |
| 127 for ( i = 0, j = ecnt - 1 ; i < j ; ++i, --j ) | |
| 128 { | |
| 129 tmp = exons[i] ; | |
| 130 exons[i] = exons[j] ; | |
| 131 exons[j] = tmp ; | |
| 132 } | |
| 133 } | |
| 134 | |
| 135 /** | |
| 136 Merge exons that are next to each other. Showed up in scripture. | |
| 137 */ | |
| 138 int CleanExonList( struct _pair *exons, int ecnt ) | |
| 139 { | |
| 140 int i, k ; | |
| 141 for ( i = 0 ; i < ecnt - 1 ; ++i ) | |
| 142 if ( exons[i + 1].a - exons[i].b - 1 < 20 ) | |
| 143 { | |
| 144 exons[i + 1].a = exons[i].a ; | |
| 145 exons[i].a = -1 ; | |
| 146 } | |
| 147 k = 0 ; | |
| 148 for ( i = 0 ; i < ecnt ; ++i ) | |
| 149 { | |
| 150 if ( exons[i].a == -1 ) | |
| 151 continue ; | |
| 152 exons[k] = exons[i] ; | |
| 153 ++k ; | |
| 154 } | |
| 155 return k ; | |
| 156 } | |
| 157 | |
| 158 /** | |
| 159 Remove the duplicated intron chain in the list | |
| 160 */ | |
| 161 int RemoveDuplicateIntronChain( struct _transcript *t, int tcnt ) | |
| 162 { | |
| 163 int i, j, k ; | |
| 164 | |
| 165 qsort( t, tcnt, sizeof( *t ), TranscriptComp_ByIntron ) ; | |
| 166 for ( i = 1 ; i < tcnt ; ++i ) | |
| 167 { | |
| 168 k = i - 1 ; | |
| 169 while ( t[k].ecnt == -1 ) | |
| 170 --k ; | |
| 171 | |
| 172 if ( t[i].ecnt != t[k].ecnt || strcmp( t[i].chrom, t[k].chrom ) || t[i].strand != t[k].strand ) | |
| 173 continue ; | |
| 174 for ( j = 0 ; j < t[i].ecnt - 1 ; ++j ) | |
| 175 if ( t[i].exons[j].b != t[k].exons[j].b || | |
| 176 t[i].exons[j + 1].a != t[k].exons[j + 1].a ) | |
| 177 break ; | |
| 178 if ( j >= t[i].ecnt - 1 && t[i].ecnt != 1 ) | |
| 179 t[i].ecnt = -1 ; | |
| 180 if ( t[i].ecnt == 1 && t[i].exons[0].a == t[k].exons[0].a && | |
| 181 t[i].exons[0].b == t[k].exons[0].b ) | |
| 182 t[i].ecnt = -1 ; | |
| 183 /*if ( t[i].ecnt == -1 ) | |
| 184 { | |
| 185 printf( "%s <- %s\n", t[i].tid, t[k].tid ) ; | |
| 186 }*/ | |
| 187 } | |
| 188 | |
| 189 k = 0 ; | |
| 190 for ( i = 0 ; i < tcnt ; ++i ) | |
| 191 { | |
| 192 if ( t[i].ecnt == -1 ) | |
| 193 { | |
| 194 free( t[i].exons ) ; | |
| 195 continue ; | |
| 196 } | |
| 197 t[k] = t[i] ; | |
| 198 ++k ; | |
| 199 } | |
| 200 tcnt = k ; | |
| 201 return tcnt ; | |
| 202 } | |
| 203 | |
| 204 | |
| 205 double CompareTranscripts( const struct _transcript &ref, const struct _transcript &pred ) | |
| 206 { | |
| 207 int i, j ; | |
| 208 struct _pair *refExons = ref.exons ; | |
| 209 int refECnt = ref.ecnt ; | |
| 210 struct _pair *predExons = pred.exons ; | |
| 211 int predECnt = pred.ecnt ; | |
| 212 | |
| 213 // Prediction must be a "subset" of the reference | |
| 214 if ( refECnt < predECnt ) | |
| 215 return -1 ; | |
| 216 | |
| 217 // single exon case | |
| 218 if ( refECnt == 1 ) | |
| 219 { | |
| 220 if ( refExons[0].b < predExons[0].a || refExons[0].a > predExons[0].b ) | |
| 221 return -1 ; | |
| 222 else | |
| 223 return 1 ; | |
| 224 } | |
| 225 | |
| 226 if ( predECnt == 1 ) | |
| 227 { | |
| 228 for ( i = 0 ; i < refECnt ; ++i ) | |
| 229 { | |
| 230 if ( refExons[i].b < predExons[0].a || refExons[i].a > predExons[0].b ) | |
| 231 continue ; | |
| 232 else | |
| 233 { | |
| 234 /*if ( i == 0 && predExons[0].a >= refExons[0].a - relaxWidth ) | |
| 235 return 0 ; | |
| 236 else if ( i == refECnt - 1 && predExons[0].b <= refExons[i].b + relaxWidth ) | |
| 237 return 0 ; | |
| 238 else if ( i > 0 && i < refECnt - 1 && predExons[0].a >= refExons[i].a - relaxWidth && | |
| 239 predExons[0].b <= refExons[i].b + relaxWidth ) | |
| 240 return 0 ; | |
| 241 | |
| 242 return -1 ;*/ | |
| 243 return 0 ; | |
| 244 } | |
| 245 } | |
| 246 return -1 ; | |
| 247 } | |
| 248 | |
| 249 if ( predECnt > 0 && ref.strand != pred.strand ) | |
| 250 return -1 ; | |
| 251 | |
| 252 // Test the intron chain | |
| 253 for ( i = 0 ; i < refECnt - 1 ; ++i ) | |
| 254 if ( refExons[i].b == predExons[0].b ) | |
| 255 break ; | |
| 256 if ( i >= refECnt - 1 ) | |
| 257 return -1 ; | |
| 258 //if ( i != 0 && predExons[0].a < refExons[i].a - relaxWidth ) | |
| 259 // return -1 ; | |
| 260 | |
| 261 for ( j = 0 ; i < refECnt - 1 && j < predECnt - 1 ; ++i, ++j ) | |
| 262 { | |
| 263 if ( refExons[i].b != predExons[j].b || refExons[i + 1].a != predExons[j + 1].a ) | |
| 264 break ; | |
| 265 } | |
| 266 if ( j >= predECnt - 1 ) | |
| 267 { | |
| 268 /*if ( i == refECnt - 1 || | |
| 269 predExons[ predECnt - 1 ].b <= refExons[i].b + relaxWidth ) | |
| 270 return (double)( predECnt - 1 ) / (double)( refECnt - 1 ) ; | |
| 271 else | |
| 272 return -1 ;*/ | |
| 273 return (double)( predECnt - 1 ) / (double)( refECnt - 1 ) ; | |
| 274 } | |
| 275 else | |
| 276 return -1 ; | |
| 277 } | |
| 278 | |
| 279 bool WithinRange( int a, int b, int w = 10 ) | |
| 280 { | |
| 281 if ( a - b >= -w && a - b <= w ) | |
| 282 return true ; | |
| 283 return false ; | |
| 284 } | |
| 285 | |
| 286 int GetIntrons( struct _transcript *transcripts, int tcnt, struct _intron *introns ) | |
| 287 { | |
| 288 int i, j, k, tag ; | |
| 289 int cnt = 0 ; | |
| 290 for ( i = 0 ; i < tcnt ; ++i ) | |
| 291 { | |
| 292 for ( j = 0 ; j < transcripts[i].ecnt - 1 ; ++j ) | |
| 293 { | |
| 294 bool flag = false ; | |
| 295 | |
| 296 tag = 0 ; | |
| 297 for ( k = cnt - 1 ; k >= 0 ; --k ) | |
| 298 { | |
| 299 if ( strcmp( introns[k].chrom, transcripts[i].chrom ) ) | |
| 300 { | |
| 301 //printf( "hi\n" ) ; | |
| 302 tag = k + 1 ; | |
| 303 break ; | |
| 304 } | |
| 305 if ( introns[k].start < transcripts[i].exons[j].b ) | |
| 306 { | |
| 307 tag = k + 1 ; | |
| 308 break ; | |
| 309 } | |
| 310 else if ( introns[k].start == transcripts[i].exons[j].b && | |
| 311 introns[k].end == transcripts[i].exons[j + 1].a ) | |
| 312 { | |
| 313 flag = true ; | |
| 314 break ; | |
| 315 } | |
| 316 } | |
| 317 | |
| 318 if ( !flag ) | |
| 319 { | |
| 320 for ( k = cnt ; k > tag ; --k ) | |
| 321 introns[k] = introns[k - 1] ; | |
| 322 strcpy( introns[k].chrom, transcripts[i].chrom ) ; | |
| 323 introns[k].start = transcripts[i].exons[j].b ; | |
| 324 introns[k].end = transcripts[i].exons[j + 1].a ; | |
| 325 introns[k].tindex = i ; | |
| 326 ++cnt ; | |
| 327 } | |
| 328 } | |
| 329 } | |
| 330 return cnt ; | |
| 331 } | |
| 332 | |
| 333 int GetExons( struct _transcript *transcripts, int tcnt, struct _exon *exons ) | |
| 334 { | |
| 335 int i, j, k, tag ; | |
| 336 int cnt = 0 ; | |
| 337 for ( i = 0 ; i < tcnt ; ++i ) | |
| 338 { | |
| 339 for ( j = 0 ; j < transcripts[i].ecnt ; ++j ) | |
| 340 { | |
| 341 bool flag = false ; | |
| 342 int soft = 0 ; | |
| 343 if ( j == 0 ) | |
| 344 soft = soft | 2 ; | |
| 345 if ( j == transcripts[i].ecnt - 1 ) | |
| 346 soft = soft | 1 ; | |
| 347 | |
| 348 tag = 0 ; | |
| 349 for ( k = cnt - 1 ; k >= 0 ; --k ) | |
| 350 { | |
| 351 if ( strcmp( exons[k].chrom, transcripts[i].chrom ) ) | |
| 352 { | |
| 353 //printf( "hi\n" ) ; | |
| 354 tag = k + 1 ; | |
| 355 break ; | |
| 356 } | |
| 357 | |
| 358 if ( exons[k].start < transcripts[i].exons[j].a ) | |
| 359 { | |
| 360 tag = k + 1 ; | |
| 361 break ; | |
| 362 } | |
| 363 else if ( exons[k].start == transcripts[i].exons[j].a && | |
| 364 exons[k].end == transcripts[i].exons[j].b && | |
| 365 exons[k].soft == soft ) | |
| 366 { | |
| 367 flag = true ; | |
| 368 break ; | |
| 369 } | |
| 370 } | |
| 371 | |
| 372 if ( !flag ) | |
| 373 { | |
| 374 for ( k = cnt ; k > tag ; --k ) | |
| 375 exons[k] = exons[k - 1] ; | |
| 376 strcpy( exons[k].chrom, transcripts[i].chrom ) ; | |
| 377 exons[k].start = transcripts[i].exons[j].a ; | |
| 378 exons[k].end = transcripts[i].exons[j].b ; | |
| 379 //exons[k].tindex = i ; | |
| 380 exons[k].soft = soft ; | |
| 381 exons[k].matched = false ; | |
| 382 ++cnt ; | |
| 383 } | |
| 384 } | |
| 385 } | |
| 386 return cnt ; | |
| 387 } | |
| 388 | |
| 389 //chr1 HAVANA transcript 320162 324461 . + . gene_id "ENSG00000237094.6"; transcript_id "ENST00000423728.1"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP4-669L17.10"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP4-669L17.10-002"; level 2; havana_gene "OTTHUMG00000156968.4"; havana_transcript "OTTHUMT00000346879.1"; | |
| 390 int main( int argc, char *argv[] ) | |
| 391 { | |
| 392 FILE *fp ; | |
| 393 struct _transcript *ref, *pred ; | |
| 394 struct _info *refInfo, *predInfo ; | |
| 395 | |
| 396 int refCnt, predCnt ; | |
| 397 char line[10000], filename[10000] ; | |
| 398 struct _pair tmpExons[10000] ; | |
| 399 int tmpECnt = 0 ; | |
| 400 | |
| 401 char chrom[50], tool[20], type[40], strand[3] ; | |
| 402 char tid[50] ; | |
| 403 char buffer[50] ; | |
| 404 int start, end ; | |
| 405 | |
| 406 int i, j, k, tag ; | |
| 407 | |
| 408 if ( argc == 1 ) | |
| 409 { | |
| 410 printf( "Compare the reference transcripts and predicted transcripts.\n" | |
| 411 "Format: ./grader ref.gtf prediction.gtf\n" ) ; | |
| 412 exit( 1 ) ; | |
| 413 } | |
| 414 // Parse the arguments | |
| 415 for ( i = 3 ; i < argc ; ++i ) | |
| 416 { | |
| 417 if ( !strcmp( argv[i], "-M" ) ) | |
| 418 flagMultiExonOnly = true ; | |
| 419 else if ( !strcmp( argv[i], "-ac" ) ) | |
| 420 flagValidChr = false ; | |
| 421 else | |
| 422 { | |
| 423 printf( "Unknown argument: %s\n", argv[i] ) ; | |
| 424 exit( 1 ) ; | |
| 425 } | |
| 426 } | |
| 427 | |
| 428 /*======================================================================================== | |
| 429 Stage 1: Read the transcripts from the reference and prediction's GTF files. | |
| 430 ========================================================================================*/ | |
| 431 fp = NULL ; | |
| 432 fp = fopen( argv[1], "r" ) ; | |
| 433 if ( fp == NULL ) | |
| 434 { | |
| 435 printf( "Could not open file %s.\n", argv[1] ) ; | |
| 436 exit( 1 ) ; | |
| 437 } | |
| 438 refCnt = 0 ; | |
| 439 ref = ( struct _transcript *)malloc( MAX_TXPT * sizeof( struct _transcript ) ) ; | |
| 440 | |
| 441 strcpy( ref[0].tid, "tmp_-1" ) ; | |
| 442 while ( fgets( line, sizeof( line ), fp ) != NULL ) | |
| 443 { | |
| 444 if ( line[0] == '#' ) | |
| 445 continue ; | |
| 446 | |
| 447 sscanf( line, "%s %s %s %d %d %s %s", chrom, tool, type, &start, &end, buffer, strand ) ; | |
| 448 | |
| 449 if ( strcmp( type, "exon" ) || !validChrom( chrom ) ) | |
| 450 continue ; | |
| 451 | |
| 452 char *p = strstr( line, "transcript_id" ) ; | |
| 453 for ( ; *p != ' ' ; ++p ) | |
| 454 ; | |
| 455 p += 2 ; | |
| 456 sscanf( p, "%s", tid ) ; | |
| 457 //printf( "+%s %d\n", tid, strlen( tid ) ) ; | |
| 458 p = tid + strlen( tid ) ; | |
| 459 while ( *p != '\"' ) | |
| 460 --p ; | |
| 461 *p = '\0' ; | |
| 462 //printf( "%s\n", tid ) ; | |
| 463 if ( strcmp( tid, ref[ refCnt ].tid ) && tmpECnt ) | |
| 464 { | |
| 465 ReverseExonList( tmpExons, tmpECnt ) ; | |
| 466 tmpECnt = CleanExonList( tmpExons, tmpECnt ) ; | |
| 467 if ( tmpECnt > 1 || !flagMultiExonOnly ) | |
| 468 { | |
| 469 ref[ refCnt ].ecnt = tmpECnt ; | |
| 470 ref[ refCnt ].exons = ( struct _pair * )malloc( sizeof( struct _pair ) * tmpECnt ) ; | |
| 471 memcpy( ref[ refCnt ].exons, tmpExons, sizeof( tmpExons[0] ) * tmpECnt ) ; | |
| 472 ++refCnt ; | |
| 473 } | |
| 474 tmpECnt = 0 ; | |
| 475 } | |
| 476 | |
| 477 tmpExons[ tmpECnt ].a = start ; | |
| 478 tmpExons[ tmpECnt ].b = end ; | |
| 479 ++tmpECnt ; | |
| 480 | |
| 481 ref[ refCnt ].strand = strand[0] ; | |
| 482 strcpy( ref[ refCnt ].chrom, chrom ) ; | |
| 483 strcpy( ref[ refCnt ].tid, tid ) ; | |
| 484 } | |
| 485 if ( tmpECnt != 0 ) | |
| 486 { | |
| 487 ReverseExonList( tmpExons, tmpECnt ) ; | |
| 488 tmpECnt = CleanExonList( tmpExons, tmpECnt ) ; | |
| 489 if ( tmpECnt > 1 || !flagMultiExonOnly ) | |
| 490 { | |
| 491 ref[ refCnt ].ecnt = tmpECnt ; | |
| 492 ref[ refCnt ].exons = ( struct _pair * )malloc( sizeof( struct _pair ) * tmpECnt ) ; | |
| 493 memcpy( ref[ refCnt ].exons, tmpExons, sizeof( tmpExons[0] ) * tmpECnt ) ; | |
| 494 ++refCnt ; | |
| 495 } | |
| 496 tmpECnt = 0 ; | |
| 497 } | |
| 498 fclose( fp ) ; | |
| 499 | |
| 500 fp = NULL ; | |
| 501 fp = fopen( argv[2], "r" ) ; | |
| 502 if ( fp == NULL ) | |
| 503 { | |
| 504 printf( "Could not open file %s.\n", argv[2] ) ; | |
| 505 exit( 1 ) ; | |
| 506 } | |
| 507 predCnt = 0 ; | |
| 508 pred = ( struct _transcript *)malloc( MAX_TXPT * sizeof( struct _transcript ) ) ; | |
| 509 | |
| 510 strcpy( pred[0].tid, "tmp_-1" ) ; | |
| 511 tmpECnt = 0 ; | |
| 512 while ( fgets( line, sizeof( line ), fp ) != NULL ) | |
| 513 { | |
| 514 if ( line[0] == '#' ) | |
| 515 continue ; | |
| 516 sscanf( line, "%s %s %s %d %d %s %s", chrom, tool, type, &start, &end, buffer, strand ) ; | |
| 517 | |
| 518 if ( strcmp( type, "exon" ) || !validChrom( chrom ) ) | |
| 519 continue ; | |
| 520 | |
| 521 char *p = strstr( line, "transcript_id" ) ; | |
| 522 //char *p = strstr( line, "gene_name" ) ; | |
| 523 for ( ; *p != ' ' ; ++p ) | |
| 524 ; | |
| 525 p += 2 ; | |
| 526 sscanf( p, "%s", tid ) ; | |
| 527 //tid[ strlen( tid ) - 2 ] = '\0' ; | |
| 528 p = tid + strlen( tid ) ; | |
| 529 while ( *p != '\"' ) | |
| 530 --p ; | |
| 531 *p = '\0' ; | |
| 532 | |
| 533 if ( strcmp( tid, pred[ predCnt ].tid ) && tmpECnt ) | |
| 534 { | |
| 535 ReverseExonList( tmpExons, tmpECnt ) ; | |
| 536 tmpECnt = CleanExonList( tmpExons, tmpECnt ) ; | |
| 537 if ( tmpECnt > 1 || !flagMultiExonOnly ) | |
| 538 { | |
| 539 pred[ predCnt ].ecnt = tmpECnt ; | |
| 540 pred[ predCnt ].exons = ( struct _pair * )malloc( sizeof( struct _pair ) * tmpECnt ) ; | |
| 541 memcpy( pred[ predCnt ].exons, tmpExons, sizeof( tmpExons[0] ) * tmpECnt ) ; | |
| 542 ++predCnt ; | |
| 543 } | |
| 544 tmpECnt = 0 ; | |
| 545 } | |
| 546 | |
| 547 tmpExons[ tmpECnt ].a = start ; | |
| 548 tmpExons[ tmpECnt ].b = end ; | |
| 549 ++tmpECnt ; | |
| 550 | |
| 551 pred[ predCnt ].strand = strand[0] ; | |
| 552 strcpy( pred[ predCnt ].chrom, chrom ) ; | |
| 553 strcpy( pred[ predCnt ].tid, tid ) ; | |
| 554 } | |
| 555 if ( tmpECnt > 0 ) | |
| 556 { | |
| 557 ReverseExonList( tmpExons, tmpECnt ) ; | |
| 558 tmpECnt = CleanExonList( tmpExons, tmpECnt ) ; | |
| 559 if ( tmpECnt > 1 || !flagMultiExonOnly ) | |
| 560 { | |
| 561 pred[ predCnt ].ecnt = tmpECnt ; | |
| 562 pred[ predCnt ].exons = ( struct _pair * )malloc( sizeof( struct _pair ) * tmpECnt ) ; | |
| 563 memcpy( pred[ predCnt ].exons, tmpExons, sizeof( tmpExons[0] ) * tmpECnt ) ; | |
| 564 ++predCnt ; | |
| 565 } | |
| 566 tmpECnt = 0 ; | |
| 567 } | |
| 568 | |
| 569 refCnt = RemoveDuplicateIntronChain( ref, refCnt ) ; | |
| 570 //printf( "predCnt = %d\n", predCnt ) ; | |
| 571 predCnt = RemoveDuplicateIntronChain( pred, predCnt ) ; | |
| 572 //printf( "predCnt = %d\n", predCnt ) ; | |
| 573 qsort( ref, refCnt, sizeof( ref[0] ), TranscriptComp ) ; | |
| 574 qsort( pred, predCnt, sizeof( pred[0] ), TranscriptComp ) ; | |
| 575 | |
| 576 | |
| 577 /*======================================================================================== | |
| 578 Stage 2: Compute the recall and precision. | |
| 579 ========================================================================================*/ | |
| 580 refInfo = ( struct _info * )malloc( refCnt * sizeof( *refInfo ) ) ; | |
| 581 predInfo = ( struct _info * )malloc( predCnt * sizeof( *predInfo ) ) ; | |
| 582 for ( i = 0 ; i < refCnt ; ++i ) | |
| 583 { | |
| 584 refInfo[i].coverage = -1 ; | |
| 585 refInfo[i].byId = -1 ; | |
| 586 #if DEBUG | |
| 587 printf( "=%d %s %s %d\n", i, ref[i].chrom, ref[i].tid, ref[i].ecnt ) ; | |
| 588 for ( j = 0 ; j < ref[i].ecnt ; ++j ) | |
| 589 printf( "%d %d\n", ref[i].exons[j].a, ref[i].exons[j].b ) ; | |
| 590 #endif | |
| 591 | |
| 592 } | |
| 593 for ( i = 0 ; i < predCnt ; ++i ) | |
| 594 { | |
| 595 predInfo[i].coverage = -1 ; | |
| 596 predInfo[i].byId = -1 ; | |
| 597 #if DEBUG | |
| 598 printf( "#%d %s %s %d\n", i, pred[i].chrom, pred[i].tid, pred[i].ecnt ) ; | |
| 599 for ( j = 0 ; j < pred[i].ecnt ; ++j ) | |
| 600 printf( "%d %d\n", pred[i].exons[j].a, pred[i].exons[j].b ) ; | |
| 601 #endif | |
| 602 } | |
| 603 tag = 0 ; | |
| 604 for ( i = 0 ; i < predCnt ; ++i ) | |
| 605 { | |
| 606 while ( tag < refCnt && | |
| 607 ( ( strcmp( ref[tag].chrom, pred[i].chrom ) < 0 ) || | |
| 608 ( !strcmp( ref[tag].chrom, pred[i].chrom ) && | |
| 609 ref[tag].exons[ ref[tag].ecnt - 1 ].b < pred[i].exons[0].a - relaxWidth ) ) ) | |
| 610 ++tag ; | |
| 611 | |
| 612 // if ( i == 16474 ) | |
| 613 // printf( "%d %d %d", i, tag, refCnt ) ; | |
| 614 for ( j = tag ; j < refCnt && ref[j].exons[0].a <= pred[i].exons[ pred[i].ecnt - 1 ].b + relaxWidth && !strcmp( ref[j].chrom, pred[i].chrom ); ++j ) | |
| 615 { | |
| 616 /*if ( !strcmp( pred[i].tid, "CUFF.4256.1" ) ) | |
| 617 { | |
| 618 printf( "%s ? %s\n", pred[i].tid, ref[j].tid ) ; | |
| 619 }*/ | |
| 620 // if ( i == 16474 ) | |
| 621 // { | |
| 622 // printf( "%d %d\n", i, j ) ; | |
| 623 // } | |
| 624 double coverage = CompareTranscripts( ref[j], pred[i] ) ; | |
| 625 if ( coverage > refInfo[j].coverage ) | |
| 626 { | |
| 627 refInfo[j].coverage = coverage ; | |
| 628 refInfo[j].byId = i ; | |
| 629 } | |
| 630 if ( coverage > predInfo[i].coverage ) | |
| 631 { | |
| 632 predInfo[i].coverage = coverage ; | |
| 633 predInfo[i].byId = j ; | |
| 634 } | |
| 635 } | |
| 636 } | |
| 637 | |
| 638 /*======================================================================================== | |
| 639 Stage 3: Dump the information into output files. | |
| 640 ========================================================================================*/ | |
| 641 //sprintf( filename, "grader.%s.recall", argv[1] ) ; | |
| 642 sprintf( filename, "grader.recall" ) ; | |
| 643 fp = fopen( filename, "w" ) ; | |
| 644 for ( i = 0 ; i < refCnt ; ++i ) | |
| 645 { | |
| 646 fprintf( fp, "%s\t%d\t%lf\t%s\n", ref[i].tid, ref[i].ecnt, refInfo[i].coverage, | |
| 647 refInfo[i].byId == -1 ? "-" : pred[ refInfo[i].byId ].tid ) ; | |
| 648 } | |
| 649 fclose( fp ) ; | |
| 650 | |
| 651 //sprintf( filename, "grader.%s.precision", argv[2] ) ; | |
| 652 sprintf( filename, "grader.precision" ) ; | |
| 653 fp = fopen( filename, "w" ) ; | |
| 654 for ( i = 0 ; i < predCnt ; ++i ) | |
| 655 { | |
| 656 fprintf( fp, "%s\t%d\t%lf\t%s\n", pred[i].tid, pred[i].ecnt, predInfo[i].coverage, | |
| 657 predInfo[i].byId == -1 ? "-" : ref[predInfo[i].byId].tid ) ; | |
| 658 } | |
| 659 fclose( fp ) ; | |
| 660 | |
| 661 // print the summary to the stdout | |
| 662 const int binCnt = 20 ; | |
| 663 int bins[binCnt + 1] ; | |
| 664 memset( bins, 0, sizeof( bins ) ) ; | |
| 665 k = 0 ; | |
| 666 for ( i = 0 ; i < refCnt ; ++i ) | |
| 667 { | |
| 668 if ( refInfo[i].coverage == -1 ) | |
| 669 continue ; | |
| 670 ++bins[ (int)( refInfo[i].coverage * binCnt ) ] ; | |
| 671 ++k ; | |
| 672 } | |
| 673 for ( i = 0 ; i <= binCnt ; ++i ) | |
| 674 { | |
| 675 printf( "recall %lf %lf %d/%d\n", (double)i / binCnt, (double)k / refCnt, k, refCnt ) ; | |
| 676 k -= bins[i] ; | |
| 677 } | |
| 678 | |
| 679 memset( bins, 0, sizeof( bins ) ) ; | |
| 680 k = 0 ; | |
| 681 for ( i = 0 ; i < predCnt ; ++i ) | |
| 682 { | |
| 683 if ( predInfo[i].coverage == -1 ) | |
| 684 continue ; | |
| 685 ++bins[ (int)( predInfo[i].coverage * binCnt ) ] ; | |
| 686 ++k ; | |
| 687 } | |
| 688 for ( i = 0 ; i <= binCnt ; ++i ) | |
| 689 { | |
| 690 printf( "precision %lf %lf %d/%d\n", (double)i / binCnt, (double)k / predCnt, k, predCnt ) ; | |
| 691 k -= bins[i] ; | |
| 692 } | |
| 693 | |
| 694 /*======================================================================================== | |
| 695 Stage 4: Evaluations for introns. | |
| 696 ========================================================================================*/ | |
| 697 struct _intron *refIntrons = ( struct _intron * )malloc( sizeof( struct _intron ) * MAX_TXPT ) ; | |
| 698 int riCnt = GetIntrons( ref, refCnt, refIntrons ) ; | |
| 699 struct _intron *predIntrons = ( struct _intron * )malloc( sizeof( struct _intron ) * MAX_TXPT ) ; | |
| 700 int piCnt = GetIntrons( pred, predCnt, predIntrons ) ; | |
| 701 int matchedIntron = 0 ; | |
| 702 /*for ( i = 0 ; i < riCnt ; ++i ) | |
| 703 { | |
| 704 printf( "%d %s %d %d\n", i, refIntrons[i].chrom, refIntrons[i].start, refIntrons[i].end ) ; | |
| 705 }*/ | |
| 706 | |
| 707 fp = fopen( "grader.intron", "w" ) ; | |
| 708 tag = 0 ; | |
| 709 for ( i = 0 ; i < piCnt ; ++i ) | |
| 710 { | |
| 711 bool flag = false ; | |
| 712 while ( 1 ) | |
| 713 { | |
| 714 if ( tag >= riCnt ) | |
| 715 break ; | |
| 716 int tmp = strcmp( refIntrons[tag].chrom, predIntrons[i].chrom ) ; | |
| 717 if ( tmp < 0 || ( tmp == 0 && refIntrons[tag].start < predIntrons[i].start ) ) | |
| 718 ++tag ; | |
| 719 else | |
| 720 break ; | |
| 721 } | |
| 722 //printf( "%d (%s %d %d) %d=>", i, predIntrons[i].chrom, predIntrons[i].start, predIntrons[i].end, tag ) ; | |
| 723 /*if ( predIntrons[i].start == 1613457 ) | |
| 724 printf( "%d %d\n", tag, riCnt ) ;*/ | |
| 725 for ( j = tag ; j < riCnt ; ++j ) | |
| 726 { | |
| 727 if ( strcmp( refIntrons[j].chrom, predIntrons[i].chrom ) > 0 || | |
| 728 refIntrons[j].start > predIntrons[i].start /*+ 10*/ ) | |
| 729 break ; | |
| 730 if ( refIntrons[j].start == predIntrons[i].start && | |
| 731 refIntrons[j].end == predIntrons[i].end ) | |
| 732 //if ( WithinRange( refIntrons[j].start, predIntrons[i].start ) && | |
| 733 // WithinRange( refIntrons[j].end, predIntrons[i].end ) ) | |
| 734 { | |
| 735 ++matchedIntron ; | |
| 736 flag = true ; | |
| 737 break ; | |
| 738 } | |
| 739 } | |
| 740 //printf( "%d\n", j ) ; | |
| 741 if ( flag ) | |
| 742 fprintf( fp, "Y\t%s\t%d\t%d\t%s\n", predIntrons[i].chrom, predIntrons[i].start, predIntrons[i].end, pred[ predIntrons[i].tindex ].tid ) ; | |
| 743 else | |
| 744 fprintf( fp, "N\t%s\t%d\t%d\t%s\n", predIntrons[i].chrom, predIntrons[i].start, predIntrons[i].end, pred[ predIntrons[i].tindex ].tid ) ; | |
| 745 } | |
| 746 fclose( fp ) ; | |
| 747 printf( "\n" ) ; | |
| 748 printf( "Intron evaluation\n") ; | |
| 749 //printf( "\tmatched\t%d\n", matchedIntron ) ; | |
| 750 printf( "\trecall\t%lf\t%d/%d\n", matchedIntron / (double)riCnt, matchedIntron, riCnt ) ; | |
| 751 printf( "\tprecision\t%lf\t%d/%d\n", matchedIntron / (double)piCnt, matchedIntron, piCnt ) ; | |
| 752 | |
| 753 /*======================================================================================== | |
| 754 Stage 4: Evaluations for exons. | |
| 755 ========================================================================================*/ | |
| 756 struct _exon *refExons = ( struct _exon * )malloc( sizeof( struct _exon ) * MAX_TXPT ) ; | |
| 757 int reCnt = GetExons( ref, refCnt, refExons ) ; | |
| 758 struct _exon *predExons = ( struct _exon * )malloc( sizeof( struct _exon ) * MAX_TXPT ) ; | |
| 759 int peCnt = GetExons( pred, predCnt, predExons ) ; | |
| 760 int matchedExons = 0, matchedInternalExons = 0 ; | |
| 761 int refInternalExons = 0, predInternalExons = 0 ; | |
| 762 | |
| 763 for ( i = 0 ; i < reCnt ; ++i ) | |
| 764 { | |
| 765 //printf( "%d %d\n", refExons[i].start, refExons[i].end ) ; | |
| 766 if ( refExons[i].soft == 0 ) | |
| 767 ++refInternalExons ; | |
| 768 //if ( refExons[i].soft == 3 ) | |
| 769 // printf( "hi\n" ) ; | |
| 770 } | |
| 771 for ( i = 0 ; i < peCnt ; ++i ) | |
| 772 if ( predExons[i].soft == 0 ) | |
| 773 ++predInternalExons ; | |
| 774 tag = 0 ; | |
| 775 for ( i = 0 ; i < peCnt ; ++i ) | |
| 776 { | |
| 777 bool flag = false ; | |
| 778 while ( 1 ) | |
| 779 { | |
| 780 if ( tag >= reCnt ) | |
| 781 break ; | |
| 782 int tmp = strcmp( refExons[tag].chrom, predExons[i].chrom ) ; | |
| 783 if ( tmp < 0 || ( tmp == 0 && refExons[tag].end < predExons[i].start ) ) | |
| 784 ++tag ; | |
| 785 else | |
| 786 break ; | |
| 787 } | |
| 788 for ( j = tag ; j < reCnt ; ++j ) | |
| 789 { | |
| 790 if ( strcmp( refExons[j].chrom, predExons[i].chrom ) > 0 || | |
| 791 refExons[j].start > predExons[i].end /*+ 10*/ ) | |
| 792 break ; | |
| 793 if ( refExons[j].soft != predExons[i].soft | |
| 794 || refExons[j].end < predExons[i].start ) | |
| 795 continue ; | |
| 796 | |
| 797 if ( refExons[j].start == predExons[i].start && | |
| 798 refExons[j].end == predExons[i].end && predExons[i].soft == 0 ) | |
| 799 //if ( WithinRange( refIntrons[j].start, predIntrons[i].start ) && | |
| 800 // WithinRange( refIntrons[j].end, predIntrons[i].end ) ) | |
| 801 { | |
| 802 refExons[j].matched = true ; | |
| 803 predExons[i].matched = true ; | |
| 804 ++matchedInternalExons ; | |
| 805 flag = true ; | |
| 806 break ; | |
| 807 } | |
| 808 else if ( ( refExons[j].start == predExons[i].start || ( predExons[i].soft & 2 ) ) | |
| 809 && ( refExons[j].end == predExons[i].end || (predExons[i].soft & 1 ) ) ) | |
| 810 { | |
| 811 refExons[j].matched = true ; | |
| 812 predExons[i].matched = true ; | |
| 813 flag = true ; | |
| 814 //break ; | |
| 815 } | |
| 816 } | |
| 817 //printf( "%d\n", j ) ; | |
| 818 } | |
| 819 | |
| 820 printf( "\n" ) ; | |
| 821 printf( "Exon evaluation\n" ) ; | |
| 822 //printf( "\tmatched\t%d\n", matchedExons ) ; | |
| 823 matchedExons = 0 ; | |
| 824 for ( i = 0 ; i < reCnt ; ++i ) | |
| 825 if ( refExons[i].matched ) | |
| 826 ++matchedExons ; | |
| 827 printf( "\trecall\t%lf\t%d/%d\n", (double)matchedExons / reCnt, matchedExons, reCnt ) ; | |
| 828 matchedExons = 0 ; | |
| 829 for ( i = 0 ; i < peCnt ; ++i ) | |
| 830 if ( predExons[i].matched ) | |
| 831 ++matchedExons ; | |
| 832 printf( "\tprecision\t%lf\t%d/%d\n", (double)matchedExons / peCnt, matchedExons, peCnt ) ; | |
| 833 | |
| 834 printf( "\n" ) ; | |
| 835 printf( "Internal exon evaluation\n" ) ; | |
| 836 //printf( "\tmatched\t%d\n", matchedInternalExons ) ; | |
| 837 printf( "\trecall\t%lf\t%d/%d\n", (double)matchedInternalExons / refInternalExons, matchedInternalExons, refInternalExons ) ; | |
| 838 printf( "\tprecision\t%lf\t%d/%d\n", (double)matchedInternalExons / predInternalExons, matchedInternalExons, predInternalExons ) ; | |
| 839 return 0 ; | |
| 840 } | 
