Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/CombineSubexons.cpp @ 0:903fc43d6227 draft default tip
Uploaded
| author | lsong10 |
|---|---|
| date | Fri, 26 Mar 2021 16:52:45 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:903fc43d6227 |
|---|---|
| 1 #include <stdio.h> | |
| 2 #include <string.h> | |
| 3 | |
| 4 #include <algorithm> | |
| 5 #include <vector> | |
| 6 #include <map> | |
| 7 #include <string> | |
| 8 | |
| 9 #include "alignments.hpp" | |
| 10 #include "blocks.hpp" | |
| 11 #include "stats.hpp" | |
| 12 #include "SubexonGraph.hpp" | |
| 13 | |
| 14 char usage[] = "combineSubexons [options]\n" | |
| 15 "Required options:\n" | |
| 16 "\t-s STRING: the path to the predicted subexon information. Can use multiple -s to specify multiple subexon prediction files\n" | |
| 17 "\t\tor\n" | |
| 18 "\t--ls STRING: the path to file of the list of the predicted subexon information.\n" ; | |
| 19 | |
| 20 struct _overhang | |
| 21 { | |
| 22 int cnt ; // the number of samples support this subexon. | |
| 23 int validCnt ; // The number of samples that are used for compute probability. | |
| 24 int length ; | |
| 25 double classifier ; | |
| 26 } ; | |
| 27 | |
| 28 struct _intronicInfo | |
| 29 { | |
| 30 int chrId ; | |
| 31 int start, end ; | |
| 32 int leftSubexonIdx, rightSubexonIdx ; | |
| 33 double irClassifier ; | |
| 34 int irCnt ; | |
| 35 int validIrCnt ; | |
| 36 struct _overhang leftOverhang, rightOverhang ; // leftOverhangClassifier is for the overhang subexon at the left side of this intron. | |
| 37 } ; | |
| 38 | |
| 39 struct _seInterval | |
| 40 { | |
| 41 int chrId ; | |
| 42 int start, end ; | |
| 43 int type ; // 0-subexon, 1-intronicInfo | |
| 44 int idx ; | |
| 45 } ; | |
| 46 | |
| 47 struct _subexonSplit | |
| 48 { | |
| 49 int chrId ; | |
| 50 int pos ; | |
| 51 int type ; //1-start of a subexon. 2-end of a subexon | |
| 52 int splitType ; //0-soft boundary, 1-start of an exon, 2-end of an exon. | |
| 53 int strand ; | |
| 54 | |
| 55 int weight ; | |
| 56 } ; | |
| 57 | |
| 58 struct _interval // exon or intron | |
| 59 { | |
| 60 int chrId ; | |
| 61 int start, end ; | |
| 62 int strand ; | |
| 63 int sampleSupport ; | |
| 64 } ; | |
| 65 | |
| 66 struct _subexonSupplement // supplement the subexon structure defined in SubexonGraph. | |
| 67 { | |
| 68 int *nextSupport ; | |
| 69 int *prevSupport ; | |
| 70 } ; | |
| 71 | |
| 72 char buffer[4096] ; | |
| 73 | |
| 74 bool CompSubexonSplit( struct _subexonSplit a, struct _subexonSplit b ) | |
| 75 { | |
| 76 if ( a.chrId < b.chrId ) | |
| 77 return true ; | |
| 78 else if ( a.chrId > b.chrId ) | |
| 79 return false ; | |
| 80 else if ( a.pos != b.pos ) | |
| 81 return a.pos < b.pos ; | |
| 82 else if ( a.type != b.type ) | |
| 83 { | |
| 84 // the split site with no strand information should come first. | |
| 85 /*if ( a.splitType != b.splitType ) | |
| 86 { | |
| 87 if ( a.splitType == 0 ) | |
| 88 return true ; | |
| 89 else if ( b.splitType == 0 ) | |
| 90 return false ; | |
| 91 }*/ | |
| 92 return a.type < b.type ; | |
| 93 } | |
| 94 else if ( a.splitType != b.splitType ) | |
| 95 { | |
| 96 //return a.splitType < b.splitType ; | |
| 97 if ( a.splitType == 0 ) | |
| 98 return true ; | |
| 99 else if ( b.splitType == 0 ) | |
| 100 return false ; | |
| 101 | |
| 102 if ( a.type == 1 ) | |
| 103 return a.splitType > b.splitType ; | |
| 104 else | |
| 105 return a.splitType < b.splitType ; | |
| 106 } | |
| 107 | |
| 108 return false ; | |
| 109 } | |
| 110 | |
| 111 bool CompInterval( struct _interval a, struct _interval b ) | |
| 112 { | |
| 113 if ( a.chrId < b.chrId ) | |
| 114 return true ; | |
| 115 else if ( a.chrId > b.chrId ) | |
| 116 return false ; | |
| 117 else if ( a.start != b.start ) | |
| 118 return a.start < b.start ; | |
| 119 else if ( a.end != b.end ) | |
| 120 return a.end < b.end ; | |
| 121 return false ; | |
| 122 } | |
| 123 | |
| 124 bool CompSeInterval( struct _seInterval a, struct _seInterval b ) | |
| 125 { | |
| 126 if ( a.chrId < b.chrId ) | |
| 127 return true ; | |
| 128 else if ( a.chrId > b.chrId ) | |
| 129 return false ; | |
| 130 else if ( a.start < b.start ) | |
| 131 return true ; | |
| 132 else if ( a.start > b.start ) | |
| 133 return false ; | |
| 134 else if ( a.end < b.end ) | |
| 135 return true ; | |
| 136 else | |
| 137 return false ; | |
| 138 } | |
| 139 | |
| 140 // Keep this the same as in SubexonInfo.cpp. | |
| 141 double TransformCov( double c ) | |
| 142 { | |
| 143 double ret ; | |
| 144 //return sqrt( c ) - 1 ; | |
| 145 | |
| 146 if ( c <= 2 + 1e-6 ) | |
| 147 ret = 1e-6 ; | |
| 148 else | |
| 149 ret = c - 2 ; | |
| 150 | |
| 151 return ret ; | |
| 152 } | |
| 153 | |
| 154 double GetUpdateMixtureGammaClassifier( double ratio, double cov, double piRatio, double kRatio[2], double thetaRatio[2], | |
| 155 double piCov, double kCov[2], double thetaCov[2], bool conservative ) | |
| 156 { | |
| 157 double p1 = 0, p2 ; | |
| 158 | |
| 159 cov = TransformCov( cov ) ; | |
| 160 if ( cov < ( kCov[0] - 1 ) * thetaCov[0] ) | |
| 161 cov = ( kCov[0] - 1 ) * thetaCov[0] ; | |
| 162 | |
| 163 if ( ratio > 0 ) | |
| 164 p1 = MixtureGammaAssignment( ratio, piRatio, kRatio, thetaRatio ) ; | |
| 165 // Make sure cov > 1? | |
| 166 p2 = MixtureGammaAssignment( cov, piCov, kCov, thetaCov ) ; | |
| 167 double ret = 0 ; | |
| 168 | |
| 169 if ( conservative ) | |
| 170 { | |
| 171 if ( p1 >= p2 ) // we should use ratio. | |
| 172 ret = LogGammaDensity( ratio, kRatio[1], thetaRatio[1] ) | |
| 173 - LogGammaDensity( ratio, kRatio[0], thetaRatio[0] ) ; | |
| 174 else | |
| 175 ret = LogGammaDensity( cov, kCov[1], thetaCov[1] ) | |
| 176 - LogGammaDensity( cov, kCov[0], thetaCov[0] ) ; | |
| 177 } | |
| 178 else | |
| 179 { | |
| 180 if ( p1 >= p2 ) // we should use ratio. | |
| 181 ret = LogGammaDensity( ratio, kRatio[1], thetaRatio[1] ) | |
| 182 - LogGammaDensity( ratio, kRatio[0], thetaRatio[0] ) ; | |
| 183 else | |
| 184 ret = LogGammaDensity( cov, kCov[1], thetaCov[1] ) | |
| 185 - LogGammaDensity( cov, kCov[0], thetaCov[0] ) ; | |
| 186 } | |
| 187 return ret ; | |
| 188 } | |
| 189 | |
| 190 double GetPValueOfGeneEnd( double cov ) | |
| 191 { | |
| 192 if ( cov <= 2.0 ) | |
| 193 return 1.0 ; | |
| 194 double tmp = 2.0 * ( sqrt( cov ) - log( cov ) ) ; | |
| 195 if ( tmp <= 0 ) | |
| 196 return 1.0 ; | |
| 197 return 2.0 * alnorm( tmp, true ) ; | |
| 198 } | |
| 199 | |
| 200 char StrandNumToSymbol( int strand ) | |
| 201 { | |
| 202 if ( strand > 0 ) | |
| 203 return '+' ; | |
| 204 else if ( strand < 0 ) | |
| 205 return '-' ; | |
| 206 else | |
| 207 return '.' ; | |
| 208 } | |
| 209 | |
| 210 int StrandSymbolToNum( char c ) | |
| 211 { | |
| 212 if ( c == '+' ) | |
| 213 return 1 ; | |
| 214 else if ( c == '-' ) | |
| 215 return -1 ; | |
| 216 else | |
| 217 return 0 ; | |
| 218 } | |
| 219 | |
| 220 int *MergePositions( int *old, int ocnt, int *add, int acnt, int &newCnt ) //, int **support ) | |
| 221 { | |
| 222 int i, j, k ; | |
| 223 int *ret ; | |
| 224 if ( acnt == 0 ) | |
| 225 { | |
| 226 newCnt = ocnt ; | |
| 227 return old ; | |
| 228 } | |
| 229 if ( ocnt == 0 ) | |
| 230 { | |
| 231 newCnt = acnt ; | |
| 232 ret = new int[acnt] ; | |
| 233 //*support = new int[acnt] ; | |
| 234 for ( i = 0 ; i < acnt ; ++i ) | |
| 235 { | |
| 236 //(*support)[i] = 1 ; | |
| 237 ret[i] = add[i] ; | |
| 238 } | |
| 239 return ret ; | |
| 240 } | |
| 241 newCnt = 0 ; | |
| 242 for ( i = 0, j = 0 ; i < ocnt && j < acnt ; ) | |
| 243 { | |
| 244 if ( old[i] < add[j] ) | |
| 245 { | |
| 246 ++i ; | |
| 247 ++newCnt ; | |
| 248 } | |
| 249 else if ( old[i] == add[j] ) | |
| 250 { | |
| 251 ++i ; ++j ; | |
| 252 ++newCnt ; | |
| 253 } | |
| 254 else | |
| 255 { | |
| 256 ++j ; | |
| 257 ++newCnt ; | |
| 258 } | |
| 259 } | |
| 260 newCnt = newCnt + ( ocnt - i ) + ( acnt - j ) ; | |
| 261 // no new elements. | |
| 262 if ( newCnt == ocnt ) | |
| 263 { | |
| 264 /*i = 0 ; | |
| 265 for ( j = 0 ; j < acnt ; ++j ) | |
| 266 { | |
| 267 for ( ; old[i] < add[j] ; ++i ) | |
| 268 ; | |
| 269 ++(*support)[i] ; | |
| 270 }*/ | |
| 271 return old ; | |
| 272 } | |
| 273 k = 0 ; | |
| 274 //delete []old ; | |
| 275 ret = new int[ newCnt ] ; | |
| 276 //int *bufferSupport = new int[newCnt] ; | |
| 277 for ( i = 0, j = 0 ; i < ocnt && j < acnt ; ) | |
| 278 { | |
| 279 if ( old[i] < add[j] ) | |
| 280 { | |
| 281 ret[k] = old[i] ; | |
| 282 //bufferSupport[k] = (*support)[i] ; | |
| 283 ++i ; | |
| 284 ++k ; | |
| 285 } | |
| 286 else if ( old[i] == add[j] ) | |
| 287 { | |
| 288 ret[k] = old[i] ; | |
| 289 //bufferSupport[k] = (*support)[i] + 1 ; | |
| 290 ++i ; ++j ; | |
| 291 ++k ; | |
| 292 } | |
| 293 else | |
| 294 { | |
| 295 ret[k] = add[j] ; | |
| 296 //bufferSupport[k] = 1 ; | |
| 297 ++j ; | |
| 298 ++k ; | |
| 299 } | |
| 300 } | |
| 301 for ( ; i < ocnt ; ++i, ++k ) | |
| 302 { | |
| 303 ret[k] = old[i] ; | |
| 304 //bufferSupport[k] = (*support)[i] ; | |
| 305 } | |
| 306 for ( ; j < acnt ; ++j, ++k ) | |
| 307 { | |
| 308 ret[k] = add[j] ; | |
| 309 //bufferSupport[k] = 1 ; | |
| 310 } | |
| 311 delete[] old ; | |
| 312 //delete[] *support ; | |
| 313 //*support = bufferSupport ; | |
| 314 return ret ; | |
| 315 } | |
| 316 | |
| 317 void CoalesceSubexonSplits( std::vector<struct _subexonSplit> &splits, int mid ) | |
| 318 { | |
| 319 int i, j, k ; | |
| 320 int cnt = splits.size() ; | |
| 321 //std::sort( splits.begin(), splits.end(), CompSubexonSplit ) ; | |
| 322 | |
| 323 std::vector<struct _subexonSplit> sortedSplits ; | |
| 324 sortedSplits.resize( cnt ) ; | |
| 325 | |
| 326 k = 0 ; | |
| 327 for ( i = 0, j = mid ; i < mid && j < cnt ; ++k ) | |
| 328 { | |
| 329 if ( CompSubexonSplit( splits[i], splits[j] ) ) | |
| 330 { | |
| 331 sortedSplits[k] = splits[i] ; | |
| 332 ++i ; | |
| 333 } | |
| 334 else | |
| 335 { | |
| 336 sortedSplits[k] = splits[j] ; | |
| 337 ++j ; | |
| 338 } | |
| 339 } | |
| 340 for ( ; i < mid ; ++i, ++k ) | |
| 341 sortedSplits[k] = splits[i] ; | |
| 342 for ( ; j < cnt ; ++j, ++k ) | |
| 343 sortedSplits[k] = splits[j] ; | |
| 344 splits = sortedSplits ; | |
| 345 | |
| 346 k = 0 ; | |
| 347 for ( i = 1 ; i < cnt ; ++i ) | |
| 348 { | |
| 349 if ( splits[i].chrId == splits[k].chrId && splits[i].pos == splits[k].pos && splits[i].type == splits[k].type && splits[i].splitType == splits[k].splitType | |
| 350 && splits[i].strand == splits[k].strand ) | |
| 351 { | |
| 352 splits[k].weight += splits[i].weight ; | |
| 353 } | |
| 354 else | |
| 355 { | |
| 356 ++k ; | |
| 357 splits[k] = splits[i] ; | |
| 358 } | |
| 359 } | |
| 360 splits.resize( k + 1 ) ; | |
| 361 } | |
| 362 | |
| 363 void CoalesceDifferentStrandSubexonSplits( std::vector<struct _subexonSplit> &splits ) | |
| 364 { | |
| 365 int i, j, k, l ; | |
| 366 int cnt = splits.size() ; | |
| 367 k = 0 ; | |
| 368 for ( i = 0 ; i < cnt ; ) | |
| 369 { | |
| 370 for ( j = i + 1 ; j < cnt ; ++j ) | |
| 371 { | |
| 372 if ( splits[i].chrId == splits[j].chrId && splits[i].pos == splits[j].pos && splits[i].type == splits[j].type && splits[i].splitType == splits[j].splitType ) | |
| 373 continue ; | |
| 374 break ; | |
| 375 } | |
| 376 | |
| 377 int maxWeight = -1 ; | |
| 378 int weightSum = 0 ; | |
| 379 int strand = splits[i].strand ; | |
| 380 for ( l = i ; l < j ; ++l ) | |
| 381 { | |
| 382 weightSum += splits[l].weight ; | |
| 383 if ( splits[l].strand != 0 && splits[l].weight > maxWeight ) | |
| 384 { | |
| 385 strand = splits[l].strand ; | |
| 386 maxWeight = splits[l].weight ; | |
| 387 } | |
| 388 } | |
| 389 //printf( "%d: %d %d %d\n", splits[i].pos, i, j, strand ) ; | |
| 390 splits[k] = splits[i] ; | |
| 391 splits[k].strand = strand ; | |
| 392 splits[k].weight = weightSum ; | |
| 393 ++k ; | |
| 394 | |
| 395 i = j ; | |
| 396 } | |
| 397 splits.resize( k ) ; | |
| 398 } | |
| 399 | |
| 400 | |
| 401 void CoalesceIntervals( std::vector<struct _interval> &intervals ) | |
| 402 { | |
| 403 int i, k ; | |
| 404 std::sort( intervals.begin(), intervals.end(), CompInterval ) ; | |
| 405 int cnt = intervals.size() ; | |
| 406 k = 0 ; | |
| 407 for ( i = 1 ; i < cnt ; ++i ) | |
| 408 { | |
| 409 if ( intervals[i].chrId == intervals[k].chrId && intervals[i].start == intervals[k].start && intervals[i].end == intervals[k].end ) | |
| 410 intervals[k].sampleSupport += intervals[i].sampleSupport ; | |
| 411 else | |
| 412 { | |
| 413 ++k ; | |
| 414 intervals[k] = intervals[i] ; | |
| 415 } | |
| 416 } | |
| 417 intervals.resize( k + 1 ) ; | |
| 418 } | |
| 419 | |
| 420 void CleanIntervalIrOverhang( std::vector<struct _interval> &irOverhang ) | |
| 421 { | |
| 422 int i, j, k ; | |
| 423 std::sort( irOverhang.begin(), irOverhang.end(), CompInterval ) ; | |
| 424 | |
| 425 int cnt = irOverhang.size() ; | |
| 426 for ( i = 0 ; i < cnt ; ++i ) | |
| 427 { | |
| 428 if ( irOverhang[i].start == -1 ) | |
| 429 continue ; | |
| 430 | |
| 431 | |
| 432 // locate the longest interval start at the same coordinate. | |
| 433 int tag = i ; | |
| 434 | |
| 435 for ( j = i + 1 ; j < cnt ; ++j ) | |
| 436 { | |
| 437 if ( irOverhang[j].chrId != irOverhang[i].chrId || irOverhang[j].start != irOverhang[i].start ) | |
| 438 break ; | |
| 439 if ( irOverhang[j].start == -1 ) | |
| 440 continue ; | |
| 441 tag = j ; | |
| 442 } | |
| 443 | |
| 444 for ( k = i ; k < tag ; ++k ) | |
| 445 { | |
| 446 irOverhang[k].start = -1 ; | |
| 447 } | |
| 448 | |
| 449 for ( k = tag + 1 ; k < cnt ; ++k ) | |
| 450 { | |
| 451 if ( irOverhang[k].chrId != irOverhang[tag].chrId || irOverhang[k].start > irOverhang[tag].end ) | |
| 452 break ; | |
| 453 if ( irOverhang[k].end <= irOverhang[tag].end ) | |
| 454 { | |
| 455 irOverhang[k].start = -1 ; | |
| 456 } | |
| 457 } | |
| 458 } | |
| 459 | |
| 460 k = 0 ; | |
| 461 for ( i = 0 ; i < cnt ; ++i ) | |
| 462 { | |
| 463 if ( irOverhang[i].start == -1 ) | |
| 464 continue ; | |
| 465 irOverhang[k] = irOverhang[i] ; | |
| 466 ++k ; | |
| 467 } | |
| 468 irOverhang.resize( k ) ; | |
| 469 } | |
| 470 | |
| 471 // Remove the connection that does not match the boundary | |
| 472 // of subexons. | |
| 473 void CleanUpSubexonConnections( std::vector<struct _subexon> &subexons ) | |
| 474 { | |
| 475 int seCnt = subexons.size() ; | |
| 476 int i, j, k, m ; | |
| 477 for ( i = 0 ; i < seCnt ; ++i ) | |
| 478 { | |
| 479 if ( subexons[i].prevCnt > 0 ) | |
| 480 { | |
| 481 for ( k = i ; k >= 0 ; --k ) | |
| 482 if ( subexons[k].chrId != subexons[i].chrId || subexons[k].end <= subexons[i].prev[0] ) | |
| 483 break ; | |
| 484 if ( subexons[k].chrId != subexons[i].chrId ) | |
| 485 ++k ; | |
| 486 m = 0 ; | |
| 487 for ( j = 0 ; j < subexons[i].prevCnt ; ++j ) | |
| 488 { | |
| 489 for ( ; k <= i ; ++k ) | |
| 490 if ( subexons[k].end >= subexons[i].prev[j] ) | |
| 491 break ; | |
| 492 if ( subexons[k].end == subexons[i].prev[j] | |
| 493 && ( SubexonGraph::IsSameStrand( subexons[k].rightStrand, subexons[i].leftStrand ) || subexons[k].end + 1 == subexons[i].start ) ) | |
| 494 { | |
| 495 subexons[i].prev[m] = subexons[i].prev[j] ; | |
| 496 ++m ; | |
| 497 } | |
| 498 } | |
| 499 subexons[i].prevCnt = m ; | |
| 500 } | |
| 501 | |
| 502 m = 0 ; | |
| 503 k = i ; | |
| 504 for ( j = 0 ; j < subexons[i].nextCnt ; ++j ) | |
| 505 { | |
| 506 for ( ; k < seCnt ; ++k ) | |
| 507 if ( subexons[k].chrId != subexons[i].chrId || subexons[k].start >= subexons[i].next[j] ) | |
| 508 break ; | |
| 509 if ( subexons[k].start == subexons[i].next[j] | |
| 510 && ( SubexonGraph::IsSameStrand( subexons[i].rightStrand, subexons[k].leftStrand ) || subexons[i].end + 1 == subexons[k].start ) ) | |
| 511 { | |
| 512 subexons[i].next[m] = subexons[i].next[j] ; | |
| 513 ++m ; | |
| 514 } | |
| 515 } | |
| 516 subexons[i].nextCnt = m ; | |
| 517 } | |
| 518 } | |
| 519 | |
| 520 int main( int argc, char *argv[] ) | |
| 521 { | |
| 522 int i, j, k ; | |
| 523 FILE *fp ; | |
| 524 std::vector<char *> files ; | |
| 525 | |
| 526 Blocks regions ; | |
| 527 Alignments alignments ; | |
| 528 | |
| 529 if ( argc == 1 ) | |
| 530 { | |
| 531 printf( "%s", usage ) ; | |
| 532 return 0 ; | |
| 533 } | |
| 534 | |
| 535 for ( i = 1 ; i < argc ; ++i ) | |
| 536 { | |
| 537 if ( !strcmp( argv[i], "-s" ) ) | |
| 538 { | |
| 539 files.push_back( argv[i + 1] ) ; | |
| 540 ++i ; | |
| 541 continue ; | |
| 542 } | |
| 543 else if ( !strcmp( argv[i], "--ls" ) ) | |
| 544 { | |
| 545 FILE *fpLs = fopen( argv[i + 1], "r" ) ; | |
| 546 char buffer[1024] ; | |
| 547 while ( fgets( buffer, sizeof( buffer ), fpLs ) != NULL ) | |
| 548 { | |
| 549 int len = strlen( buffer ) ; | |
| 550 if ( buffer[len - 1] == '\n' ) | |
| 551 { | |
| 552 buffer[len - 1] = '\0' ; | |
| 553 --len ; | |
| 554 | |
| 555 } | |
| 556 char *fileName = strdup( buffer ) ; | |
| 557 files.push_back( fileName ) ; | |
| 558 } | |
| 559 } | |
| 560 } | |
| 561 int fileCnt = files.size() ; | |
| 562 // Obtain the chromosome ids through bam file. | |
| 563 fp = fopen( files[0], "r" ) ; | |
| 564 if ( fgets( buffer, sizeof( buffer ), fp ) != NULL ) | |
| 565 { | |
| 566 int len = strlen( buffer ) ; | |
| 567 buffer[len - 1] = '\0' ; | |
| 568 alignments.Open( buffer + 1 ) ; | |
| 569 } | |
| 570 fclose( fp ) ; | |
| 571 | |
| 572 // Collect the split sites of subexons. | |
| 573 std::vector<struct _subexonSplit> subexonSplits ; | |
| 574 std::vector<struct _interval> intervalIrOverhang ; // intervals contains ir and overhang. | |
| 575 std::vector<struct _interval> introns ; | |
| 576 std::vector<struct _interval> exons ; | |
| 577 | |
| 578 | |
| 579 for ( k = 0 ; k < fileCnt ; ++k ) | |
| 580 { | |
| 581 fp = fopen( files[k], "r" ) ; | |
| 582 struct _subexon se ; | |
| 583 struct _subexonSplit sp ; | |
| 584 char chrName[50] ; | |
| 585 int origSize = subexonSplits.size() ; | |
| 586 while ( fgets( buffer, sizeof( buffer), fp ) != NULL ) | |
| 587 { | |
| 588 if ( buffer[0] == '#' ) | |
| 589 continue ; | |
| 590 | |
| 591 SubexonGraph::InputSubexon( buffer, alignments, se, false) ; | |
| 592 // Record all the intron rentention, overhang from the samples | |
| 593 if ( ( se.leftType == 2 && se.rightType == 1 ) | |
| 594 || ( se.leftType == 2 && se.rightType == 0 ) | |
| 595 || ( se.leftType == 0 && se.rightType == 1 ) ) | |
| 596 { | |
| 597 struct _interval si ; | |
| 598 si.chrId = se.chrId ; | |
| 599 si.start = se.start ; | |
| 600 si.end = se.end ; | |
| 601 | |
| 602 intervalIrOverhang.push_back( si ) ; | |
| 603 } | |
| 604 | |
| 605 // Ignore overhang subexons and ir subexons for now. | |
| 606 if ( ( se.leftType == 0 && se.rightType == 1 ) | |
| 607 || ( se.leftType == 2 && se.rightType == 0 ) | |
| 608 || ( se.leftType == 2 && se.rightType == 1 ) ) | |
| 609 continue ; | |
| 610 | |
| 611 if ( se.leftType == 0 && se.rightType == 0 && ( se.leftClassifier == -1 || se.leftClassifier == 1 ) ) // ignore noisy single-exon island | |
| 612 continue ; | |
| 613 if ( se.leftType == 0 && se.rightType == 0 && ( fileCnt >= 10 && se.leftClassifier > 0.99 ) ) | |
| 614 continue ; | |
| 615 | |
| 616 if ( se.leftType == 1 && se.rightType == 2 ) // a full exon, we allow mixtured strand here. | |
| 617 { | |
| 618 struct _interval ni ; | |
| 619 ni.chrId = se.chrId ; | |
| 620 ni.start = se.start ; | |
| 621 ni.end = se.end ; | |
| 622 ni.strand = se.rightStrand ; | |
| 623 ni.sampleSupport = 1 ; | |
| 624 exons.push_back( ni ) ; | |
| 625 } | |
| 626 | |
| 627 | |
| 628 /*for ( i = 0 ; i < se.nextCnt ; ++i ) | |
| 629 { | |
| 630 struct _interval ni ; | |
| 631 ni.chrId = se.chrId ; | |
| 632 ni.start = se.end ; | |
| 633 ni.end = se.next[i] ; | |
| 634 ni.strand = se.rightStrand ; | |
| 635 ni.sampleSupport = 1 ; | |
| 636 if ( ni.start + 1 < ni.end ) | |
| 637 introns.push_back( ni ) ; | |
| 638 }*/ | |
| 639 | |
| 640 sp.chrId = se.chrId ; | |
| 641 sp.pos = se.start ; | |
| 642 sp.type = 1 ; | |
| 643 sp.splitType = se.leftType ; | |
| 644 sp.strand = se.leftStrand ; | |
| 645 sp.weight = 1 ; | |
| 646 subexonSplits.push_back( sp ) ; | |
| 647 | |
| 648 sp.chrId = se.chrId ; | |
| 649 sp.pos = se.end ; | |
| 650 sp.type = 2 ; | |
| 651 sp.splitType = se.rightType ; | |
| 652 sp.strand = se.rightStrand ; | |
| 653 sp.weight = 1 ; | |
| 654 subexonSplits.push_back( sp ) ; | |
| 655 | |
| 656 /*if ( se.prevCnt > 0 ) | |
| 657 delete[] se.prev ; | |
| 658 if ( se.nextCnt > 0 ) | |
| 659 delete[] se.next ;*/ | |
| 660 } | |
| 661 CoalesceIntervals( exons ) ; | |
| 662 CoalesceIntervals( introns ) ; | |
| 663 CoalesceSubexonSplits( subexonSplits, origSize ) ; | |
| 664 CleanIntervalIrOverhang( intervalIrOverhang ) ; | |
| 665 | |
| 666 fclose( fp ) ; | |
| 667 } | |
| 668 | |
| 669 CoalesceDifferentStrandSubexonSplits( subexonSplits ) ; | |
| 670 | |
| 671 // Obtain the split sites from the introns. | |
| 672 int intronCnt = introns.size() ; | |
| 673 std::vector<struct _subexonSplit> intronSplits ; | |
| 674 for ( i = 0 ; i < intronCnt ; ++i ) | |
| 675 { | |
| 676 /*if ( introns[i].sampleSupport < 0.05 * fileCnt ) | |
| 677 { | |
| 678 continue ; | |
| 679 }*/ | |
| 680 struct _interval &it = introns[i] ; | |
| 681 struct _subexonSplit sp ; | |
| 682 sp.chrId = it.chrId ; | |
| 683 sp.pos = it.start ; | |
| 684 sp.type = 2 ; | |
| 685 sp.splitType = 2 ; | |
| 686 sp.strand = it.strand ; | |
| 687 intronSplits.push_back( sp ) ; | |
| 688 | |
| 689 sp.chrId = it.chrId ; | |
| 690 sp.pos = it.end ; | |
| 691 sp.type = 1 ; | |
| 692 sp.splitType = 1 ; | |
| 693 sp.strand = it.strand ; | |
| 694 intronSplits.push_back( sp ) ; | |
| 695 } | |
| 696 | |
| 697 // Pair up the split sites to get subexons | |
| 698 std::sort( intronSplits.begin(), intronSplits.end(), CompSubexonSplit ) ; | |
| 699 //std::sort( subexonSplits.begin(), subexonSplits.end(), CompSubexonSplit ) ; | |
| 700 | |
| 701 // Convert the hard boundary to soft boundary if the split sites is filtered from the introns | |
| 702 // Seems NO need to do this now. | |
| 703 int splitCnt = subexonSplits.size() ; | |
| 704 int intronSplitCnt = intronSplits.size() ; | |
| 705 k = 0 ; | |
| 706 //for ( i = 0 ; i < splitCnt ; ++i ) | |
| 707 while ( 0 ) | |
| 708 { | |
| 709 if ( subexonSplits[i].type != subexonSplits[i].splitType ) | |
| 710 continue ; | |
| 711 | |
| 712 while ( k < intronSplitCnt && ( intronSplits[k].chrId < subexonSplits[i].chrId | |
| 713 || ( intronSplits[k].chrId == subexonSplits[i].chrId && intronSplits[k].pos < subexonSplits[i].pos ) ) ) | |
| 714 ++k ; | |
| 715 j = k ; | |
| 716 while ( j < intronSplitCnt && intronSplits[j].chrId == subexonSplits[i].chrId | |
| 717 && intronSplits[j].pos == subexonSplits[i].pos && intronSplits[j].splitType != subexonSplits[i].splitType ) | |
| 718 ++j ; | |
| 719 | |
| 720 // the split site is filtered. | |
| 721 if ( j >= intronSplitCnt || intronSplits[j].chrId != subexonSplits[i].chrId || | |
| 722 intronSplits[j].pos > subexonSplits[i].pos ) | |
| 723 { | |
| 724 //printf( "%d %d. %d %d\n", subexonSplits[i].pos, intronSplits[j].pos, intronSplits[j].chrId , subexonSplits[i].chrId ) ; | |
| 725 subexonSplits[i].splitType = 0 ; | |
| 726 | |
| 727 // Convert the adjacent subexon split. | |
| 728 for ( int l = i + 1 ; i < splitCnt && subexonSplits[l].chrId == subexonSplits[i].chrId | |
| 729 && subexonSplits[l].pos == subexonSplits[i].pos + 1 ; ++l ) | |
| 730 { | |
| 731 if ( subexonSplits[l].type != subexonSplits[i].type | |
| 732 && subexonSplits[l].splitType == subexonSplits[i].splitType ) | |
| 733 { | |
| 734 subexonSplits[l].splitType = 0 ; | |
| 735 } | |
| 736 } | |
| 737 | |
| 738 // And the other direction | |
| 739 for ( int l = i - 1 ; l >= 0 && subexonSplits[l].chrId == subexonSplits[i].chrId | |
| 740 && subexonSplits[l].pos == subexonSplits[i].pos - 1 ; --l ) | |
| 741 { | |
| 742 if ( subexonSplits[l].type != subexonSplits[i].type | |
| 743 && subexonSplits[l].splitType == subexonSplits[i].splitType ) | |
| 744 { | |
| 745 subexonSplits[l].splitType = 0 ; | |
| 746 } | |
| 747 } | |
| 748 } | |
| 749 } | |
| 750 intronSplits.clear() ; | |
| 751 std::vector<struct _subexonSplit>().swap( intronSplits ) ; | |
| 752 | |
| 753 // Force the soft boundary that collides with hard boundaries to be hard boundary. | |
| 754 for ( i = 0 ; i < splitCnt ; ++i ) | |
| 755 { | |
| 756 if ( subexonSplits[i].splitType != 0 ) | |
| 757 continue ; | |
| 758 int newSplitType = 0 ; | |
| 759 int newStrand = subexonSplits[i].strand ; | |
| 760 for ( j = i + 1 ; j < splitCnt ; ++j ) | |
| 761 { | |
| 762 if ( subexonSplits[i].type != subexonSplits[j].type || subexonSplits[i].pos != subexonSplits[j].pos || | |
| 763 subexonSplits[i].chrId != subexonSplits[j].chrId ) | |
| 764 break ; | |
| 765 if ( subexonSplits[j].splitType != 0 ) | |
| 766 { | |
| 767 newSplitType = subexonSplits[j].splitType ; | |
| 768 newStrand = subexonSplits[j].strand ; | |
| 769 break ; | |
| 770 } | |
| 771 } | |
| 772 | |
| 773 if ( newSplitType == 0 ) | |
| 774 { | |
| 775 for ( j = i - 1 ; j >= 0 ; --j ) | |
| 776 { | |
| 777 if ( subexonSplits[i].type != subexonSplits[j].type || subexonSplits[i].pos != subexonSplits[j].pos || | |
| 778 subexonSplits[i].chrId != subexonSplits[j].chrId ) | |
| 779 break ; | |
| 780 if ( subexonSplits[j].splitType != 0 ) | |
| 781 { | |
| 782 newSplitType = subexonSplits[j].splitType ; | |
| 783 newStrand = subexonSplits[j].strand ; | |
| 784 break ; | |
| 785 } | |
| 786 } | |
| 787 | |
| 788 } | |
| 789 /*if ( subexonSplits[i].pos == 154464157 ) | |
| 790 { | |
| 791 printf( "force conversion: %d %d %d. %d %d\n", subexonSplits[i].pos, subexonSplits[i].splitType, subexonSplits[i].weight, subexonSplits[i + 1].pos, subexonSplits[i + 1].splitType ) ; | |
| 792 }*/ | |
| 793 subexonSplits[i].splitType = newSplitType ; | |
| 794 subexonSplits[i].strand = newStrand ; | |
| 795 } | |
| 796 | |
| 797 /*for ( i = 0 ; i < splitCnt ; ++i ) | |
| 798 { | |
| 799 printf( "%d: type=%d splitType=%d weight=%d\n", subexonSplits[i].pos, subexonSplits[i].type, subexonSplits[i].splitType, subexonSplits[i].weight ) ; | |
| 800 }*/ | |
| 801 | |
| 802 // Build subexons from the collected split sites. | |
| 803 | |
| 804 std::vector<struct _subexon> subexons ; | |
| 805 int diffCnt = 0 ; // |start of subexon split| - |end of subexon split| | |
| 806 int seCnt = 0 ; | |
| 807 for ( i = 0 ; i < splitCnt - 1 ; ++i ) | |
| 808 { | |
| 809 struct _subexon se ; | |
| 810 /*if ( subexonSplits[i + 1].pos == 144177260 ) | |
| 811 { | |
| 812 printf( "%d %d %d: %d %d %d. %d\n", subexonSplits[i].pos, subexonSplits[i].type, subexonSplits[i].splitType, | |
| 813 subexonSplits[i + 1].pos, subexonSplits[i + 1].type, subexonSplits[i + 1].splitType, diffCnt ) ; | |
| 814 }*/ | |
| 815 | |
| 816 if ( subexonSplits[i].type == 1 ) | |
| 817 diffCnt += subexonSplits[i].weight ; | |
| 818 else | |
| 819 diffCnt -= subexonSplits[i].weight ; | |
| 820 | |
| 821 if ( subexonSplits[i + 1].chrId != subexonSplits[i].chrId ) | |
| 822 { | |
| 823 diffCnt = 0 ; | |
| 824 continue ; | |
| 825 } | |
| 826 | |
| 827 if ( diffCnt == 0 ) // the interval between subexon | |
| 828 continue ; | |
| 829 | |
| 830 se.chrId = subexonSplits[i].chrId ; | |
| 831 se.start = subexonSplits[i].pos ; | |
| 832 se.leftType = subexonSplits[i].splitType ; | |
| 833 se.leftStrand = subexonSplits[i].strand ; | |
| 834 if ( subexonSplits[i].type == 2 ) | |
| 835 { | |
| 836 se.leftStrand = 0 ; | |
| 837 ++se.start ; | |
| 838 } | |
| 839 | |
| 840 se.end = subexonSplits[i + 1].pos ; | |
| 841 se.rightType = subexonSplits[i + 1].splitType ; | |
| 842 se.rightStrand = subexonSplits[i + 1].strand ; | |
| 843 if ( subexonSplits[i + 1].type == 1 ) | |
| 844 { | |
| 845 se.rightStrand = 0 ; | |
| 846 --se.end ; | |
| 847 } | |
| 848 | |
| 849 /*if ( se.end == 24613649 ) | |
| 850 { | |
| 851 //printf( "%d %d %d: %d %d %d. %d\n", subexonSplits[i].pos, subexonSplits[i].type, subexonSplits[i].splitType, | |
| 852 // subexonSplits[i + 1].pos, subexonSplits[i + 1].type, subexonSplits[i + 1].splitType, diffCnt ) ; | |
| 853 printf( "%d %d %d. %d %d %d\n", se.start, se.leftType, se.leftStrand, se.end, se.rightType, se.rightStrand ) ; | |
| 854 }*/ | |
| 855 | |
| 856 if ( se.start > se.end ) //Note: this handles the case of repeated subexon split. | |
| 857 { | |
| 858 // handle the case in sample 0: [...[..] | |
| 859 // in sample 1: [..]...] | |
| 860 if ( seCnt > 0 && se.end == subexons[seCnt - 1].end && subexons[seCnt - 1].rightType < se.rightType ) | |
| 861 { | |
| 862 subexons[seCnt - 1].rightType = se.rightType ; | |
| 863 subexons[seCnt - 1].rightStrand = se.rightStrand ; | |
| 864 } | |
| 865 continue ; | |
| 866 } | |
| 867 se.leftClassifier = se.rightClassifier = 0 ; | |
| 868 se.lcCnt = se.rcCnt = 0 ; | |
| 869 | |
| 870 /*if ( 1 ) //se.chrId == 25 ) | |
| 871 { | |
| 872 printf( "%d: %d-%d: %d %d\n", se.chrId, se.start, se.end, se.leftType, se.rightType ) ; | |
| 873 }*/ | |
| 874 | |
| 875 | |
| 876 se.next = se.prev = NULL ; | |
| 877 se.nextCnt = se.prevCnt = 0 ; | |
| 878 subexons.push_back( se ) ; | |
| 879 ++seCnt ; | |
| 880 } | |
| 881 subexonSplits.clear() ; | |
| 882 std::vector<struct _subexonSplit>().swap( subexonSplits ) ; | |
| 883 | |
| 884 // Adjust the split type. | |
| 885 seCnt = subexons.size() ; | |
| 886 for ( i = 1 ; i < seCnt ; ++i ) | |
| 887 { | |
| 888 if ( subexons[i - 1].end + 1 == subexons[i].start ) | |
| 889 { | |
| 890 if ( subexons[i - 1].rightType == 0 ) | |
| 891 subexons[i - 1].rightType = subexons[i].leftType ; | |
| 892 if ( subexons[i].leftType == 0 ) | |
| 893 subexons[i].leftType = subexons[i - 1].rightType ; | |
| 894 } | |
| 895 } | |
| 896 | |
| 897 // Merge the adjacent soft boundaries | |
| 898 std::vector<struct _subexon> rawSubexons = subexons ; | |
| 899 int exonCnt = exons.size() ; | |
| 900 subexons.clear() ; | |
| 901 | |
| 902 k = 0 ; // hold index for exon. | |
| 903 for ( i = 0 ; i < seCnt ; ) | |
| 904 { | |
| 905 /*if ( rawSubexons[k].rightType == 0 && rawSubexons[i].leftType == 0 | |
| 906 && rawSubexons[k].end + 1 == rawSubexons[i].start ) | |
| 907 { | |
| 908 rawSubexons[k].end = rawSubexons[i].end ; | |
| 909 rawSubexons[k].rightType = rawSubexons[i].rightType ; | |
| 910 rawSubexons[k].rightStrand = rawSubexons[i].rightStrand ; | |
| 911 } | |
| 912 else | |
| 913 { | |
| 914 subexons.push_back( rawSubexons[k] ) ; | |
| 915 k = i ; | |
| 916 }*/ | |
| 917 | |
| 918 while ( k < exonCnt && ( exons[k].chrId < rawSubexons[i].chrId | |
| 919 || ( exons[k].chrId == rawSubexons[i].chrId && exons[k].start < rawSubexons[i].start ) ) ) | |
| 920 ++k ; | |
| 921 | |
| 922 for ( j = i + 1 ; j < seCnt ; ++j ) | |
| 923 { | |
| 924 if ( rawSubexons[j - 1].chrId != rawSubexons[j].chrId || rawSubexons[j - 1].rightType != 0 || rawSubexons[j].leftType != 0 | |
| 925 || ( fileCnt > 1 && rawSubexons[j - 1].end + 1 != rawSubexons[j].start ) | |
| 926 || ( fileCnt == 1 && rawSubexons[j - 1].end + 50 < rawSubexons[j].start ) ) | |
| 927 break ; | |
| 928 } | |
| 929 // rawsubexons[i...j-1] will be merged. | |
| 930 | |
| 931 /*if ( rawSubexons[i].start == 119625875 ) | |
| 932 { | |
| 933 printf( "merge j-1: %d %d %d %d\n", rawSubexons[j - 1].end, rawSubexons[j - 1].rightType, | |
| 934 rawSubexons[j].start, rawSubexons[j].leftType ) ; | |
| 935 }*/ | |
| 936 bool merge = true ; | |
| 937 if ( rawSubexons[i].leftType == 1 && rawSubexons[j - 1].rightType == 2 && j - i > 1 | |
| 938 && rawSubexons[j - 1].end - rawSubexons[i].start >= 1000 ) | |
| 939 { | |
| 940 merge = false ; | |
| 941 int sampleSupport = 0 ; | |
| 942 for ( int l = k ; l < exonCnt ; ++l ) | |
| 943 { | |
| 944 if ( exons[l].chrId != rawSubexons[i].chrId || exons[l].start > rawSubexons[i].start ) | |
| 945 break ; | |
| 946 if ( exons[l].end == rawSubexons[j - 1].end ) | |
| 947 { | |
| 948 merge = true ; | |
| 949 sampleSupport = exons[l].sampleSupport ; | |
| 950 break ; | |
| 951 } | |
| 952 } | |
| 953 | |
| 954 if ( merge == true && rawSubexons[j - 1].end - rawSubexons[i].start >= 1000 ) | |
| 955 { | |
| 956 if ( sampleSupport <= 0.2 * fileCnt ) | |
| 957 { | |
| 958 merge = false ; | |
| 959 } | |
| 960 } | |
| 961 | |
| 962 if ( merge == false ) | |
| 963 { | |
| 964 if ( j - i >= 3 ) | |
| 965 { | |
| 966 rawSubexons[i].end = rawSubexons[ (i + j - 1) / 2 ].start ; | |
| 967 rawSubexons[j - 1].start = rawSubexons[ (i + j - 1) / 2].end ; | |
| 968 } | |
| 969 | |
| 970 if ( rawSubexons[i].end + 1 == rawSubexons[j - 1].start ) | |
| 971 { | |
| 972 --rawSubexons[i].end ; | |
| 973 ++rawSubexons[j - 1].start ; | |
| 974 } | |
| 975 subexons.push_back( rawSubexons[i] ) ; | |
| 976 subexons.push_back( rawSubexons[j - 1] ) ; | |
| 977 } | |
| 978 } | |
| 979 | |
| 980 if ( merge ) | |
| 981 { | |
| 982 rawSubexons[i].end = rawSubexons[j - 1].end ; | |
| 983 rawSubexons[i].rightType = rawSubexons[j - 1].rightType ; | |
| 984 rawSubexons[i].rightStrand = rawSubexons[j - 1].rightStrand ; | |
| 985 | |
| 986 if ( rawSubexons[i].leftType == 0 && rawSubexons[i].rightType != 0 ) | |
| 987 { | |
| 988 rawSubexons[i].start = rawSubexons[ ( i + j - 1 ) / 2 ].start ; | |
| 989 } | |
| 990 else if ( rawSubexons[i].rightType == 0 && rawSubexons[i].leftType != 0 ) | |
| 991 { | |
| 992 rawSubexons[i].end = rawSubexons[ ( i + j - 1 ) / 2 ].end ; | |
| 993 } | |
| 994 | |
| 995 subexons.push_back( rawSubexons[i] ) ; | |
| 996 } | |
| 997 | |
| 998 i = j ; | |
| 999 } | |
| 1000 exons.clear() ; | |
| 1001 std::vector<struct _interval>().swap( exons ) ; | |
| 1002 | |
| 1003 // Remove overhang, ir subexons intron created after putting multiple sample to gether. | |
| 1004 // eg: s0: [......) | |
| 1005 // s1: [...]--------[....] | |
| 1006 // s2: [...]..)-----[....] | |
| 1007 // Though the overhang from s2 is filtered in readin, there will a new overhang created combining s0,s1. | |
| 1008 // But be careful about how to compute the classifier for the overhang part contributed from s0. | |
| 1009 // Furthermore, note that the case of single-exon island showed up in intron retention region after combining is not possible when get here. | |
| 1010 // eg: s0:[...]-----[...] | |
| 1011 // s1: (.) | |
| 1012 // s2:[.............] | |
| 1013 // After merge adjacent soft boundaries, the single-exon island will disappear. | |
| 1014 rawSubexons = subexons ; | |
| 1015 seCnt = subexons.size() ; | |
| 1016 subexons.clear() ; | |
| 1017 for ( i = 0 ; i < seCnt ; ++i ) | |
| 1018 { | |
| 1019 if ( ( rawSubexons[i].leftType == 2 && rawSubexons[i].rightType == 1 ) // ir | |
| 1020 || ( rawSubexons[i].leftType == 2 && rawSubexons[i].rightType == 0 ) // overhang | |
| 1021 || ( rawSubexons[i].leftType == 0 && rawSubexons[i].rightType == 1 ) ) | |
| 1022 continue ; | |
| 1023 subexons.push_back( rawSubexons[i] ) ; | |
| 1024 } | |
| 1025 | |
| 1026 // Remove the single-exon island if it overlaps with intron retentioned or overhang. | |
| 1027 rawSubexons = subexons ; | |
| 1028 seCnt = subexons.size() ; | |
| 1029 subexons.clear() ; | |
| 1030 k = 0 ; | |
| 1031 std::sort( intervalIrOverhang.begin(), intervalIrOverhang.end(), CompInterval ) ; | |
| 1032 int irOverhangCnt = intervalIrOverhang.size() ; | |
| 1033 | |
| 1034 for ( i = 0 ; i < seCnt ; ++i ) | |
| 1035 { | |
| 1036 if ( rawSubexons[i].leftType != 0 || rawSubexons[i].rightType != 0 ) | |
| 1037 { | |
| 1038 subexons.push_back( rawSubexons[i] ) ; | |
| 1039 continue ; | |
| 1040 } | |
| 1041 | |
| 1042 while ( k < irOverhangCnt ) | |
| 1043 { | |
| 1044 // Locate the interval that before the island | |
| 1045 if ( intervalIrOverhang[k].chrId < rawSubexons[i].chrId | |
| 1046 || ( intervalIrOverhang[k].chrId == rawSubexons[i].chrId && intervalIrOverhang[k].end < rawSubexons[i].start ) ) | |
| 1047 { | |
| 1048 ++k ; | |
| 1049 continue ; | |
| 1050 } | |
| 1051 break ; | |
| 1052 } | |
| 1053 bool overlap = false ; | |
| 1054 for ( j = k ; j < irOverhangCnt ; ++j ) | |
| 1055 { | |
| 1056 if ( intervalIrOverhang[j].chrId > rawSubexons[i].chrId || intervalIrOverhang[j].start > rawSubexons[i].end ) | |
| 1057 break ; | |
| 1058 if ( ( intervalIrOverhang[j].start <= rawSubexons[i].start && intervalIrOverhang[j].end >= rawSubexons[i].start ) | |
| 1059 || ( intervalIrOverhang[j].start <= rawSubexons[i].end && intervalIrOverhang[j].end >= rawSubexons[i].end ) ) | |
| 1060 { | |
| 1061 overlap = true ; | |
| 1062 break ; | |
| 1063 } | |
| 1064 } | |
| 1065 | |
| 1066 if ( !overlap ) | |
| 1067 subexons.push_back( rawSubexons[i] ) ; | |
| 1068 } | |
| 1069 rawSubexons.clear() ; | |
| 1070 std::vector<struct _subexon>().swap( rawSubexons ) ; | |
| 1071 | |
| 1072 intervalIrOverhang.clear() ; | |
| 1073 std::vector<struct _interval>().swap( intervalIrOverhang ) ; | |
| 1074 | |
| 1075 // Create the dummy intervals. | |
| 1076 seCnt = subexons.size() ; | |
| 1077 std::vector<struct _intronicInfo> intronicInfos ; | |
| 1078 std::vector<struct _seInterval> seIntervals ; | |
| 1079 std::vector<struct _subexonSupplement> subexonInfo ; | |
| 1080 | |
| 1081 //subexonInfo.resize( seCnt ) ; | |
| 1082 for ( i = 0 ; i < seCnt ; ++i ) | |
| 1083 { | |
| 1084 struct _seInterval ni ; // new interval | |
| 1085 ni.start = subexons[i].start ; | |
| 1086 ni.end = subexons[i].end ; | |
| 1087 ni.type = 0 ; | |
| 1088 ni.idx = i ; | |
| 1089 ni.chrId = subexons[i].chrId ; | |
| 1090 seIntervals.push_back( ni ) ; | |
| 1091 | |
| 1092 /*if ( subexons[i].end == 42671717 ) | |
| 1093 { | |
| 1094 printf( "%d: %d-%d: %d\n", subexons[i].chrId, subexons[i].start, subexons[i].end, subexons[i].rightType ) ; | |
| 1095 }*/ | |
| 1096 //subexonInfo[i].prevSupport = subexonInfo[i].nextSupport = NULL ; | |
| 1097 | |
| 1098 /*int nexti ; | |
| 1099 for ( nexti = i + 1 ; nexti < seCnt ; ++nexti ) | |
| 1100 if ( subexons[ nexti ].leftType == 0 && subexons[nexti].rightType == 0 )*/ | |
| 1101 | |
| 1102 if ( i < seCnt - 1 && subexons[i].chrId == subexons[i + 1].chrId && | |
| 1103 subexons[i].end + 1 < subexons[i + 1].start && | |
| 1104 subexons[i].rightType + subexons[i + 1].leftType != 0 ) | |
| 1105 { | |
| 1106 // Only consider the intervals like ]..[,]...(, )...[ | |
| 1107 // The case like ]...] is actaully things like ][...] in subexon perspective, | |
| 1108 // so they won't pass the if-statement | |
| 1109 struct _intronicInfo nii ; // new intronic info | |
| 1110 ni.start = subexons[i].end + 1 ; | |
| 1111 ni.end = subexons[i + 1].start - 1 ; | |
| 1112 ni.type = 1 ; | |
| 1113 ni.idx = intronicInfos.size() ; | |
| 1114 seIntervals.push_back( ni ) ; | |
| 1115 | |
| 1116 nii.chrId = subexons[i].chrId ; | |
| 1117 nii.start = ni.start ; | |
| 1118 nii.end = ni.end ; | |
| 1119 nii.leftSubexonIdx = i ; | |
| 1120 nii.rightSubexonIdx = i + 1 ; | |
| 1121 nii.irClassifier = 0 ; | |
| 1122 nii.irCnt = 0 ; | |
| 1123 nii.validIrCnt = 0 ; | |
| 1124 nii.leftOverhang.cnt = 0 ; | |
| 1125 nii.leftOverhang.validCnt = 0 ; | |
| 1126 nii.leftOverhang.length = 0 ; | |
| 1127 nii.leftOverhang.classifier = 0 ; | |
| 1128 nii.rightOverhang.cnt = 0 ; | |
| 1129 nii.rightOverhang.validCnt = 0 ; | |
| 1130 nii.rightOverhang.length = 0 ; | |
| 1131 nii.rightOverhang.classifier = 0 ; | |
| 1132 intronicInfos.push_back( nii ) ; | |
| 1133 /*if ( nii.end == 23667 ) | |
| 1134 { | |
| 1135 printf( "%d %d. %d (%d %d %d)\n", nii.start, nii.end, subexons[i].rightType, subexons[i+1].start, subexons[i + 1].end, subexons[i + 1].leftType ) ; | |
| 1136 }*/ | |
| 1137 } | |
| 1138 } | |
| 1139 | |
| 1140 // Go through all the files to get some statistics number | |
| 1141 double avgIrPiRatio = 0 ; | |
| 1142 double avgIrPiCov = 0 ; | |
| 1143 double irPiRatio, irKRatio[2], irThetaRatio[2] ; // Some statistical results | |
| 1144 double irPiCov, irKCov[2], irThetaCov[2] ; | |
| 1145 | |
| 1146 double avgOverhangPiRatio = 0 ; | |
| 1147 double avgOverhangPiCov = 0 ; | |
| 1148 double overhangPiRatio, overhangKRatio[2], overhangThetaRatio[2] ; // Some statistical results | |
| 1149 double overhangPiCov, overhangKCov[2], overhangThetaCov[2] ; | |
| 1150 | |
| 1151 for ( k = 0 ; k < fileCnt ; ++k ) | |
| 1152 { | |
| 1153 fp = fopen( files[k], "r" ) ; | |
| 1154 | |
| 1155 while ( fgets( buffer, sizeof( buffer), fp ) != NULL ) | |
| 1156 { | |
| 1157 if ( buffer[0] == '#' ) | |
| 1158 { | |
| 1159 char buffer2[100] ; | |
| 1160 sscanf( buffer, "%s", buffer2 ) ; | |
| 1161 if ( !strcmp( buffer2, "#fitted_ir_parameter_ratio:" ) ) | |
| 1162 { | |
| 1163 // TODO: ignore certain samples if the coverage seems wrong. | |
| 1164 sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", | |
| 1165 buffer2, buffer2, &irPiRatio, buffer2, &irKRatio[0], buffer2, &irThetaRatio[0], | |
| 1166 buffer2, &irKRatio[1], buffer2, &irThetaRatio[1] ) ; | |
| 1167 avgIrPiRatio += irPiRatio ; | |
| 1168 } | |
| 1169 else if ( !strcmp( buffer2, "#fitted_ir_parameter_cov:" ) ) | |
| 1170 { | |
| 1171 } | |
| 1172 else if ( !strcmp( buffer2, "#fitted_overhang_parameter_ratio:" ) ) | |
| 1173 { | |
| 1174 sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", | |
| 1175 buffer2, buffer2, &overhangPiRatio, buffer2, &overhangKRatio[0], buffer2, &overhangThetaRatio[0], | |
| 1176 buffer2, &overhangKRatio[1], buffer2, &overhangThetaRatio[1] ) ; | |
| 1177 avgOverhangPiRatio += overhangPiRatio ; | |
| 1178 } | |
| 1179 } | |
| 1180 else | |
| 1181 break ; | |
| 1182 } | |
| 1183 fclose( fp ) ; | |
| 1184 } | |
| 1185 avgIrPiRatio /= fileCnt ; | |
| 1186 avgOverhangPiRatio /= fileCnt ; | |
| 1187 | |
| 1188 // Go through all the files to put statistical results into each subexon. | |
| 1189 std::vector< struct _subexon > sampleSubexons ; | |
| 1190 int subexonCnt = subexons.size() ; | |
| 1191 for ( k = 0 ; k < fileCnt ; ++k ) | |
| 1192 { | |
| 1193 //if ( k == 220 ) | |
| 1194 // exit( 1 ) ; | |
| 1195 fp = fopen( files[k], "r" ) ; | |
| 1196 struct _subexon se ; | |
| 1197 struct _subexonSplit sp ; | |
| 1198 char chrName[50] ; | |
| 1199 | |
| 1200 sampleSubexons.clear() ; | |
| 1201 | |
| 1202 int tag = 0 ; | |
| 1203 while ( fgets( buffer, sizeof( buffer), fp ) != NULL ) | |
| 1204 { | |
| 1205 if ( buffer[0] == '#' ) | |
| 1206 { | |
| 1207 char buffer2[200] ; | |
| 1208 sscanf( buffer, "%s", buffer2 ) ; | |
| 1209 if ( !strcmp( buffer2, "#fitted_ir_parameter_ratio:" ) ) | |
| 1210 { | |
| 1211 sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", | |
| 1212 buffer2, buffer2, &irPiRatio, buffer2, &irKRatio[0], buffer2, &irThetaRatio[0], | |
| 1213 buffer2, &irKRatio[1], buffer2, &irThetaRatio[1] ) ; | |
| 1214 } | |
| 1215 else if ( !strcmp( buffer2, "#fitted_ir_parameter_cov:" ) ) | |
| 1216 { | |
| 1217 sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", | |
| 1218 buffer2, buffer2, &irPiCov, buffer2, &irKCov[0], buffer2, &irThetaCov[0], | |
| 1219 buffer2, &irKCov[1], buffer2, &irThetaCov[1] ) ; | |
| 1220 } | |
| 1221 else if ( !strcmp( buffer2, "#fitted_overhang_parameter_ratio:" ) ) | |
| 1222 { | |
| 1223 sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", | |
| 1224 buffer2, buffer2, &overhangPiRatio, buffer2, &overhangKRatio[0], buffer2, &overhangThetaRatio[0], | |
| 1225 buffer2, &overhangKRatio[1], buffer2, &overhangThetaRatio[1] ) ; | |
| 1226 } | |
| 1227 else if ( !strcmp( buffer2, "#fitted_overhang_parameter_cov:" ) ) | |
| 1228 { | |
| 1229 sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf", | |
| 1230 buffer2, buffer2, &overhangPiCov, buffer2, &overhangKCov[0], buffer2, &overhangThetaCov[0], | |
| 1231 buffer2, &overhangKCov[1], buffer2, &overhangThetaCov[1] ) ; | |
| 1232 } | |
| 1233 continue ; | |
| 1234 } | |
| 1235 else | |
| 1236 break ; | |
| 1237 | |
| 1238 //SubexonGraph::InputSubexon( buffer, alignments, se, true ) ; | |
| 1239 //sampleSubexons.push_back( se ) ; | |
| 1240 } | |
| 1241 | |
| 1242 //int sampleSubexonCnt = sampleSubexons.size() ; | |
| 1243 int intervalCnt = seIntervals.size() ; | |
| 1244 //for ( i = 0 ; i < sampleSubexonCnt ; ++i ) | |
| 1245 int iterCnt = 0 ; | |
| 1246 while ( 1 ) | |
| 1247 { | |
| 1248 if ( iterCnt > 0 && fgets( buffer, sizeof( buffer), fp ) == NULL) | |
| 1249 break ; | |
| 1250 ++iterCnt ; | |
| 1251 | |
| 1252 struct _subexon se ; | |
| 1253 SubexonGraph::InputSubexon( buffer, alignments, se, true ) ; | |
| 1254 | |
| 1255 while ( tag < intervalCnt ) | |
| 1256 { | |
| 1257 if ( seIntervals[tag].chrId < se.chrId || | |
| 1258 ( seIntervals[tag].chrId == se.chrId && seIntervals[tag].end < se.start ) ) | |
| 1259 { | |
| 1260 ++tag ; | |
| 1261 continue ; | |
| 1262 } | |
| 1263 else | |
| 1264 break ; | |
| 1265 } | |
| 1266 | |
| 1267 for ( j = tag ; j < intervalCnt ; ++j ) | |
| 1268 { | |
| 1269 if ( seIntervals[j].start > se.end || seIntervals[j].chrId > se.chrId ) // terminate if no overlap. | |
| 1270 break ; | |
| 1271 int idx ; | |
| 1272 | |
| 1273 if ( seIntervals[j].type == 0 ) | |
| 1274 { | |
| 1275 idx = seIntervals[j].idx ; | |
| 1276 if ( subexons[idx].leftType == 1 && se.leftType == 1 && subexons[idx].start == se.start ) | |
| 1277 { | |
| 1278 double tmp = se.leftClassifier ; | |
| 1279 if ( se.leftClassifier == 0 ) | |
| 1280 tmp = 1e-7 ; | |
| 1281 subexons[idx].leftClassifier -= 2.0 * log( tmp ) ; | |
| 1282 ++subexons[idx].lcCnt ; | |
| 1283 subexons[idx].prev = MergePositions( subexons[idx].prev, subexons[idx].prevCnt, | |
| 1284 se.prev, se.prevCnt, subexons[idx].prevCnt ) ; | |
| 1285 | |
| 1286 if ( se.rightType == 0 ) // a gene end here | |
| 1287 { | |
| 1288 for ( int l = idx ; l < subexonCnt ; ++l ) | |
| 1289 { | |
| 1290 if ( l > idx && ( subexons[l].end > subexons[l - 1].start + 1 | |
| 1291 || subexons[l].chrId != subexons[l - 1].chrId ) ) | |
| 1292 break ; | |
| 1293 if ( subexons[l].rightType == 2 ) | |
| 1294 { | |
| 1295 double adjustAvgDepth = se.avgDepth ; | |
| 1296 if ( se.end - se.start + 1 >= 100 ) | |
| 1297 adjustAvgDepth += se.avgDepth * 100.0 / ( se.end - se.start + 1 ) ; | |
| 1298 else | |
| 1299 adjustAvgDepth *= 2 ; | |
| 1300 double p = GetPValueOfGeneEnd( adjustAvgDepth ) ; | |
| 1301 //if ( se.end - se.start + 1 >= 500 && p > 0.001 ) | |
| 1302 // p = 0.001 ; | |
| 1303 | |
| 1304 subexons[l].rightClassifier -= 2.0 * log( p ) ; | |
| 1305 ++subexons[l].rcCnt ; | |
| 1306 break ; | |
| 1307 } | |
| 1308 } | |
| 1309 } | |
| 1310 } | |
| 1311 //if ( se.chrId == 25 ) | |
| 1312 // printf( "(%d %d %d. %d) (%d %d %d. %d)\n", se.chrId, se.start, se.end, se.rightType, subexons[idx].chrId, subexons[idx].start, subexons[idx].end, subexons[idx].rightType ) ; | |
| 1313 if ( subexons[idx].rightType == 2 && se.rightType == 2 && subexons[idx].end == se.end ) | |
| 1314 { | |
| 1315 double tmp = se.rightClassifier ; | |
| 1316 if ( se.rightClassifier == 0 ) | |
| 1317 tmp = 1e-7 ; | |
| 1318 subexons[idx].rightClassifier -= 2.0 * log( tmp ) ; | |
| 1319 ++subexons[idx].rcCnt ; | |
| 1320 | |
| 1321 subexons[idx].next = MergePositions( subexons[idx].next, subexons[idx].nextCnt, | |
| 1322 se.next, se.nextCnt, subexons[idx].nextCnt ) ; | |
| 1323 | |
| 1324 if ( se.leftType == 0 ) | |
| 1325 { | |
| 1326 for ( int l = idx ; l >= 0 ; --l ) | |
| 1327 { | |
| 1328 if ( l < idx && ( subexons[l].end < subexons[l + 1].start - 1 | |
| 1329 || subexons[l].chrId != subexons[l + 1].chrId ) ) | |
| 1330 break ; | |
| 1331 if ( subexons[l].leftType == 1 ) | |
| 1332 { | |
| 1333 double adjustAvgDepth = se.avgDepth ; | |
| 1334 if ( se.end - se.start + 1 >= 100 ) | |
| 1335 adjustAvgDepth += se.avgDepth * 100.0 / ( se.end - se.start + 1 ) ; | |
| 1336 else | |
| 1337 adjustAvgDepth *= 2 ; | |
| 1338 double p = GetPValueOfGeneEnd( adjustAvgDepth ) ; | |
| 1339 //if ( se.end - se.start + 1 >= 500 && p >= 0.001 ) | |
| 1340 // p = 0.001 ; | |
| 1341 subexons[l].leftClassifier -= 2.0 * log( p ) ; | |
| 1342 ++subexons[l].lcCnt ; | |
| 1343 break ; | |
| 1344 } | |
| 1345 } | |
| 1346 } | |
| 1347 } | |
| 1348 | |
| 1349 if ( subexons[idx].leftType == 0 && subexons[idx].rightType == 0 | |
| 1350 && se.leftType == 0 && se.rightType == 0 ) // the single-exon island. | |
| 1351 { | |
| 1352 double tmp = se.leftClassifier ; | |
| 1353 if ( se.leftClassifier == 0 ) | |
| 1354 tmp = 1e-7 ; | |
| 1355 subexons[idx].leftClassifier -= 2.0 * log( tmp ) ; | |
| 1356 subexons[idx].rightClassifier = subexons[idx].leftClassifier ; | |
| 1357 ++subexons[idx].lcCnt ; | |
| 1358 ++subexons[idx].rcCnt ; | |
| 1359 } | |
| 1360 } | |
| 1361 else if ( seIntervals[j].type == 1 ) | |
| 1362 { | |
| 1363 idx = seIntervals[j].idx ; | |
| 1364 // Overlap on the left part of intron | |
| 1365 if ( se.start <= intronicInfos[idx].start && se.end < intronicInfos[idx].end | |
| 1366 && subexons[ intronicInfos[idx].leftSubexonIdx ].rightType != 0 ) | |
| 1367 { | |
| 1368 int len = se.end - intronicInfos[idx].start + 1 ; | |
| 1369 intronicInfos[idx].leftOverhang.length += len ; | |
| 1370 ++intronicInfos[idx].leftOverhang.cnt ; | |
| 1371 | |
| 1372 // Note that the sample subexon must have a soft boundary at right hand side, | |
| 1373 // otherwise, this part is not an intron and won't show up in intronic Info. | |
| 1374 if ( se.leftType == 2 ) | |
| 1375 { | |
| 1376 if ( se.leftRatio > 0 && se.avgDepth > 1 ) | |
| 1377 { | |
| 1378 ++intronicInfos[idx].leftOverhang.validCnt ; | |
| 1379 | |
| 1380 double update = GetUpdateMixtureGammaClassifier( se.leftRatio, se.avgDepth, | |
| 1381 overhangPiRatio, overhangKRatio, overhangThetaRatio, | |
| 1382 overhangPiCov, overhangKCov, overhangThetaCov, false ) ; | |
| 1383 intronicInfos[idx].leftOverhang.classifier += update ; | |
| 1384 } | |
| 1385 } | |
| 1386 else if ( se.leftType == 1 ) | |
| 1387 { | |
| 1388 ++intronicInfos[idx].leftOverhang.validCnt ; | |
| 1389 double update = GetUpdateMixtureGammaClassifier( 1.0, se.avgDepth, | |
| 1390 overhangPiRatio, overhangKRatio, overhangThetaRatio, | |
| 1391 overhangPiCov, overhangKCov, overhangThetaCov, true ) ; | |
| 1392 intronicInfos[idx].leftOverhang.classifier += update ; | |
| 1393 | |
| 1394 int seIdx = intronicInfos[idx].leftSubexonIdx ; | |
| 1395 subexons[seIdx].rightClassifier -= 2.0 * log( GetPValueOfGeneEnd( se.avgDepth ) ) ; | |
| 1396 ++subexons[ seIdx ].rcCnt ; | |
| 1397 } | |
| 1398 // ignore the contribution of single-exon island here? | |
| 1399 } | |
| 1400 // Overlap on the right part of intron | |
| 1401 else if ( se.start > intronicInfos[idx].start && se.end >= intronicInfos[idx].end | |
| 1402 && subexons[ intronicInfos[idx].rightSubexonIdx ].leftType != 0 ) | |
| 1403 { | |
| 1404 int len = intronicInfos[idx].end - se.start + 1 ; | |
| 1405 intronicInfos[idx].rightOverhang.length += len ; | |
| 1406 ++intronicInfos[idx].rightOverhang.cnt ; | |
| 1407 | |
| 1408 // Note that the sample subexon must have a soft boundary at left hand side, | |
| 1409 // otherwise, this won't show up in intronic Info | |
| 1410 if ( se.rightType == 1 ) | |
| 1411 { | |
| 1412 if ( se.rightRatio > 0 && se.avgDepth > 1 ) | |
| 1413 { | |
| 1414 ++intronicInfos[idx].rightOverhang.validCnt ; | |
| 1415 | |
| 1416 double update = GetUpdateMixtureGammaClassifier( se.rightRatio, se.avgDepth, | |
| 1417 overhangPiRatio, overhangKRatio, overhangThetaRatio, | |
| 1418 overhangPiCov, overhangKCov, overhangThetaCov, false ) ; | |
| 1419 intronicInfos[idx].rightOverhang.classifier += update ; | |
| 1420 } | |
| 1421 } | |
| 1422 else if ( se.rightType == 2 ) | |
| 1423 { | |
| 1424 ++intronicInfos[idx].rightOverhang.validCnt ; | |
| 1425 | |
| 1426 double update = GetUpdateMixtureGammaClassifier( 1, se.avgDepth, | |
| 1427 overhangPiRatio, overhangKRatio, overhangThetaRatio, | |
| 1428 overhangPiCov, overhangKCov, overhangThetaCov, true ) ; | |
| 1429 intronicInfos[idx].rightOverhang.classifier += update ; | |
| 1430 | |
| 1431 int seIdx = intronicInfos[idx].rightSubexonIdx ; | |
| 1432 /*if ( subexons[ seIdx ].start == 6873648 ) | |
| 1433 { | |
| 1434 printf( "%lf %lf: %lf %lf %lf\n", subexons[seIdx].leftClassifier, GetPValueOfGeneEnd( se.avgDepth ), se.avgDepth, sqrt( se.avgDepth ), log( se.avgDepth ) ) ; | |
| 1435 }*/ | |
| 1436 subexons[seIdx].leftClassifier -= 2.0 * log( GetPValueOfGeneEnd( se.avgDepth ) ) ; | |
| 1437 ++subexons[ seIdx ].lcCnt ; | |
| 1438 } | |
| 1439 } | |
| 1440 // Intron is fully contained in this sample subexon, then it is a ir candidate | |
| 1441 else if ( se.start <= intronicInfos[idx].start && se.end >= intronicInfos[idx].end ) | |
| 1442 { | |
| 1443 if ( se.leftType == 2 && se.rightType == 1 ) | |
| 1444 { | |
| 1445 double ratio = regions.PickLeftAndRightRatio( se.leftRatio, se.rightRatio ) ; | |
| 1446 ++intronicInfos[idx].irCnt ; | |
| 1447 if ( ratio > 0 && se.avgDepth > 1 ) | |
| 1448 { | |
| 1449 double update = GetUpdateMixtureGammaClassifier( ratio, se.avgDepth, | |
| 1450 irPiRatio, irKRatio, irThetaRatio, | |
| 1451 irPiCov, irKCov, irThetaCov, true ) ; | |
| 1452 //if ( intronicInfos[idx].start == 37617368 ) | |
| 1453 // printf( "hi %lf %d %d: %d %d\n", update, se.start, se.end, intronicInfos[idx].start, intronicInfos[idx].end ) ; | |
| 1454 intronicInfos[idx].irClassifier += update ; | |
| 1455 ++intronicInfos[idx].validIrCnt ; | |
| 1456 } | |
| 1457 } | |
| 1458 else if ( se.leftType == 1 || se.rightType == 2 ) | |
| 1459 { | |
| 1460 //intronicInfos[idx].irClassifier += LogGammaDensity( 4.0, irKRatio[1], irThetaRatio[1] ) | |
| 1461 // - LogGammaDensity( 4.0, irKRatio[0], irThetaRatio[0] ) ; | |
| 1462 /*if ( se.start == 37617368 ) | |
| 1463 { | |
| 1464 printf( "%lf: %lf %lf\n", se.avgDepth, MixtureGammaAssignment( ( irKCov[0] - 1 ) * irThetaCov[0], irPiRatio, irKCov, irThetaCov ), | |
| 1465 MixtureGammaAssignment( TransformCov( 4.0 ), irPiRatio, irKCov, irThetaCov ) ) ; | |
| 1466 }*/ | |
| 1467 if ( se.avgDepth > 1 ) | |
| 1468 { | |
| 1469 // let the depth be the threshold to determine. | |
| 1470 double update = GetUpdateMixtureGammaClassifier( 4.0, se.avgDepth, | |
| 1471 irPiRatio, irKRatio, irThetaRatio, | |
| 1472 irPiCov, irKCov, irThetaCov, true ) ; | |
| 1473 //if ( intronicInfos[idx].start == 36266630 ) | |
| 1474 // printf( "hi %lf %d %d: %d %d\n", update, se.start, se.end, intronicInfos[idx].start, intronicInfos[idx].end ) ; | |
| 1475 intronicInfos[idx].irClassifier += update ; | |
| 1476 ++intronicInfos[idx].irCnt ; | |
| 1477 ++intronicInfos[idx].validIrCnt ; | |
| 1478 } | |
| 1479 } | |
| 1480 else | |
| 1481 { | |
| 1482 // the intron is contained in a overhang subexon from the sample or single-exon island | |
| 1483 } | |
| 1484 } | |
| 1485 // sample subexon is contained in the intron. | |
| 1486 else | |
| 1487 { | |
| 1488 // Do nothing. | |
| 1489 } | |
| 1490 } | |
| 1491 } | |
| 1492 | |
| 1493 //if ( se.nextCnt > 0 ) | |
| 1494 delete[] se.next ; | |
| 1495 //if ( se.prevCnt > 0 ) | |
| 1496 delete[] se.prev ; | |
| 1497 } | |
| 1498 fclose( fp ) ; | |
| 1499 | |
| 1500 /*for ( i = 0 ; i < sampleSubexonCnt ; ++i ) | |
| 1501 { | |
| 1502 if ( sampleSubexons[i].nextCnt > 0 ) | |
| 1503 delete[] sampleSubexons[i].next ; | |
| 1504 if ( sampleSubexons[i].prevCnt > 0 ) | |
| 1505 delete[] sampleSubexons[i].prev ; | |
| 1506 }*/ | |
| 1507 } | |
| 1508 | |
| 1509 CleanUpSubexonConnections( subexons ) ; | |
| 1510 | |
| 1511 // Convert the temporary statistics number into formal statistics result. | |
| 1512 for ( i = 0 ; i < subexonCnt ; ++i ) | |
| 1513 { | |
| 1514 struct _subexon &se = subexons[i] ; | |
| 1515 if ( se.leftType == 0 && se.rightType == 0 ) // single-exon txpt. | |
| 1516 { | |
| 1517 se.leftClassifier = se.rightClassifier = 1 - chicdf( se.rightClassifier, 2 * se.rcCnt ) ; | |
| 1518 } | |
| 1519 else | |
| 1520 { | |
| 1521 if ( se.leftType == 1 ) | |
| 1522 { | |
| 1523 se.leftClassifier = 1 - chicdf( se.leftClassifier, 2 * se.lcCnt ) ; | |
| 1524 } | |
| 1525 else | |
| 1526 se.leftClassifier = -1 ; | |
| 1527 | |
| 1528 if ( se.rightType == 2 ) | |
| 1529 se.rightClassifier = 1 - chicdf( se.rightClassifier, 2 * se.rcCnt ) ; | |
| 1530 else | |
| 1531 se.rightClassifier = -1 ; | |
| 1532 } | |
| 1533 } | |
| 1534 | |
| 1535 int iiCnt = intronicInfos.size() ; //intronicInfo count | |
| 1536 for ( i = 0 ; i < iiCnt ; ++i ) | |
| 1537 { | |
| 1538 struct _intronicInfo &ii = intronicInfos[i] ; | |
| 1539 if ( ii.validIrCnt > 0 ) | |
| 1540 { | |
| 1541 for ( j = 0 ; j < fileCnt - ii.validIrCnt ; ++j ) | |
| 1542 { | |
| 1543 ii.irClassifier -= log( 10.0 ) ; | |
| 1544 } | |
| 1545 /*if ( ii.validIrCnt < fileCnt * 0.15 ) | |
| 1546 ii.irClassifier -= log( 1000.0 ) ; | |
| 1547 else if ( ii.validIrCnt < fileCnt * 0.5 ) | |
| 1548 ii.irClassifier -= log( 100.0 ) ;*/ | |
| 1549 ii.irClassifier = (double)1.0 / ( 1.0 + exp( ii.irClassifier + log( 1 - avgIrPiRatio ) - log( avgIrPiRatio ) ) ) ; | |
| 1550 } | |
| 1551 else | |
| 1552 ii.irClassifier = -1 ; | |
| 1553 | |
| 1554 if ( ii.leftOverhang.validCnt > 0 ) | |
| 1555 ii.leftOverhang.classifier = (double)1.0 / ( 1.0 + exp( ii.leftOverhang.classifier + | |
| 1556 log( 1 - avgOverhangPiRatio ) - log( avgOverhangPiRatio ) ) ) ; | |
| 1557 else | |
| 1558 ii.leftOverhang.classifier = -1 ; | |
| 1559 | |
| 1560 if ( ii.rightOverhang.validCnt > 0 ) | |
| 1561 ii.rightOverhang.classifier = (double)1.0 / ( 1.0 + exp( ii.rightOverhang.classifier + | |
| 1562 log( 1 - avgOverhangPiRatio ) - log( avgOverhangPiRatio ) ) ) ; | |
| 1563 else | |
| 1564 ii.rightOverhang.classifier = -1 ; | |
| 1565 } | |
| 1566 | |
| 1567 // Change the classifier for the hard boundaries if its adjacent intron has intron retention classifier | |
| 1568 // which collide with overhang subexon. | |
| 1569 int intervalCnt = seIntervals.size() ; | |
| 1570 for ( i = 0 ; i < intervalCnt ; ++i ) | |
| 1571 { | |
| 1572 if ( seIntervals[i].type == 1 && intronicInfos[ seIntervals[i].idx ].irCnt > 0 ) | |
| 1573 { | |
| 1574 int idx = seIntervals[i].idx ; | |
| 1575 if ( intronicInfos[idx].leftOverhang.cnt > 0 ) | |
| 1576 { | |
| 1577 int k = seIntervals[i - 1].idx ; | |
| 1578 // Should aim for more conservative? | |
| 1579 if ( subexons[k].rightClassifier > intronicInfos[idx].leftOverhang.classifier ) | |
| 1580 subexons[k].rightClassifier = intronicInfos[idx].leftOverhang.classifier ; | |
| 1581 } | |
| 1582 | |
| 1583 if ( intronicInfos[idx].rightOverhang.cnt > 0 ) | |
| 1584 { | |
| 1585 int k = seIntervals[i + 1].idx ; | |
| 1586 if ( subexons[k].leftClassifier > intronicInfos[idx].rightOverhang.classifier ) | |
| 1587 subexons[k].leftClassifier = intronicInfos[idx].rightOverhang.classifier ; | |
| 1588 } | |
| 1589 } | |
| 1590 } | |
| 1591 | |
| 1592 // Output the result. | |
| 1593 for ( i = 0 ; i < intervalCnt ; ++i ) | |
| 1594 { | |
| 1595 if ( seIntervals[i].type == 0 ) | |
| 1596 { | |
| 1597 struct _subexon &se = subexons[ seIntervals[i].idx ] ; | |
| 1598 | |
| 1599 char ls, rs ; | |
| 1600 | |
| 1601 ls = StrandNumToSymbol( se.leftStrand ) ; | |
| 1602 rs = StrandNumToSymbol( se.rightStrand ) ; | |
| 1603 | |
| 1604 printf( "%s %d %d %d %d %c %c -1 -1 -1 %lf %lf ", alignments.GetChromName( se.chrId ), se.start, se.end, | |
| 1605 se.leftType, se.rightType, ls, rs, se.leftClassifier, se.rightClassifier ) ; | |
| 1606 if ( i > 0 && seIntervals[i - 1].chrId == seIntervals[i].chrId | |
| 1607 && seIntervals[i - 1].end + 1 == seIntervals[i].start | |
| 1608 && !( seIntervals[i - 1].type == 0 && | |
| 1609 subexons[ seIntervals[i - 1].idx ].rightType != se.leftType ) | |
| 1610 && !( seIntervals[i - 1].type == 1 && intronicInfos[ seIntervals[i - 1].idx ].irCnt == 0 | |
| 1611 && intronicInfos[ seIntervals[i - 1].idx ].rightOverhang.cnt == 0 ) | |
| 1612 && ( se.prevCnt == 0 || se.start - 1 != se.prev[ se.prevCnt - 1 ] ) ) // The connection showed up in the subexon file. | |
| 1613 { | |
| 1614 printf( "%d ", se.prevCnt + 1 ) ; | |
| 1615 for ( j = 0 ; j < se.prevCnt ; ++j ) | |
| 1616 printf( "%d ", se.prev[j] ) ; | |
| 1617 printf( "%d ", se.start - 1 ) ; | |
| 1618 } | |
| 1619 else | |
| 1620 { | |
| 1621 printf( "%d ", se.prevCnt ) ; | |
| 1622 for ( j = 0 ; j < se.prevCnt ; ++j ) | |
| 1623 printf( "%d ", se.prev[j] ) ; | |
| 1624 } | |
| 1625 | |
| 1626 if ( i < intervalCnt - 1 && seIntervals[i].chrId == seIntervals[i + 1].chrId | |
| 1627 && seIntervals[i].end == seIntervals[i + 1].start - 1 | |
| 1628 && !( seIntervals[i + 1].type == 0 && | |
| 1629 subexons[ seIntervals[i + 1].idx ].leftType != se.rightType ) | |
| 1630 && !( seIntervals[i + 1].type == 1 && intronicInfos[ seIntervals[i + 1].idx ].irCnt == 0 | |
| 1631 && intronicInfos[ seIntervals[i + 1].idx ].leftOverhang.cnt == 0 ) | |
| 1632 && ( se.nextCnt == 0 || se.end + 1 != se.next[0] ) ) | |
| 1633 { | |
| 1634 printf( "%d %d ", se.nextCnt + 1, se.end + 1 ) ; | |
| 1635 } | |
| 1636 else | |
| 1637 printf( "%d ", se.nextCnt ) ; | |
| 1638 for ( j = 0 ; j < se.nextCnt ; ++j ) | |
| 1639 printf( "%d ", se.next[j] ) ; | |
| 1640 printf( "\n" ) ; | |
| 1641 } | |
| 1642 else if ( seIntervals[i].type == 1 ) | |
| 1643 { | |
| 1644 struct _intronicInfo &ii = intronicInfos[ seIntervals[i].idx ] ; | |
| 1645 if ( ii.irCnt > 0 ) | |
| 1646 { | |
| 1647 printf( "%s %d %d 2 1 . . -1 -1 -1 %lf %lf 1 %d 1 %d\n", | |
| 1648 alignments.GetChromName( ii.chrId ), ii.start, ii.end, | |
| 1649 ii.irClassifier, ii.irClassifier, | |
| 1650 seIntervals[i - 1].end, seIntervals[i + 1].start ) ; | |
| 1651 } | |
| 1652 else | |
| 1653 { | |
| 1654 // left overhang. | |
| 1655 if ( ii.leftOverhang.cnt > 0 ) | |
| 1656 { | |
| 1657 printf( "%s %d %d 2 0 . . -1 -1 -1 %lf %lf 1 %d 0\n", | |
| 1658 alignments.GetChromName( ii.chrId ), ii.start, | |
| 1659 ii.start + ( ii.leftOverhang.length / ii.leftOverhang.cnt ) - 1, | |
| 1660 ii.leftOverhang.classifier, ii.leftOverhang.classifier, | |
| 1661 ii.start - 1 ) ; | |
| 1662 } | |
| 1663 | |
| 1664 // right overhang. | |
| 1665 if ( ii.rightOverhang.cnt > 0 ) | |
| 1666 { | |
| 1667 printf( "%s %d %d 0 1 . . -1 -1 -1 %lf %lf 0 1 %d\n", | |
| 1668 alignments.GetChromName( ii.chrId ), | |
| 1669 ii.end - ( ii.rightOverhang.length / ii.rightOverhang.cnt ) + 1, ii.end, | |
| 1670 ii.rightOverhang.classifier, ii.rightOverhang.classifier, | |
| 1671 ii.end + 1 ) ; | |
| 1672 } | |
| 1673 | |
| 1674 } | |
| 1675 } | |
| 1676 } | |
| 1677 | |
| 1678 return 0 ; | |
| 1679 } | |
| 1680 |
