comparison PsiCLASS-1.0.2/CombineSubexons.cpp @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:903fc43d6227
1 #include <stdio.h>
2 #include <string.h>
3
4 #include <algorithm>
5 #include <vector>
6 #include <map>
7 #include <string>
8
9 #include "alignments.hpp"
10 #include "blocks.hpp"
11 #include "stats.hpp"
12 #include "SubexonGraph.hpp"
13
14 char usage[] = "combineSubexons [options]\n"
15 "Required options:\n"
16 "\t-s STRING: the path to the predicted subexon information. Can use multiple -s to specify multiple subexon prediction files\n"
17 "\t\tor\n"
18 "\t--ls STRING: the path to file of the list of the predicted subexon information.\n" ;
19
20 struct _overhang
21 {
22 int cnt ; // the number of samples support this subexon.
23 int validCnt ; // The number of samples that are used for compute probability.
24 int length ;
25 double classifier ;
26 } ;
27
28 struct _intronicInfo
29 {
30 int chrId ;
31 int start, end ;
32 int leftSubexonIdx, rightSubexonIdx ;
33 double irClassifier ;
34 int irCnt ;
35 int validIrCnt ;
36 struct _overhang leftOverhang, rightOverhang ; // leftOverhangClassifier is for the overhang subexon at the left side of this intron.
37 } ;
38
39 struct _seInterval
40 {
41 int chrId ;
42 int start, end ;
43 int type ; // 0-subexon, 1-intronicInfo
44 int idx ;
45 } ;
46
47 struct _subexonSplit
48 {
49 int chrId ;
50 int pos ;
51 int type ; //1-start of a subexon. 2-end of a subexon
52 int splitType ; //0-soft boundary, 1-start of an exon, 2-end of an exon.
53 int strand ;
54
55 int weight ;
56 } ;
57
58 struct _interval // exon or intron
59 {
60 int chrId ;
61 int start, end ;
62 int strand ;
63 int sampleSupport ;
64 } ;
65
66 struct _subexonSupplement // supplement the subexon structure defined in SubexonGraph.
67 {
68 int *nextSupport ;
69 int *prevSupport ;
70 } ;
71
72 char buffer[4096] ;
73
74 bool CompSubexonSplit( struct _subexonSplit a, struct _subexonSplit b )
75 {
76 if ( a.chrId < b.chrId )
77 return true ;
78 else if ( a.chrId > b.chrId )
79 return false ;
80 else if ( a.pos != b.pos )
81 return a.pos < b.pos ;
82 else if ( a.type != b.type )
83 {
84 // the split site with no strand information should come first.
85 /*if ( a.splitType != b.splitType )
86 {
87 if ( a.splitType == 0 )
88 return true ;
89 else if ( b.splitType == 0 )
90 return false ;
91 }*/
92 return a.type < b.type ;
93 }
94 else if ( a.splitType != b.splitType )
95 {
96 //return a.splitType < b.splitType ;
97 if ( a.splitType == 0 )
98 return true ;
99 else if ( b.splitType == 0 )
100 return false ;
101
102 if ( a.type == 1 )
103 return a.splitType > b.splitType ;
104 else
105 return a.splitType < b.splitType ;
106 }
107
108 return false ;
109 }
110
111 bool CompInterval( struct _interval a, struct _interval b )
112 {
113 if ( a.chrId < b.chrId )
114 return true ;
115 else if ( a.chrId > b.chrId )
116 return false ;
117 else if ( a.start != b.start )
118 return a.start < b.start ;
119 else if ( a.end != b.end )
120 return a.end < b.end ;
121 return false ;
122 }
123
124 bool CompSeInterval( struct _seInterval a, struct _seInterval b )
125 {
126 if ( a.chrId < b.chrId )
127 return true ;
128 else if ( a.chrId > b.chrId )
129 return false ;
130 else if ( a.start < b.start )
131 return true ;
132 else if ( a.start > b.start )
133 return false ;
134 else if ( a.end < b.end )
135 return true ;
136 else
137 return false ;
138 }
139
140 // Keep this the same as in SubexonInfo.cpp.
141 double TransformCov( double c )
142 {
143 double ret ;
144 //return sqrt( c ) - 1 ;
145
146 if ( c <= 2 + 1e-6 )
147 ret = 1e-6 ;
148 else
149 ret = c - 2 ;
150
151 return ret ;
152 }
153
154 double GetUpdateMixtureGammaClassifier( double ratio, double cov, double piRatio, double kRatio[2], double thetaRatio[2],
155 double piCov, double kCov[2], double thetaCov[2], bool conservative )
156 {
157 double p1 = 0, p2 ;
158
159 cov = TransformCov( cov ) ;
160 if ( cov < ( kCov[0] - 1 ) * thetaCov[0] )
161 cov = ( kCov[0] - 1 ) * thetaCov[0] ;
162
163 if ( ratio > 0 )
164 p1 = MixtureGammaAssignment( ratio, piRatio, kRatio, thetaRatio ) ;
165 // Make sure cov > 1?
166 p2 = MixtureGammaAssignment( cov, piCov, kCov, thetaCov ) ;
167 double ret = 0 ;
168
169 if ( conservative )
170 {
171 if ( p1 >= p2 ) // we should use ratio.
172 ret = LogGammaDensity( ratio, kRatio[1], thetaRatio[1] )
173 - LogGammaDensity( ratio, kRatio[0], thetaRatio[0] ) ;
174 else
175 ret = LogGammaDensity( cov, kCov[1], thetaCov[1] )
176 - LogGammaDensity( cov, kCov[0], thetaCov[0] ) ;
177 }
178 else
179 {
180 if ( p1 >= p2 ) // we should use ratio.
181 ret = LogGammaDensity( ratio, kRatio[1], thetaRatio[1] )
182 - LogGammaDensity( ratio, kRatio[0], thetaRatio[0] ) ;
183 else
184 ret = LogGammaDensity( cov, kCov[1], thetaCov[1] )
185 - LogGammaDensity( cov, kCov[0], thetaCov[0] ) ;
186 }
187 return ret ;
188 }
189
190 double GetPValueOfGeneEnd( double cov )
191 {
192 if ( cov <= 2.0 )
193 return 1.0 ;
194 double tmp = 2.0 * ( sqrt( cov ) - log( cov ) ) ;
195 if ( tmp <= 0 )
196 return 1.0 ;
197 return 2.0 * alnorm( tmp, true ) ;
198 }
199
200 char StrandNumToSymbol( int strand )
201 {
202 if ( strand > 0 )
203 return '+' ;
204 else if ( strand < 0 )
205 return '-' ;
206 else
207 return '.' ;
208 }
209
210 int StrandSymbolToNum( char c )
211 {
212 if ( c == '+' )
213 return 1 ;
214 else if ( c == '-' )
215 return -1 ;
216 else
217 return 0 ;
218 }
219
220 int *MergePositions( int *old, int ocnt, int *add, int acnt, int &newCnt ) //, int **support )
221 {
222 int i, j, k ;
223 int *ret ;
224 if ( acnt == 0 )
225 {
226 newCnt = ocnt ;
227 return old ;
228 }
229 if ( ocnt == 0 )
230 {
231 newCnt = acnt ;
232 ret = new int[acnt] ;
233 //*support = new int[acnt] ;
234 for ( i = 0 ; i < acnt ; ++i )
235 {
236 //(*support)[i] = 1 ;
237 ret[i] = add[i] ;
238 }
239 return ret ;
240 }
241 newCnt = 0 ;
242 for ( i = 0, j = 0 ; i < ocnt && j < acnt ; )
243 {
244 if ( old[i] < add[j] )
245 {
246 ++i ;
247 ++newCnt ;
248 }
249 else if ( old[i] == add[j] )
250 {
251 ++i ; ++j ;
252 ++newCnt ;
253 }
254 else
255 {
256 ++j ;
257 ++newCnt ;
258 }
259 }
260 newCnt = newCnt + ( ocnt - i ) + ( acnt - j ) ;
261 // no new elements.
262 if ( newCnt == ocnt )
263 {
264 /*i = 0 ;
265 for ( j = 0 ; j < acnt ; ++j )
266 {
267 for ( ; old[i] < add[j] ; ++i )
268 ;
269 ++(*support)[i] ;
270 }*/
271 return old ;
272 }
273 k = 0 ;
274 //delete []old ;
275 ret = new int[ newCnt ] ;
276 //int *bufferSupport = new int[newCnt] ;
277 for ( i = 0, j = 0 ; i < ocnt && j < acnt ; )
278 {
279 if ( old[i] < add[j] )
280 {
281 ret[k] = old[i] ;
282 //bufferSupport[k] = (*support)[i] ;
283 ++i ;
284 ++k ;
285 }
286 else if ( old[i] == add[j] )
287 {
288 ret[k] = old[i] ;
289 //bufferSupport[k] = (*support)[i] + 1 ;
290 ++i ; ++j ;
291 ++k ;
292 }
293 else
294 {
295 ret[k] = add[j] ;
296 //bufferSupport[k] = 1 ;
297 ++j ;
298 ++k ;
299 }
300 }
301 for ( ; i < ocnt ; ++i, ++k )
302 {
303 ret[k] = old[i] ;
304 //bufferSupport[k] = (*support)[i] ;
305 }
306 for ( ; j < acnt ; ++j, ++k )
307 {
308 ret[k] = add[j] ;
309 //bufferSupport[k] = 1 ;
310 }
311 delete[] old ;
312 //delete[] *support ;
313 //*support = bufferSupport ;
314 return ret ;
315 }
316
317 void CoalesceSubexonSplits( std::vector<struct _subexonSplit> &splits, int mid )
318 {
319 int i, j, k ;
320 int cnt = splits.size() ;
321 //std::sort( splits.begin(), splits.end(), CompSubexonSplit ) ;
322
323 std::vector<struct _subexonSplit> sortedSplits ;
324 sortedSplits.resize( cnt ) ;
325
326 k = 0 ;
327 for ( i = 0, j = mid ; i < mid && j < cnt ; ++k )
328 {
329 if ( CompSubexonSplit( splits[i], splits[j] ) )
330 {
331 sortedSplits[k] = splits[i] ;
332 ++i ;
333 }
334 else
335 {
336 sortedSplits[k] = splits[j] ;
337 ++j ;
338 }
339 }
340 for ( ; i < mid ; ++i, ++k )
341 sortedSplits[k] = splits[i] ;
342 for ( ; j < cnt ; ++j, ++k )
343 sortedSplits[k] = splits[j] ;
344 splits = sortedSplits ;
345
346 k = 0 ;
347 for ( i = 1 ; i < cnt ; ++i )
348 {
349 if ( splits[i].chrId == splits[k].chrId && splits[i].pos == splits[k].pos && splits[i].type == splits[k].type && splits[i].splitType == splits[k].splitType
350 && splits[i].strand == splits[k].strand )
351 {
352 splits[k].weight += splits[i].weight ;
353 }
354 else
355 {
356 ++k ;
357 splits[k] = splits[i] ;
358 }
359 }
360 splits.resize( k + 1 ) ;
361 }
362
363 void CoalesceDifferentStrandSubexonSplits( std::vector<struct _subexonSplit> &splits )
364 {
365 int i, j, k, l ;
366 int cnt = splits.size() ;
367 k = 0 ;
368 for ( i = 0 ; i < cnt ; )
369 {
370 for ( j = i + 1 ; j < cnt ; ++j )
371 {
372 if ( splits[i].chrId == splits[j].chrId && splits[i].pos == splits[j].pos && splits[i].type == splits[j].type && splits[i].splitType == splits[j].splitType )
373 continue ;
374 break ;
375 }
376
377 int maxWeight = -1 ;
378 int weightSum = 0 ;
379 int strand = splits[i].strand ;
380 for ( l = i ; l < j ; ++l )
381 {
382 weightSum += splits[l].weight ;
383 if ( splits[l].strand != 0 && splits[l].weight > maxWeight )
384 {
385 strand = splits[l].strand ;
386 maxWeight = splits[l].weight ;
387 }
388 }
389 //printf( "%d: %d %d %d\n", splits[i].pos, i, j, strand ) ;
390 splits[k] = splits[i] ;
391 splits[k].strand = strand ;
392 splits[k].weight = weightSum ;
393 ++k ;
394
395 i = j ;
396 }
397 splits.resize( k ) ;
398 }
399
400
401 void CoalesceIntervals( std::vector<struct _interval> &intervals )
402 {
403 int i, k ;
404 std::sort( intervals.begin(), intervals.end(), CompInterval ) ;
405 int cnt = intervals.size() ;
406 k = 0 ;
407 for ( i = 1 ; i < cnt ; ++i )
408 {
409 if ( intervals[i].chrId == intervals[k].chrId && intervals[i].start == intervals[k].start && intervals[i].end == intervals[k].end )
410 intervals[k].sampleSupport += intervals[i].sampleSupport ;
411 else
412 {
413 ++k ;
414 intervals[k] = intervals[i] ;
415 }
416 }
417 intervals.resize( k + 1 ) ;
418 }
419
420 void CleanIntervalIrOverhang( std::vector<struct _interval> &irOverhang )
421 {
422 int i, j, k ;
423 std::sort( irOverhang.begin(), irOverhang.end(), CompInterval ) ;
424
425 int cnt = irOverhang.size() ;
426 for ( i = 0 ; i < cnt ; ++i )
427 {
428 if ( irOverhang[i].start == -1 )
429 continue ;
430
431
432 // locate the longest interval start at the same coordinate.
433 int tag = i ;
434
435 for ( j = i + 1 ; j < cnt ; ++j )
436 {
437 if ( irOverhang[j].chrId != irOverhang[i].chrId || irOverhang[j].start != irOverhang[i].start )
438 break ;
439 if ( irOverhang[j].start == -1 )
440 continue ;
441 tag = j ;
442 }
443
444 for ( k = i ; k < tag ; ++k )
445 {
446 irOverhang[k].start = -1 ;
447 }
448
449 for ( k = tag + 1 ; k < cnt ; ++k )
450 {
451 if ( irOverhang[k].chrId != irOverhang[tag].chrId || irOverhang[k].start > irOverhang[tag].end )
452 break ;
453 if ( irOverhang[k].end <= irOverhang[tag].end )
454 {
455 irOverhang[k].start = -1 ;
456 }
457 }
458 }
459
460 k = 0 ;
461 for ( i = 0 ; i < cnt ; ++i )
462 {
463 if ( irOverhang[i].start == -1 )
464 continue ;
465 irOverhang[k] = irOverhang[i] ;
466 ++k ;
467 }
468 irOverhang.resize( k ) ;
469 }
470
471 // Remove the connection that does not match the boundary
472 // of subexons.
473 void CleanUpSubexonConnections( std::vector<struct _subexon> &subexons )
474 {
475 int seCnt = subexons.size() ;
476 int i, j, k, m ;
477 for ( i = 0 ; i < seCnt ; ++i )
478 {
479 if ( subexons[i].prevCnt > 0 )
480 {
481 for ( k = i ; k >= 0 ; --k )
482 if ( subexons[k].chrId != subexons[i].chrId || subexons[k].end <= subexons[i].prev[0] )
483 break ;
484 if ( subexons[k].chrId != subexons[i].chrId )
485 ++k ;
486 m = 0 ;
487 for ( j = 0 ; j < subexons[i].prevCnt ; ++j )
488 {
489 for ( ; k <= i ; ++k )
490 if ( subexons[k].end >= subexons[i].prev[j] )
491 break ;
492 if ( subexons[k].end == subexons[i].prev[j]
493 && ( SubexonGraph::IsSameStrand( subexons[k].rightStrand, subexons[i].leftStrand ) || subexons[k].end + 1 == subexons[i].start ) )
494 {
495 subexons[i].prev[m] = subexons[i].prev[j] ;
496 ++m ;
497 }
498 }
499 subexons[i].prevCnt = m ;
500 }
501
502 m = 0 ;
503 k = i ;
504 for ( j = 0 ; j < subexons[i].nextCnt ; ++j )
505 {
506 for ( ; k < seCnt ; ++k )
507 if ( subexons[k].chrId != subexons[i].chrId || subexons[k].start >= subexons[i].next[j] )
508 break ;
509 if ( subexons[k].start == subexons[i].next[j]
510 && ( SubexonGraph::IsSameStrand( subexons[i].rightStrand, subexons[k].leftStrand ) || subexons[i].end + 1 == subexons[k].start ) )
511 {
512 subexons[i].next[m] = subexons[i].next[j] ;
513 ++m ;
514 }
515 }
516 subexons[i].nextCnt = m ;
517 }
518 }
519
520 int main( int argc, char *argv[] )
521 {
522 int i, j, k ;
523 FILE *fp ;
524 std::vector<char *> files ;
525
526 Blocks regions ;
527 Alignments alignments ;
528
529 if ( argc == 1 )
530 {
531 printf( "%s", usage ) ;
532 return 0 ;
533 }
534
535 for ( i = 1 ; i < argc ; ++i )
536 {
537 if ( !strcmp( argv[i], "-s" ) )
538 {
539 files.push_back( argv[i + 1] ) ;
540 ++i ;
541 continue ;
542 }
543 else if ( !strcmp( argv[i], "--ls" ) )
544 {
545 FILE *fpLs = fopen( argv[i + 1], "r" ) ;
546 char buffer[1024] ;
547 while ( fgets( buffer, sizeof( buffer ), fpLs ) != NULL )
548 {
549 int len = strlen( buffer ) ;
550 if ( buffer[len - 1] == '\n' )
551 {
552 buffer[len - 1] = '\0' ;
553 --len ;
554
555 }
556 char *fileName = strdup( buffer ) ;
557 files.push_back( fileName ) ;
558 }
559 }
560 }
561 int fileCnt = files.size() ;
562 // Obtain the chromosome ids through bam file.
563 fp = fopen( files[0], "r" ) ;
564 if ( fgets( buffer, sizeof( buffer ), fp ) != NULL )
565 {
566 int len = strlen( buffer ) ;
567 buffer[len - 1] = '\0' ;
568 alignments.Open( buffer + 1 ) ;
569 }
570 fclose( fp ) ;
571
572 // Collect the split sites of subexons.
573 std::vector<struct _subexonSplit> subexonSplits ;
574 std::vector<struct _interval> intervalIrOverhang ; // intervals contains ir and overhang.
575 std::vector<struct _interval> introns ;
576 std::vector<struct _interval> exons ;
577
578
579 for ( k = 0 ; k < fileCnt ; ++k )
580 {
581 fp = fopen( files[k], "r" ) ;
582 struct _subexon se ;
583 struct _subexonSplit sp ;
584 char chrName[50] ;
585 int origSize = subexonSplits.size() ;
586 while ( fgets( buffer, sizeof( buffer), fp ) != NULL )
587 {
588 if ( buffer[0] == '#' )
589 continue ;
590
591 SubexonGraph::InputSubexon( buffer, alignments, se, false) ;
592 // Record all the intron rentention, overhang from the samples
593 if ( ( se.leftType == 2 && se.rightType == 1 )
594 || ( se.leftType == 2 && se.rightType == 0 )
595 || ( se.leftType == 0 && se.rightType == 1 ) )
596 {
597 struct _interval si ;
598 si.chrId = se.chrId ;
599 si.start = se.start ;
600 si.end = se.end ;
601
602 intervalIrOverhang.push_back( si ) ;
603 }
604
605 // Ignore overhang subexons and ir subexons for now.
606 if ( ( se.leftType == 0 && se.rightType == 1 )
607 || ( se.leftType == 2 && se.rightType == 0 )
608 || ( se.leftType == 2 && se.rightType == 1 ) )
609 continue ;
610
611 if ( se.leftType == 0 && se.rightType == 0 && ( se.leftClassifier == -1 || se.leftClassifier == 1 ) ) // ignore noisy single-exon island
612 continue ;
613 if ( se.leftType == 0 && se.rightType == 0 && ( fileCnt >= 10 && se.leftClassifier > 0.99 ) )
614 continue ;
615
616 if ( se.leftType == 1 && se.rightType == 2 ) // a full exon, we allow mixtured strand here.
617 {
618 struct _interval ni ;
619 ni.chrId = se.chrId ;
620 ni.start = se.start ;
621 ni.end = se.end ;
622 ni.strand = se.rightStrand ;
623 ni.sampleSupport = 1 ;
624 exons.push_back( ni ) ;
625 }
626
627
628 /*for ( i = 0 ; i < se.nextCnt ; ++i )
629 {
630 struct _interval ni ;
631 ni.chrId = se.chrId ;
632 ni.start = se.end ;
633 ni.end = se.next[i] ;
634 ni.strand = se.rightStrand ;
635 ni.sampleSupport = 1 ;
636 if ( ni.start + 1 < ni.end )
637 introns.push_back( ni ) ;
638 }*/
639
640 sp.chrId = se.chrId ;
641 sp.pos = se.start ;
642 sp.type = 1 ;
643 sp.splitType = se.leftType ;
644 sp.strand = se.leftStrand ;
645 sp.weight = 1 ;
646 subexonSplits.push_back( sp ) ;
647
648 sp.chrId = se.chrId ;
649 sp.pos = se.end ;
650 sp.type = 2 ;
651 sp.splitType = se.rightType ;
652 sp.strand = se.rightStrand ;
653 sp.weight = 1 ;
654 subexonSplits.push_back( sp ) ;
655
656 /*if ( se.prevCnt > 0 )
657 delete[] se.prev ;
658 if ( se.nextCnt > 0 )
659 delete[] se.next ;*/
660 }
661 CoalesceIntervals( exons ) ;
662 CoalesceIntervals( introns ) ;
663 CoalesceSubexonSplits( subexonSplits, origSize ) ;
664 CleanIntervalIrOverhang( intervalIrOverhang ) ;
665
666 fclose( fp ) ;
667 }
668
669 CoalesceDifferentStrandSubexonSplits( subexonSplits ) ;
670
671 // Obtain the split sites from the introns.
672 int intronCnt = introns.size() ;
673 std::vector<struct _subexonSplit> intronSplits ;
674 for ( i = 0 ; i < intronCnt ; ++i )
675 {
676 /*if ( introns[i].sampleSupport < 0.05 * fileCnt )
677 {
678 continue ;
679 }*/
680 struct _interval &it = introns[i] ;
681 struct _subexonSplit sp ;
682 sp.chrId = it.chrId ;
683 sp.pos = it.start ;
684 sp.type = 2 ;
685 sp.splitType = 2 ;
686 sp.strand = it.strand ;
687 intronSplits.push_back( sp ) ;
688
689 sp.chrId = it.chrId ;
690 sp.pos = it.end ;
691 sp.type = 1 ;
692 sp.splitType = 1 ;
693 sp.strand = it.strand ;
694 intronSplits.push_back( sp ) ;
695 }
696
697 // Pair up the split sites to get subexons
698 std::sort( intronSplits.begin(), intronSplits.end(), CompSubexonSplit ) ;
699 //std::sort( subexonSplits.begin(), subexonSplits.end(), CompSubexonSplit ) ;
700
701 // Convert the hard boundary to soft boundary if the split sites is filtered from the introns
702 // Seems NO need to do this now.
703 int splitCnt = subexonSplits.size() ;
704 int intronSplitCnt = intronSplits.size() ;
705 k = 0 ;
706 //for ( i = 0 ; i < splitCnt ; ++i )
707 while ( 0 )
708 {
709 if ( subexonSplits[i].type != subexonSplits[i].splitType )
710 continue ;
711
712 while ( k < intronSplitCnt && ( intronSplits[k].chrId < subexonSplits[i].chrId
713 || ( intronSplits[k].chrId == subexonSplits[i].chrId && intronSplits[k].pos < subexonSplits[i].pos ) ) )
714 ++k ;
715 j = k ;
716 while ( j < intronSplitCnt && intronSplits[j].chrId == subexonSplits[i].chrId
717 && intronSplits[j].pos == subexonSplits[i].pos && intronSplits[j].splitType != subexonSplits[i].splitType )
718 ++j ;
719
720 // the split site is filtered.
721 if ( j >= intronSplitCnt || intronSplits[j].chrId != subexonSplits[i].chrId ||
722 intronSplits[j].pos > subexonSplits[i].pos )
723 {
724 //printf( "%d %d. %d %d\n", subexonSplits[i].pos, intronSplits[j].pos, intronSplits[j].chrId , subexonSplits[i].chrId ) ;
725 subexonSplits[i].splitType = 0 ;
726
727 // Convert the adjacent subexon split.
728 for ( int l = i + 1 ; i < splitCnt && subexonSplits[l].chrId == subexonSplits[i].chrId
729 && subexonSplits[l].pos == subexonSplits[i].pos + 1 ; ++l )
730 {
731 if ( subexonSplits[l].type != subexonSplits[i].type
732 && subexonSplits[l].splitType == subexonSplits[i].splitType )
733 {
734 subexonSplits[l].splitType = 0 ;
735 }
736 }
737
738 // And the other direction
739 for ( int l = i - 1 ; l >= 0 && subexonSplits[l].chrId == subexonSplits[i].chrId
740 && subexonSplits[l].pos == subexonSplits[i].pos - 1 ; --l )
741 {
742 if ( subexonSplits[l].type != subexonSplits[i].type
743 && subexonSplits[l].splitType == subexonSplits[i].splitType )
744 {
745 subexonSplits[l].splitType = 0 ;
746 }
747 }
748 }
749 }
750 intronSplits.clear() ;
751 std::vector<struct _subexonSplit>().swap( intronSplits ) ;
752
753 // Force the soft boundary that collides with hard boundaries to be hard boundary.
754 for ( i = 0 ; i < splitCnt ; ++i )
755 {
756 if ( subexonSplits[i].splitType != 0 )
757 continue ;
758 int newSplitType = 0 ;
759 int newStrand = subexonSplits[i].strand ;
760 for ( j = i + 1 ; j < splitCnt ; ++j )
761 {
762 if ( subexonSplits[i].type != subexonSplits[j].type || subexonSplits[i].pos != subexonSplits[j].pos ||
763 subexonSplits[i].chrId != subexonSplits[j].chrId )
764 break ;
765 if ( subexonSplits[j].splitType != 0 )
766 {
767 newSplitType = subexonSplits[j].splitType ;
768 newStrand = subexonSplits[j].strand ;
769 break ;
770 }
771 }
772
773 if ( newSplitType == 0 )
774 {
775 for ( j = i - 1 ; j >= 0 ; --j )
776 {
777 if ( subexonSplits[i].type != subexonSplits[j].type || subexonSplits[i].pos != subexonSplits[j].pos ||
778 subexonSplits[i].chrId != subexonSplits[j].chrId )
779 break ;
780 if ( subexonSplits[j].splitType != 0 )
781 {
782 newSplitType = subexonSplits[j].splitType ;
783 newStrand = subexonSplits[j].strand ;
784 break ;
785 }
786 }
787
788 }
789 /*if ( subexonSplits[i].pos == 154464157 )
790 {
791 printf( "force conversion: %d %d %d. %d %d\n", subexonSplits[i].pos, subexonSplits[i].splitType, subexonSplits[i].weight, subexonSplits[i + 1].pos, subexonSplits[i + 1].splitType ) ;
792 }*/
793 subexonSplits[i].splitType = newSplitType ;
794 subexonSplits[i].strand = newStrand ;
795 }
796
797 /*for ( i = 0 ; i < splitCnt ; ++i )
798 {
799 printf( "%d: type=%d splitType=%d weight=%d\n", subexonSplits[i].pos, subexonSplits[i].type, subexonSplits[i].splitType, subexonSplits[i].weight ) ;
800 }*/
801
802 // Build subexons from the collected split sites.
803
804 std::vector<struct _subexon> subexons ;
805 int diffCnt = 0 ; // |start of subexon split| - |end of subexon split|
806 int seCnt = 0 ;
807 for ( i = 0 ; i < splitCnt - 1 ; ++i )
808 {
809 struct _subexon se ;
810 /*if ( subexonSplits[i + 1].pos == 144177260 )
811 {
812 printf( "%d %d %d: %d %d %d. %d\n", subexonSplits[i].pos, subexonSplits[i].type, subexonSplits[i].splitType,
813 subexonSplits[i + 1].pos, subexonSplits[i + 1].type, subexonSplits[i + 1].splitType, diffCnt ) ;
814 }*/
815
816 if ( subexonSplits[i].type == 1 )
817 diffCnt += subexonSplits[i].weight ;
818 else
819 diffCnt -= subexonSplits[i].weight ;
820
821 if ( subexonSplits[i + 1].chrId != subexonSplits[i].chrId )
822 {
823 diffCnt = 0 ;
824 continue ;
825 }
826
827 if ( diffCnt == 0 ) // the interval between subexon
828 continue ;
829
830 se.chrId = subexonSplits[i].chrId ;
831 se.start = subexonSplits[i].pos ;
832 se.leftType = subexonSplits[i].splitType ;
833 se.leftStrand = subexonSplits[i].strand ;
834 if ( subexonSplits[i].type == 2 )
835 {
836 se.leftStrand = 0 ;
837 ++se.start ;
838 }
839
840 se.end = subexonSplits[i + 1].pos ;
841 se.rightType = subexonSplits[i + 1].splitType ;
842 se.rightStrand = subexonSplits[i + 1].strand ;
843 if ( subexonSplits[i + 1].type == 1 )
844 {
845 se.rightStrand = 0 ;
846 --se.end ;
847 }
848
849 /*if ( se.end == 24613649 )
850 {
851 //printf( "%d %d %d: %d %d %d. %d\n", subexonSplits[i].pos, subexonSplits[i].type, subexonSplits[i].splitType,
852 // subexonSplits[i + 1].pos, subexonSplits[i + 1].type, subexonSplits[i + 1].splitType, diffCnt ) ;
853 printf( "%d %d %d. %d %d %d\n", se.start, se.leftType, se.leftStrand, se.end, se.rightType, se.rightStrand ) ;
854 }*/
855
856 if ( se.start > se.end ) //Note: this handles the case of repeated subexon split.
857 {
858 // handle the case in sample 0: [...[..]
859 // in sample 1: [..]...]
860 if ( seCnt > 0 && se.end == subexons[seCnt - 1].end && subexons[seCnt - 1].rightType < se.rightType )
861 {
862 subexons[seCnt - 1].rightType = se.rightType ;
863 subexons[seCnt - 1].rightStrand = se.rightStrand ;
864 }
865 continue ;
866 }
867 se.leftClassifier = se.rightClassifier = 0 ;
868 se.lcCnt = se.rcCnt = 0 ;
869
870 /*if ( 1 ) //se.chrId == 25 )
871 {
872 printf( "%d: %d-%d: %d %d\n", se.chrId, se.start, se.end, se.leftType, se.rightType ) ;
873 }*/
874
875
876 se.next = se.prev = NULL ;
877 se.nextCnt = se.prevCnt = 0 ;
878 subexons.push_back( se ) ;
879 ++seCnt ;
880 }
881 subexonSplits.clear() ;
882 std::vector<struct _subexonSplit>().swap( subexonSplits ) ;
883
884 // Adjust the split type.
885 seCnt = subexons.size() ;
886 for ( i = 1 ; i < seCnt ; ++i )
887 {
888 if ( subexons[i - 1].end + 1 == subexons[i].start )
889 {
890 if ( subexons[i - 1].rightType == 0 )
891 subexons[i - 1].rightType = subexons[i].leftType ;
892 if ( subexons[i].leftType == 0 )
893 subexons[i].leftType = subexons[i - 1].rightType ;
894 }
895 }
896
897 // Merge the adjacent soft boundaries
898 std::vector<struct _subexon> rawSubexons = subexons ;
899 int exonCnt = exons.size() ;
900 subexons.clear() ;
901
902 k = 0 ; // hold index for exon.
903 for ( i = 0 ; i < seCnt ; )
904 {
905 /*if ( rawSubexons[k].rightType == 0 && rawSubexons[i].leftType == 0
906 && rawSubexons[k].end + 1 == rawSubexons[i].start )
907 {
908 rawSubexons[k].end = rawSubexons[i].end ;
909 rawSubexons[k].rightType = rawSubexons[i].rightType ;
910 rawSubexons[k].rightStrand = rawSubexons[i].rightStrand ;
911 }
912 else
913 {
914 subexons.push_back( rawSubexons[k] ) ;
915 k = i ;
916 }*/
917
918 while ( k < exonCnt && ( exons[k].chrId < rawSubexons[i].chrId
919 || ( exons[k].chrId == rawSubexons[i].chrId && exons[k].start < rawSubexons[i].start ) ) )
920 ++k ;
921
922 for ( j = i + 1 ; j < seCnt ; ++j )
923 {
924 if ( rawSubexons[j - 1].chrId != rawSubexons[j].chrId || rawSubexons[j - 1].rightType != 0 || rawSubexons[j].leftType != 0
925 || ( fileCnt > 1 && rawSubexons[j - 1].end + 1 != rawSubexons[j].start )
926 || ( fileCnt == 1 && rawSubexons[j - 1].end + 50 < rawSubexons[j].start ) )
927 break ;
928 }
929 // rawsubexons[i...j-1] will be merged.
930
931 /*if ( rawSubexons[i].start == 119625875 )
932 {
933 printf( "merge j-1: %d %d %d %d\n", rawSubexons[j - 1].end, rawSubexons[j - 1].rightType,
934 rawSubexons[j].start, rawSubexons[j].leftType ) ;
935 }*/
936 bool merge = true ;
937 if ( rawSubexons[i].leftType == 1 && rawSubexons[j - 1].rightType == 2 && j - i > 1
938 && rawSubexons[j - 1].end - rawSubexons[i].start >= 1000 )
939 {
940 merge = false ;
941 int sampleSupport = 0 ;
942 for ( int l = k ; l < exonCnt ; ++l )
943 {
944 if ( exons[l].chrId != rawSubexons[i].chrId || exons[l].start > rawSubexons[i].start )
945 break ;
946 if ( exons[l].end == rawSubexons[j - 1].end )
947 {
948 merge = true ;
949 sampleSupport = exons[l].sampleSupport ;
950 break ;
951 }
952 }
953
954 if ( merge == true && rawSubexons[j - 1].end - rawSubexons[i].start >= 1000 )
955 {
956 if ( sampleSupport <= 0.2 * fileCnt )
957 {
958 merge = false ;
959 }
960 }
961
962 if ( merge == false )
963 {
964 if ( j - i >= 3 )
965 {
966 rawSubexons[i].end = rawSubexons[ (i + j - 1) / 2 ].start ;
967 rawSubexons[j - 1].start = rawSubexons[ (i + j - 1) / 2].end ;
968 }
969
970 if ( rawSubexons[i].end + 1 == rawSubexons[j - 1].start )
971 {
972 --rawSubexons[i].end ;
973 ++rawSubexons[j - 1].start ;
974 }
975 subexons.push_back( rawSubexons[i] ) ;
976 subexons.push_back( rawSubexons[j - 1] ) ;
977 }
978 }
979
980 if ( merge )
981 {
982 rawSubexons[i].end = rawSubexons[j - 1].end ;
983 rawSubexons[i].rightType = rawSubexons[j - 1].rightType ;
984 rawSubexons[i].rightStrand = rawSubexons[j - 1].rightStrand ;
985
986 if ( rawSubexons[i].leftType == 0 && rawSubexons[i].rightType != 0 )
987 {
988 rawSubexons[i].start = rawSubexons[ ( i + j - 1 ) / 2 ].start ;
989 }
990 else if ( rawSubexons[i].rightType == 0 && rawSubexons[i].leftType != 0 )
991 {
992 rawSubexons[i].end = rawSubexons[ ( i + j - 1 ) / 2 ].end ;
993 }
994
995 subexons.push_back( rawSubexons[i] ) ;
996 }
997
998 i = j ;
999 }
1000 exons.clear() ;
1001 std::vector<struct _interval>().swap( exons ) ;
1002
1003 // Remove overhang, ir subexons intron created after putting multiple sample to gether.
1004 // eg: s0: [......)
1005 // s1: [...]--------[....]
1006 // s2: [...]..)-----[....]
1007 // Though the overhang from s2 is filtered in readin, there will a new overhang created combining s0,s1.
1008 // But be careful about how to compute the classifier for the overhang part contributed from s0.
1009 // Furthermore, note that the case of single-exon island showed up in intron retention region after combining is not possible when get here.
1010 // eg: s0:[...]-----[...]
1011 // s1: (.)
1012 // s2:[.............]
1013 // After merge adjacent soft boundaries, the single-exon island will disappear.
1014 rawSubexons = subexons ;
1015 seCnt = subexons.size() ;
1016 subexons.clear() ;
1017 for ( i = 0 ; i < seCnt ; ++i )
1018 {
1019 if ( ( rawSubexons[i].leftType == 2 && rawSubexons[i].rightType == 1 ) // ir
1020 || ( rawSubexons[i].leftType == 2 && rawSubexons[i].rightType == 0 ) // overhang
1021 || ( rawSubexons[i].leftType == 0 && rawSubexons[i].rightType == 1 ) )
1022 continue ;
1023 subexons.push_back( rawSubexons[i] ) ;
1024 }
1025
1026 // Remove the single-exon island if it overlaps with intron retentioned or overhang.
1027 rawSubexons = subexons ;
1028 seCnt = subexons.size() ;
1029 subexons.clear() ;
1030 k = 0 ;
1031 std::sort( intervalIrOverhang.begin(), intervalIrOverhang.end(), CompInterval ) ;
1032 int irOverhangCnt = intervalIrOverhang.size() ;
1033
1034 for ( i = 0 ; i < seCnt ; ++i )
1035 {
1036 if ( rawSubexons[i].leftType != 0 || rawSubexons[i].rightType != 0 )
1037 {
1038 subexons.push_back( rawSubexons[i] ) ;
1039 continue ;
1040 }
1041
1042 while ( k < irOverhangCnt )
1043 {
1044 // Locate the interval that before the island
1045 if ( intervalIrOverhang[k].chrId < rawSubexons[i].chrId
1046 || ( intervalIrOverhang[k].chrId == rawSubexons[i].chrId && intervalIrOverhang[k].end < rawSubexons[i].start ) )
1047 {
1048 ++k ;
1049 continue ;
1050 }
1051 break ;
1052 }
1053 bool overlap = false ;
1054 for ( j = k ; j < irOverhangCnt ; ++j )
1055 {
1056 if ( intervalIrOverhang[j].chrId > rawSubexons[i].chrId || intervalIrOverhang[j].start > rawSubexons[i].end )
1057 break ;
1058 if ( ( intervalIrOverhang[j].start <= rawSubexons[i].start && intervalIrOverhang[j].end >= rawSubexons[i].start )
1059 || ( intervalIrOverhang[j].start <= rawSubexons[i].end && intervalIrOverhang[j].end >= rawSubexons[i].end ) )
1060 {
1061 overlap = true ;
1062 break ;
1063 }
1064 }
1065
1066 if ( !overlap )
1067 subexons.push_back( rawSubexons[i] ) ;
1068 }
1069 rawSubexons.clear() ;
1070 std::vector<struct _subexon>().swap( rawSubexons ) ;
1071
1072 intervalIrOverhang.clear() ;
1073 std::vector<struct _interval>().swap( intervalIrOverhang ) ;
1074
1075 // Create the dummy intervals.
1076 seCnt = subexons.size() ;
1077 std::vector<struct _intronicInfo> intronicInfos ;
1078 std::vector<struct _seInterval> seIntervals ;
1079 std::vector<struct _subexonSupplement> subexonInfo ;
1080
1081 //subexonInfo.resize( seCnt ) ;
1082 for ( i = 0 ; i < seCnt ; ++i )
1083 {
1084 struct _seInterval ni ; // new interval
1085 ni.start = subexons[i].start ;
1086 ni.end = subexons[i].end ;
1087 ni.type = 0 ;
1088 ni.idx = i ;
1089 ni.chrId = subexons[i].chrId ;
1090 seIntervals.push_back( ni ) ;
1091
1092 /*if ( subexons[i].end == 42671717 )
1093 {
1094 printf( "%d: %d-%d: %d\n", subexons[i].chrId, subexons[i].start, subexons[i].end, subexons[i].rightType ) ;
1095 }*/
1096 //subexonInfo[i].prevSupport = subexonInfo[i].nextSupport = NULL ;
1097
1098 /*int nexti ;
1099 for ( nexti = i + 1 ; nexti < seCnt ; ++nexti )
1100 if ( subexons[ nexti ].leftType == 0 && subexons[nexti].rightType == 0 )*/
1101
1102 if ( i < seCnt - 1 && subexons[i].chrId == subexons[i + 1].chrId &&
1103 subexons[i].end + 1 < subexons[i + 1].start &&
1104 subexons[i].rightType + subexons[i + 1].leftType != 0 )
1105 {
1106 // Only consider the intervals like ]..[,]...(, )...[
1107 // The case like ]...] is actaully things like ][...] in subexon perspective,
1108 // so they won't pass the if-statement
1109 struct _intronicInfo nii ; // new intronic info
1110 ni.start = subexons[i].end + 1 ;
1111 ni.end = subexons[i + 1].start - 1 ;
1112 ni.type = 1 ;
1113 ni.idx = intronicInfos.size() ;
1114 seIntervals.push_back( ni ) ;
1115
1116 nii.chrId = subexons[i].chrId ;
1117 nii.start = ni.start ;
1118 nii.end = ni.end ;
1119 nii.leftSubexonIdx = i ;
1120 nii.rightSubexonIdx = i + 1 ;
1121 nii.irClassifier = 0 ;
1122 nii.irCnt = 0 ;
1123 nii.validIrCnt = 0 ;
1124 nii.leftOverhang.cnt = 0 ;
1125 nii.leftOverhang.validCnt = 0 ;
1126 nii.leftOverhang.length = 0 ;
1127 nii.leftOverhang.classifier = 0 ;
1128 nii.rightOverhang.cnt = 0 ;
1129 nii.rightOverhang.validCnt = 0 ;
1130 nii.rightOverhang.length = 0 ;
1131 nii.rightOverhang.classifier = 0 ;
1132 intronicInfos.push_back( nii ) ;
1133 /*if ( nii.end == 23667 )
1134 {
1135 printf( "%d %d. %d (%d %d %d)\n", nii.start, nii.end, subexons[i].rightType, subexons[i+1].start, subexons[i + 1].end, subexons[i + 1].leftType ) ;
1136 }*/
1137 }
1138 }
1139
1140 // Go through all the files to get some statistics number
1141 double avgIrPiRatio = 0 ;
1142 double avgIrPiCov = 0 ;
1143 double irPiRatio, irKRatio[2], irThetaRatio[2] ; // Some statistical results
1144 double irPiCov, irKCov[2], irThetaCov[2] ;
1145
1146 double avgOverhangPiRatio = 0 ;
1147 double avgOverhangPiCov = 0 ;
1148 double overhangPiRatio, overhangKRatio[2], overhangThetaRatio[2] ; // Some statistical results
1149 double overhangPiCov, overhangKCov[2], overhangThetaCov[2] ;
1150
1151 for ( k = 0 ; k < fileCnt ; ++k )
1152 {
1153 fp = fopen( files[k], "r" ) ;
1154
1155 while ( fgets( buffer, sizeof( buffer), fp ) != NULL )
1156 {
1157 if ( buffer[0] == '#' )
1158 {
1159 char buffer2[100] ;
1160 sscanf( buffer, "%s", buffer2 ) ;
1161 if ( !strcmp( buffer2, "#fitted_ir_parameter_ratio:" ) )
1162 {
1163 // TODO: ignore certain samples if the coverage seems wrong.
1164 sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf",
1165 buffer2, buffer2, &irPiRatio, buffer2, &irKRatio[0], buffer2, &irThetaRatio[0],
1166 buffer2, &irKRatio[1], buffer2, &irThetaRatio[1] ) ;
1167 avgIrPiRatio += irPiRatio ;
1168 }
1169 else if ( !strcmp( buffer2, "#fitted_ir_parameter_cov:" ) )
1170 {
1171 }
1172 else if ( !strcmp( buffer2, "#fitted_overhang_parameter_ratio:" ) )
1173 {
1174 sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf",
1175 buffer2, buffer2, &overhangPiRatio, buffer2, &overhangKRatio[0], buffer2, &overhangThetaRatio[0],
1176 buffer2, &overhangKRatio[1], buffer2, &overhangThetaRatio[1] ) ;
1177 avgOverhangPiRatio += overhangPiRatio ;
1178 }
1179 }
1180 else
1181 break ;
1182 }
1183 fclose( fp ) ;
1184 }
1185 avgIrPiRatio /= fileCnt ;
1186 avgOverhangPiRatio /= fileCnt ;
1187
1188 // Go through all the files to put statistical results into each subexon.
1189 std::vector< struct _subexon > sampleSubexons ;
1190 int subexonCnt = subexons.size() ;
1191 for ( k = 0 ; k < fileCnt ; ++k )
1192 {
1193 //if ( k == 220 )
1194 // exit( 1 ) ;
1195 fp = fopen( files[k], "r" ) ;
1196 struct _subexon se ;
1197 struct _subexonSplit sp ;
1198 char chrName[50] ;
1199
1200 sampleSubexons.clear() ;
1201
1202 int tag = 0 ;
1203 while ( fgets( buffer, sizeof( buffer), fp ) != NULL )
1204 {
1205 if ( buffer[0] == '#' )
1206 {
1207 char buffer2[200] ;
1208 sscanf( buffer, "%s", buffer2 ) ;
1209 if ( !strcmp( buffer2, "#fitted_ir_parameter_ratio:" ) )
1210 {
1211 sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf",
1212 buffer2, buffer2, &irPiRatio, buffer2, &irKRatio[0], buffer2, &irThetaRatio[0],
1213 buffer2, &irKRatio[1], buffer2, &irThetaRatio[1] ) ;
1214 }
1215 else if ( !strcmp( buffer2, "#fitted_ir_parameter_cov:" ) )
1216 {
1217 sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf",
1218 buffer2, buffer2, &irPiCov, buffer2, &irKCov[0], buffer2, &irThetaCov[0],
1219 buffer2, &irKCov[1], buffer2, &irThetaCov[1] ) ;
1220 }
1221 else if ( !strcmp( buffer2, "#fitted_overhang_parameter_ratio:" ) )
1222 {
1223 sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf",
1224 buffer2, buffer2, &overhangPiRatio, buffer2, &overhangKRatio[0], buffer2, &overhangThetaRatio[0],
1225 buffer2, &overhangKRatio[1], buffer2, &overhangThetaRatio[1] ) ;
1226 }
1227 else if ( !strcmp( buffer2, "#fitted_overhang_parameter_cov:" ) )
1228 {
1229 sscanf( buffer, "%s %s %lf %s %lf %s %lf %s %lf %s %lf",
1230 buffer2, buffer2, &overhangPiCov, buffer2, &overhangKCov[0], buffer2, &overhangThetaCov[0],
1231 buffer2, &overhangKCov[1], buffer2, &overhangThetaCov[1] ) ;
1232 }
1233 continue ;
1234 }
1235 else
1236 break ;
1237
1238 //SubexonGraph::InputSubexon( buffer, alignments, se, true ) ;
1239 //sampleSubexons.push_back( se ) ;
1240 }
1241
1242 //int sampleSubexonCnt = sampleSubexons.size() ;
1243 int intervalCnt = seIntervals.size() ;
1244 //for ( i = 0 ; i < sampleSubexonCnt ; ++i )
1245 int iterCnt = 0 ;
1246 while ( 1 )
1247 {
1248 if ( iterCnt > 0 && fgets( buffer, sizeof( buffer), fp ) == NULL)
1249 break ;
1250 ++iterCnt ;
1251
1252 struct _subexon se ;
1253 SubexonGraph::InputSubexon( buffer, alignments, se, true ) ;
1254
1255 while ( tag < intervalCnt )
1256 {
1257 if ( seIntervals[tag].chrId < se.chrId ||
1258 ( seIntervals[tag].chrId == se.chrId && seIntervals[tag].end < se.start ) )
1259 {
1260 ++tag ;
1261 continue ;
1262 }
1263 else
1264 break ;
1265 }
1266
1267 for ( j = tag ; j < intervalCnt ; ++j )
1268 {
1269 if ( seIntervals[j].start > se.end || seIntervals[j].chrId > se.chrId ) // terminate if no overlap.
1270 break ;
1271 int idx ;
1272
1273 if ( seIntervals[j].type == 0 )
1274 {
1275 idx = seIntervals[j].idx ;
1276 if ( subexons[idx].leftType == 1 && se.leftType == 1 && subexons[idx].start == se.start )
1277 {
1278 double tmp = se.leftClassifier ;
1279 if ( se.leftClassifier == 0 )
1280 tmp = 1e-7 ;
1281 subexons[idx].leftClassifier -= 2.0 * log( tmp ) ;
1282 ++subexons[idx].lcCnt ;
1283 subexons[idx].prev = MergePositions( subexons[idx].prev, subexons[idx].prevCnt,
1284 se.prev, se.prevCnt, subexons[idx].prevCnt ) ;
1285
1286 if ( se.rightType == 0 ) // a gene end here
1287 {
1288 for ( int l = idx ; l < subexonCnt ; ++l )
1289 {
1290 if ( l > idx && ( subexons[l].end > subexons[l - 1].start + 1
1291 || subexons[l].chrId != subexons[l - 1].chrId ) )
1292 break ;
1293 if ( subexons[l].rightType == 2 )
1294 {
1295 double adjustAvgDepth = se.avgDepth ;
1296 if ( se.end - se.start + 1 >= 100 )
1297 adjustAvgDepth += se.avgDepth * 100.0 / ( se.end - se.start + 1 ) ;
1298 else
1299 adjustAvgDepth *= 2 ;
1300 double p = GetPValueOfGeneEnd( adjustAvgDepth ) ;
1301 //if ( se.end - se.start + 1 >= 500 && p > 0.001 )
1302 // p = 0.001 ;
1303
1304 subexons[l].rightClassifier -= 2.0 * log( p ) ;
1305 ++subexons[l].rcCnt ;
1306 break ;
1307 }
1308 }
1309 }
1310 }
1311 //if ( se.chrId == 25 )
1312 // printf( "(%d %d %d. %d) (%d %d %d. %d)\n", se.chrId, se.start, se.end, se.rightType, subexons[idx].chrId, subexons[idx].start, subexons[idx].end, subexons[idx].rightType ) ;
1313 if ( subexons[idx].rightType == 2 && se.rightType == 2 && subexons[idx].end == se.end )
1314 {
1315 double tmp = se.rightClassifier ;
1316 if ( se.rightClassifier == 0 )
1317 tmp = 1e-7 ;
1318 subexons[idx].rightClassifier -= 2.0 * log( tmp ) ;
1319 ++subexons[idx].rcCnt ;
1320
1321 subexons[idx].next = MergePositions( subexons[idx].next, subexons[idx].nextCnt,
1322 se.next, se.nextCnt, subexons[idx].nextCnt ) ;
1323
1324 if ( se.leftType == 0 )
1325 {
1326 for ( int l = idx ; l >= 0 ; --l )
1327 {
1328 if ( l < idx && ( subexons[l].end < subexons[l + 1].start - 1
1329 || subexons[l].chrId != subexons[l + 1].chrId ) )
1330 break ;
1331 if ( subexons[l].leftType == 1 )
1332 {
1333 double adjustAvgDepth = se.avgDepth ;
1334 if ( se.end - se.start + 1 >= 100 )
1335 adjustAvgDepth += se.avgDepth * 100.0 / ( se.end - se.start + 1 ) ;
1336 else
1337 adjustAvgDepth *= 2 ;
1338 double p = GetPValueOfGeneEnd( adjustAvgDepth ) ;
1339 //if ( se.end - se.start + 1 >= 500 && p >= 0.001 )
1340 // p = 0.001 ;
1341 subexons[l].leftClassifier -= 2.0 * log( p ) ;
1342 ++subexons[l].lcCnt ;
1343 break ;
1344 }
1345 }
1346 }
1347 }
1348
1349 if ( subexons[idx].leftType == 0 && subexons[idx].rightType == 0
1350 && se.leftType == 0 && se.rightType == 0 ) // the single-exon island.
1351 {
1352 double tmp = se.leftClassifier ;
1353 if ( se.leftClassifier == 0 )
1354 tmp = 1e-7 ;
1355 subexons[idx].leftClassifier -= 2.0 * log( tmp ) ;
1356 subexons[idx].rightClassifier = subexons[idx].leftClassifier ;
1357 ++subexons[idx].lcCnt ;
1358 ++subexons[idx].rcCnt ;
1359 }
1360 }
1361 else if ( seIntervals[j].type == 1 )
1362 {
1363 idx = seIntervals[j].idx ;
1364 // Overlap on the left part of intron
1365 if ( se.start <= intronicInfos[idx].start && se.end < intronicInfos[idx].end
1366 && subexons[ intronicInfos[idx].leftSubexonIdx ].rightType != 0 )
1367 {
1368 int len = se.end - intronicInfos[idx].start + 1 ;
1369 intronicInfos[idx].leftOverhang.length += len ;
1370 ++intronicInfos[idx].leftOverhang.cnt ;
1371
1372 // Note that the sample subexon must have a soft boundary at right hand side,
1373 // otherwise, this part is not an intron and won't show up in intronic Info.
1374 if ( se.leftType == 2 )
1375 {
1376 if ( se.leftRatio > 0 && se.avgDepth > 1 )
1377 {
1378 ++intronicInfos[idx].leftOverhang.validCnt ;
1379
1380 double update = GetUpdateMixtureGammaClassifier( se.leftRatio, se.avgDepth,
1381 overhangPiRatio, overhangKRatio, overhangThetaRatio,
1382 overhangPiCov, overhangKCov, overhangThetaCov, false ) ;
1383 intronicInfos[idx].leftOverhang.classifier += update ;
1384 }
1385 }
1386 else if ( se.leftType == 1 )
1387 {
1388 ++intronicInfos[idx].leftOverhang.validCnt ;
1389 double update = GetUpdateMixtureGammaClassifier( 1.0, se.avgDepth,
1390 overhangPiRatio, overhangKRatio, overhangThetaRatio,
1391 overhangPiCov, overhangKCov, overhangThetaCov, true ) ;
1392 intronicInfos[idx].leftOverhang.classifier += update ;
1393
1394 int seIdx = intronicInfos[idx].leftSubexonIdx ;
1395 subexons[seIdx].rightClassifier -= 2.0 * log( GetPValueOfGeneEnd( se.avgDepth ) ) ;
1396 ++subexons[ seIdx ].rcCnt ;
1397 }
1398 // ignore the contribution of single-exon island here?
1399 }
1400 // Overlap on the right part of intron
1401 else if ( se.start > intronicInfos[idx].start && se.end >= intronicInfos[idx].end
1402 && subexons[ intronicInfos[idx].rightSubexonIdx ].leftType != 0 )
1403 {
1404 int len = intronicInfos[idx].end - se.start + 1 ;
1405 intronicInfos[idx].rightOverhang.length += len ;
1406 ++intronicInfos[idx].rightOverhang.cnt ;
1407
1408 // Note that the sample subexon must have a soft boundary at left hand side,
1409 // otherwise, this won't show up in intronic Info
1410 if ( se.rightType == 1 )
1411 {
1412 if ( se.rightRatio > 0 && se.avgDepth > 1 )
1413 {
1414 ++intronicInfos[idx].rightOverhang.validCnt ;
1415
1416 double update = GetUpdateMixtureGammaClassifier( se.rightRatio, se.avgDepth,
1417 overhangPiRatio, overhangKRatio, overhangThetaRatio,
1418 overhangPiCov, overhangKCov, overhangThetaCov, false ) ;
1419 intronicInfos[idx].rightOverhang.classifier += update ;
1420 }
1421 }
1422 else if ( se.rightType == 2 )
1423 {
1424 ++intronicInfos[idx].rightOverhang.validCnt ;
1425
1426 double update = GetUpdateMixtureGammaClassifier( 1, se.avgDepth,
1427 overhangPiRatio, overhangKRatio, overhangThetaRatio,
1428 overhangPiCov, overhangKCov, overhangThetaCov, true ) ;
1429 intronicInfos[idx].rightOverhang.classifier += update ;
1430
1431 int seIdx = intronicInfos[idx].rightSubexonIdx ;
1432 /*if ( subexons[ seIdx ].start == 6873648 )
1433 {
1434 printf( "%lf %lf: %lf %lf %lf\n", subexons[seIdx].leftClassifier, GetPValueOfGeneEnd( se.avgDepth ), se.avgDepth, sqrt( se.avgDepth ), log( se.avgDepth ) ) ;
1435 }*/
1436 subexons[seIdx].leftClassifier -= 2.0 * log( GetPValueOfGeneEnd( se.avgDepth ) ) ;
1437 ++subexons[ seIdx ].lcCnt ;
1438 }
1439 }
1440 // Intron is fully contained in this sample subexon, then it is a ir candidate
1441 else if ( se.start <= intronicInfos[idx].start && se.end >= intronicInfos[idx].end )
1442 {
1443 if ( se.leftType == 2 && se.rightType == 1 )
1444 {
1445 double ratio = regions.PickLeftAndRightRatio( se.leftRatio, se.rightRatio ) ;
1446 ++intronicInfos[idx].irCnt ;
1447 if ( ratio > 0 && se.avgDepth > 1 )
1448 {
1449 double update = GetUpdateMixtureGammaClassifier( ratio, se.avgDepth,
1450 irPiRatio, irKRatio, irThetaRatio,
1451 irPiCov, irKCov, irThetaCov, true ) ;
1452 //if ( intronicInfos[idx].start == 37617368 )
1453 // printf( "hi %lf %d %d: %d %d\n", update, se.start, se.end, intronicInfos[idx].start, intronicInfos[idx].end ) ;
1454 intronicInfos[idx].irClassifier += update ;
1455 ++intronicInfos[idx].validIrCnt ;
1456 }
1457 }
1458 else if ( se.leftType == 1 || se.rightType == 2 )
1459 {
1460 //intronicInfos[idx].irClassifier += LogGammaDensity( 4.0, irKRatio[1], irThetaRatio[1] )
1461 // - LogGammaDensity( 4.0, irKRatio[0], irThetaRatio[0] ) ;
1462 /*if ( se.start == 37617368 )
1463 {
1464 printf( "%lf: %lf %lf\n", se.avgDepth, MixtureGammaAssignment( ( irKCov[0] - 1 ) * irThetaCov[0], irPiRatio, irKCov, irThetaCov ),
1465 MixtureGammaAssignment( TransformCov( 4.0 ), irPiRatio, irKCov, irThetaCov ) ) ;
1466 }*/
1467 if ( se.avgDepth > 1 )
1468 {
1469 // let the depth be the threshold to determine.
1470 double update = GetUpdateMixtureGammaClassifier( 4.0, se.avgDepth,
1471 irPiRatio, irKRatio, irThetaRatio,
1472 irPiCov, irKCov, irThetaCov, true ) ;
1473 //if ( intronicInfos[idx].start == 36266630 )
1474 // printf( "hi %lf %d %d: %d %d\n", update, se.start, se.end, intronicInfos[idx].start, intronicInfos[idx].end ) ;
1475 intronicInfos[idx].irClassifier += update ;
1476 ++intronicInfos[idx].irCnt ;
1477 ++intronicInfos[idx].validIrCnt ;
1478 }
1479 }
1480 else
1481 {
1482 // the intron is contained in a overhang subexon from the sample or single-exon island
1483 }
1484 }
1485 // sample subexon is contained in the intron.
1486 else
1487 {
1488 // Do nothing.
1489 }
1490 }
1491 }
1492
1493 //if ( se.nextCnt > 0 )
1494 delete[] se.next ;
1495 //if ( se.prevCnt > 0 )
1496 delete[] se.prev ;
1497 }
1498 fclose( fp ) ;
1499
1500 /*for ( i = 0 ; i < sampleSubexonCnt ; ++i )
1501 {
1502 if ( sampleSubexons[i].nextCnt > 0 )
1503 delete[] sampleSubexons[i].next ;
1504 if ( sampleSubexons[i].prevCnt > 0 )
1505 delete[] sampleSubexons[i].prev ;
1506 }*/
1507 }
1508
1509 CleanUpSubexonConnections( subexons ) ;
1510
1511 // Convert the temporary statistics number into formal statistics result.
1512 for ( i = 0 ; i < subexonCnt ; ++i )
1513 {
1514 struct _subexon &se = subexons[i] ;
1515 if ( se.leftType == 0 && se.rightType == 0 ) // single-exon txpt.
1516 {
1517 se.leftClassifier = se.rightClassifier = 1 - chicdf( se.rightClassifier, 2 * se.rcCnt ) ;
1518 }
1519 else
1520 {
1521 if ( se.leftType == 1 )
1522 {
1523 se.leftClassifier = 1 - chicdf( se.leftClassifier, 2 * se.lcCnt ) ;
1524 }
1525 else
1526 se.leftClassifier = -1 ;
1527
1528 if ( se.rightType == 2 )
1529 se.rightClassifier = 1 - chicdf( se.rightClassifier, 2 * se.rcCnt ) ;
1530 else
1531 se.rightClassifier = -1 ;
1532 }
1533 }
1534
1535 int iiCnt = intronicInfos.size() ; //intronicInfo count
1536 for ( i = 0 ; i < iiCnt ; ++i )
1537 {
1538 struct _intronicInfo &ii = intronicInfos[i] ;
1539 if ( ii.validIrCnt > 0 )
1540 {
1541 for ( j = 0 ; j < fileCnt - ii.validIrCnt ; ++j )
1542 {
1543 ii.irClassifier -= log( 10.0 ) ;
1544 }
1545 /*if ( ii.validIrCnt < fileCnt * 0.15 )
1546 ii.irClassifier -= log( 1000.0 ) ;
1547 else if ( ii.validIrCnt < fileCnt * 0.5 )
1548 ii.irClassifier -= log( 100.0 ) ;*/
1549 ii.irClassifier = (double)1.0 / ( 1.0 + exp( ii.irClassifier + log( 1 - avgIrPiRatio ) - log( avgIrPiRatio ) ) ) ;
1550 }
1551 else
1552 ii.irClassifier = -1 ;
1553
1554 if ( ii.leftOverhang.validCnt > 0 )
1555 ii.leftOverhang.classifier = (double)1.0 / ( 1.0 + exp( ii.leftOverhang.classifier +
1556 log( 1 - avgOverhangPiRatio ) - log( avgOverhangPiRatio ) ) ) ;
1557 else
1558 ii.leftOverhang.classifier = -1 ;
1559
1560 if ( ii.rightOverhang.validCnt > 0 )
1561 ii.rightOverhang.classifier = (double)1.0 / ( 1.0 + exp( ii.rightOverhang.classifier +
1562 log( 1 - avgOverhangPiRatio ) - log( avgOverhangPiRatio ) ) ) ;
1563 else
1564 ii.rightOverhang.classifier = -1 ;
1565 }
1566
1567 // Change the classifier for the hard boundaries if its adjacent intron has intron retention classifier
1568 // which collide with overhang subexon.
1569 int intervalCnt = seIntervals.size() ;
1570 for ( i = 0 ; i < intervalCnt ; ++i )
1571 {
1572 if ( seIntervals[i].type == 1 && intronicInfos[ seIntervals[i].idx ].irCnt > 0 )
1573 {
1574 int idx = seIntervals[i].idx ;
1575 if ( intronicInfos[idx].leftOverhang.cnt > 0 )
1576 {
1577 int k = seIntervals[i - 1].idx ;
1578 // Should aim for more conservative?
1579 if ( subexons[k].rightClassifier > intronicInfos[idx].leftOverhang.classifier )
1580 subexons[k].rightClassifier = intronicInfos[idx].leftOverhang.classifier ;
1581 }
1582
1583 if ( intronicInfos[idx].rightOverhang.cnt > 0 )
1584 {
1585 int k = seIntervals[i + 1].idx ;
1586 if ( subexons[k].leftClassifier > intronicInfos[idx].rightOverhang.classifier )
1587 subexons[k].leftClassifier = intronicInfos[idx].rightOverhang.classifier ;
1588 }
1589 }
1590 }
1591
1592 // Output the result.
1593 for ( i = 0 ; i < intervalCnt ; ++i )
1594 {
1595 if ( seIntervals[i].type == 0 )
1596 {
1597 struct _subexon &se = subexons[ seIntervals[i].idx ] ;
1598
1599 char ls, rs ;
1600
1601 ls = StrandNumToSymbol( se.leftStrand ) ;
1602 rs = StrandNumToSymbol( se.rightStrand ) ;
1603
1604 printf( "%s %d %d %d %d %c %c -1 -1 -1 %lf %lf ", alignments.GetChromName( se.chrId ), se.start, se.end,
1605 se.leftType, se.rightType, ls, rs, se.leftClassifier, se.rightClassifier ) ;
1606 if ( i > 0 && seIntervals[i - 1].chrId == seIntervals[i].chrId
1607 && seIntervals[i - 1].end + 1 == seIntervals[i].start
1608 && !( seIntervals[i - 1].type == 0 &&
1609 subexons[ seIntervals[i - 1].idx ].rightType != se.leftType )
1610 && !( seIntervals[i - 1].type == 1 && intronicInfos[ seIntervals[i - 1].idx ].irCnt == 0
1611 && intronicInfos[ seIntervals[i - 1].idx ].rightOverhang.cnt == 0 )
1612 && ( se.prevCnt == 0 || se.start - 1 != se.prev[ se.prevCnt - 1 ] ) ) // The connection showed up in the subexon file.
1613 {
1614 printf( "%d ", se.prevCnt + 1 ) ;
1615 for ( j = 0 ; j < se.prevCnt ; ++j )
1616 printf( "%d ", se.prev[j] ) ;
1617 printf( "%d ", se.start - 1 ) ;
1618 }
1619 else
1620 {
1621 printf( "%d ", se.prevCnt ) ;
1622 for ( j = 0 ; j < se.prevCnt ; ++j )
1623 printf( "%d ", se.prev[j] ) ;
1624 }
1625
1626 if ( i < intervalCnt - 1 && seIntervals[i].chrId == seIntervals[i + 1].chrId
1627 && seIntervals[i].end == seIntervals[i + 1].start - 1
1628 && !( seIntervals[i + 1].type == 0 &&
1629 subexons[ seIntervals[i + 1].idx ].leftType != se.rightType )
1630 && !( seIntervals[i + 1].type == 1 && intronicInfos[ seIntervals[i + 1].idx ].irCnt == 0
1631 && intronicInfos[ seIntervals[i + 1].idx ].leftOverhang.cnt == 0 )
1632 && ( se.nextCnt == 0 || se.end + 1 != se.next[0] ) )
1633 {
1634 printf( "%d %d ", se.nextCnt + 1, se.end + 1 ) ;
1635 }
1636 else
1637 printf( "%d ", se.nextCnt ) ;
1638 for ( j = 0 ; j < se.nextCnt ; ++j )
1639 printf( "%d ", se.next[j] ) ;
1640 printf( "\n" ) ;
1641 }
1642 else if ( seIntervals[i].type == 1 )
1643 {
1644 struct _intronicInfo &ii = intronicInfos[ seIntervals[i].idx ] ;
1645 if ( ii.irCnt > 0 )
1646 {
1647 printf( "%s %d %d 2 1 . . -1 -1 -1 %lf %lf 1 %d 1 %d\n",
1648 alignments.GetChromName( ii.chrId ), ii.start, ii.end,
1649 ii.irClassifier, ii.irClassifier,
1650 seIntervals[i - 1].end, seIntervals[i + 1].start ) ;
1651 }
1652 else
1653 {
1654 // left overhang.
1655 if ( ii.leftOverhang.cnt > 0 )
1656 {
1657 printf( "%s %d %d 2 0 . . -1 -1 -1 %lf %lf 1 %d 0\n",
1658 alignments.GetChromName( ii.chrId ), ii.start,
1659 ii.start + ( ii.leftOverhang.length / ii.leftOverhang.cnt ) - 1,
1660 ii.leftOverhang.classifier, ii.leftOverhang.classifier,
1661 ii.start - 1 ) ;
1662 }
1663
1664 // right overhang.
1665 if ( ii.rightOverhang.cnt > 0 )
1666 {
1667 printf( "%s %d %d 0 1 . . -1 -1 -1 %lf %lf 0 1 %d\n",
1668 alignments.GetChromName( ii.chrId ),
1669 ii.end - ( ii.rightOverhang.length / ii.rightOverhang.cnt ) + 1, ii.end,
1670 ii.rightOverhang.classifier, ii.rightOverhang.classifier,
1671 ii.end + 1 ) ;
1672 }
1673
1674 }
1675 }
1676 }
1677
1678 return 0 ;
1679 }
1680