comparison PsiCLASS-1.0.2/blocks.hpp @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:903fc43d6227
1 // The class manage the blocks
2 // Li Song
3
4 #ifndef _LSONG_CLASSES_BLOCKS_HEADER
5 #define _LSONG_CLASSES_BLOCKS_HEADER
6
7 #include <stdlib.h>
8 #include <vector>
9 #include <map>
10 #include <assert.h>
11 #include <math.h>
12 #include <set>
13 #include <inttypes.h>
14
15 #include "defs.h"
16
17 extern bool VERBOSE ;
18 extern FILE *fpOut ;
19
20 extern int gMinDepth ;
21
22
23 struct _splitSite // means the next position belongs to another block
24 {
25 int64_t pos ;
26 char strand ;
27 int chrId ;
28 int type ; // 1-start of an exon. 2-end of an exon.
29
30 int support ;
31 int uniqSupport ;
32
33 int mismatchSum ;
34
35 int64_t oppositePos ; // the position of the other sites to form the intron.
36 } ;
37
38 struct _block
39 {
40 int chrId ;
41 int contigId ;
42 int64_t start, end ;
43 int64_t leftSplice, rightSplice ; // Some information about the left splice site and right splice site.
44 // record the leftmost and rightmost coordinates of a splice site within this block
45 //or the length of read alignment support the splice sites.
46 //Or the number of support.
47 int64_t depthSum ;
48
49 int leftType, rightType ; //0-soft boundary, 1-start of an exon, 2-end of an exon.
50
51 double leftRatio, rightRatio ; // the adjusted ratio-like value to the left and right anchor subexons.
52 double ratio ;
53
54 char leftStrand, rightStrand ;
55
56 int *depth ;
57 int prevCnt, nextCnt ; // Some connection information for the subexons.
58 int *prev ;
59 int *next ;
60 } ;
61
62 // adjacent graph
63 struct _adj
64 {
65 int ind ;
66 int info ;
67 int support ;
68 int next ;
69 } ;
70
71 class Blocks
72 {
73 private:
74 std::map<int, int> exonBlocksChrIdOffset ;
75
76 int64_t Overlap( int64_t s0, int64_t e0, int64_t s1, int64_t e1, int64_t &s, int64_t &e )
77 {
78 s = e = -1 ;
79 if ( e0 < s1 || s0 > e1 )
80 return 0 ;
81 s = s0 > s1 ? s0 : s1 ;
82 e = e0 < e1 ? e0 : e1 ;
83 return e - s + 1 ;
84 }
85
86 void Split( const char *s, char delimit, std::vector<std::string> &fields )
87 {
88 int i ;
89 fields.clear() ;
90 if ( s == NULL )
91 return ;
92
93 std::string f ;
94 for ( i = 0 ; s[i] ; ++i )
95 {
96 if ( s[i] == delimit || s[i] == '\n' )
97 {
98 fields.push_back( f ) ;
99 f.clear() ;
100 }
101 else
102 f.append( 1, s[i] ) ;
103 }
104 fields.push_back( f ) ;
105 f.clear() ;
106 }
107
108 void BuildBlockChrIdOffset()
109 {
110 // Build the map for the offsets of chr id in the exonBlock list.
111 exonBlocksChrIdOffset[ exonBlocks[0].chrId] = 0 ;
112 int cnt = exonBlocks.size() ;
113 for ( int i = 1 ; i < cnt ; ++i )
114 {
115 if ( exonBlocks[i].chrId != exonBlocks[i - 1].chrId )
116 exonBlocksChrIdOffset[ exonBlocks[i].chrId ] = i ;
117 }
118 }
119
120 bool CanReach( int from, int to, struct _adj *adj, bool *visited )
121 {
122 if ( visited[from] )
123 return false ;
124 visited[from] = true ;
125 int p ;
126 p = adj[from].next ;
127 while ( p != -1 )
128 {
129 if ( adj[p].ind == to )
130 return true ;
131 if ( adj[p].ind < to
132 && CanReach( adj[p].ind, to, adj, visited ) )
133 return true ;
134 p = adj[p].next ;
135 }
136 return false ;
137 }
138
139 void AdjustAndCreateExonBlocks( int tag, std::vector<struct _block> &newExonBlocks )
140 {
141 int i, j ;
142 if ( exonBlocks[tag].depth != NULL )
143 {
144 // Convert the increment and decrement into actual depth.
145 int len = exonBlocks[tag].end - exonBlocks[tag].start + 1 ;
146 int *depth = exonBlocks[tag].depth ;
147 for ( i = 1 ; i < len ; ++i )
148 depth[i] = depth[i - 1] + depth[i] ;
149
150 struct _block island ; // the portion created by the hollow.
151 island.start = island.end = -1 ;
152 island.depthSum = 0 ;
153
154 /*if ( exonBlocks[tag].start == 1562344 )
155 {
156 for ( i = 0 ; i < len ; ++i )
157 printf( "%d\n", depth[i] ) ;
158 }*/
159 // Adjust boundary accordingly.
160 int64_t adjustStart = exonBlocks[tag].start ;
161 int64_t adjustEnd = exonBlocks[tag].end ;
162
163 if ( exonBlocks[tag].leftType == 0 && exonBlocks[tag].rightType != 0 )
164 {
165 for ( i = len - 1 ; i >= 0 ; --i )
166 if ( depth[i] < gMinDepth )
167 break ;
168 ++i ;
169 if ( exonBlocks[tag].rightType == 2 && i + exonBlocks[tag].start > exonBlocks[tag].rightSplice )
170 i = exonBlocks[tag].rightSplice - exonBlocks[tag].start ;
171 adjustStart = i + exonBlocks[tag].start ;
172
173 if ( adjustStart > exonBlocks[tag].start )
174 {
175 int s, e ;
176 // firstly [s,e] is the range of depth array.
177 for ( s = 0 ; s < i ; ++s )
178 if ( depth[s] >= gMinDepth )
179 break ;
180 for ( e = i - 1 ; e >= s ; -- e )
181 if ( depth[e] >= gMinDepth )
182 break ;
183 if ( e >= s )
184 {
185 island = exonBlocks[tag] ;
186 island.depthSum = 0 ;
187 island.leftType = island.rightType = 0 ;
188 for ( j = s ; j <= e ; ++j )
189 island.depthSum += depth[j] ;
190 island.start = s + exonBlocks[tag].start ; // offset the coordinate.
191 island.end = e + exonBlocks[tag].start ;
192 }
193 }
194 }
195 else if ( exonBlocks[tag].leftType != 0 && exonBlocks[tag].rightType == 0 )
196 {
197 for ( i = 0 ; i < len ; ++i )
198 if ( depth[i] < gMinDepth )
199 break ;
200 --i ;
201 if ( exonBlocks[tag].leftType == 1 && i + exonBlocks[tag].start < exonBlocks[tag].leftSplice )
202 i = exonBlocks[tag].leftSplice - exonBlocks[tag].start ;
203 adjustEnd = i + exonBlocks[tag].start ;
204
205 if ( adjustEnd < exonBlocks[tag].end )
206 {
207 int s, e ;
208 // firstly [s,e] is the range of depth array.
209 for ( s = i + 1 ; s < len ; ++s )
210 if ( depth[s] >= gMinDepth )
211 break ;
212 for ( e = len - 1 ; e >= s ; --e )
213 if ( depth[e] >= gMinDepth )
214 break ;
215 if ( e >= s )
216 {
217 island = exonBlocks[tag] ;
218 island.depthSum = 0 ;
219 island.leftType = island.rightType = 0 ;
220 for ( j = s ; j <= e ; ++j )
221 island.depthSum += depth[j] ;
222 island.start = s + exonBlocks[tag].start ; // offset the coordinate.
223 island.end = e + exonBlocks[tag].start ;
224 }
225 }
226 }
227 else if ( exonBlocks[tag].leftType == 0 && exonBlocks[tag].rightType == 0 )
228 {
229 for ( i = 0 ; i < len ; ++i )
230 if ( depth[i] >= gMinDepth )
231 break ;
232 adjustStart = i + exonBlocks[tag].start ;
233
234 for ( i = len - 1 ; i >= 0 ; --i )
235 if ( depth[i] >= gMinDepth )
236 break ;
237 adjustEnd = i + exonBlocks[tag].start ;
238 }
239 else if ( exonBlocks[tag].leftType == 1 && exonBlocks[tag].rightType == 2
240 && exonBlocks[tag].end - exonBlocks[tag].start + 1 >= 1000 )
241 {
242 // The possible merge of two genes or merge of UTRs, if we don't break the low coverage part.
243 // If we decide to cut, I'll reuse the variable "island" to represent the subexon on right hand side.
244 int l ;
245 int gapSize = 30 ;
246 for ( i = 0 ; i <= len - ( gapSize + 1 ) ; ++i )
247 {
248 if ( depth[i] < gMinDepth )
249 {
250 for ( l = i ; l < i + ( gapSize + 1 ) ; ++l )
251 if ( depth[l] >= gMinDepth )
252 break ;
253 if ( l < i + ( gapSize + 1 ) )
254 {
255 i = l - 1 ;
256 continue ;
257 }
258 else
259 break ;
260 }
261 }
262
263 for ( j = len - 1 ; j >= ( gapSize + 1 ) - 1 ; --j )
264 {
265 if ( depth[j] < gMinDepth )
266 {
267 for ( l = j ; l >= j - ( gapSize + 1 ) + 1 ; --l )
268 if ( depth[l] >= gMinDepth )
269 break ;
270
271 if ( l >= j - ( gapSize + 1 ) + 1 )
272 {
273 j = l + 1 ;
274 continue ;
275 }
276 else
277 break ;
278 }
279 }
280
281 if ( j - i + 1 > gapSize )
282 {
283 // we break.
284 --i ; ++j ;
285
286 adjustEnd = i + exonBlocks[tag].start ;
287
288 if ( j < len - 1 )
289 {
290 island = exonBlocks[tag] ;
291 island.depthSum = 0 ;
292 island.leftType = 0 ;
293 for ( l = j ; l <= len - 1 ; ++l )
294 island.depthSum += depth[j] ;
295 island.start = j + exonBlocks[tag].start ; // offset the coordinate.
296 island.end = exonBlocks[tag].end ;
297 }
298
299 exonBlocks[tag].rightType = 0 ; // put it here, so the island can get the right info
300 //fprintf( stderr, "break %d %d %d %d.\n", (int)exonBlocks[tag].start, (int)adjustEnd, i + (int)exonBlocks[tag].start, j + (int)exonBlocks[tag].start ) ;
301 }
302 }
303
304 int lostDepthSum = 0 ;
305 for ( i = exonBlocks[tag].start ; i < adjustStart ; ++i )
306 lostDepthSum += depth[i - exonBlocks[tag].start ] ;
307 for ( i = adjustEnd + 1 ; i < exonBlocks[tag].end ; ++i )
308 lostDepthSum += depth[i - exonBlocks[tag].start ] ;
309 exonBlocks[tag].depthSum -= lostDepthSum ;
310
311 delete[] exonBlocks[tag].depth ;
312 exonBlocks[tag].depth = NULL ;
313
314 //if ( exonBlocks[tag].start == 1562344 )
315 // printf( "%d %d\n", adjustStart, adjustEnd ) ;
316 if ( ( len > 1 && adjustEnd - adjustStart + 1 <= 1 ) || ( adjustEnd - adjustStart + 1 <= 0 ) )
317 return ;
318 exonBlocks[tag].start = adjustStart ;
319 exonBlocks[tag].end = adjustEnd ;
320
321 if ( island.start != -1 && island.end < exonBlocks[tag].start )
322 newExonBlocks.push_back( island ) ;
323
324 newExonBlocks.push_back( exonBlocks[tag] ) ;
325 if ( island.start != -1 && island.start > exonBlocks[tag].end )
326 newExonBlocks.push_back( island ) ;
327 }
328 }
329 public:
330 std::vector<struct _block> exonBlocks ;
331
332 Blocks() { }
333 ~Blocks()
334 {
335 int blockCnt = exonBlocks.size() ;
336 for ( int i = 0 ; i < blockCnt ; ++i )
337 {
338 if ( exonBlocks[i].next != NULL )
339 {
340 delete[] exonBlocks[i].next ;
341 }
342 if ( exonBlocks[i].prev != NULL )
343 {
344 delete[] exonBlocks[i].prev ;
345 }
346 }
347 }
348
349 double GetAvgDepth( const struct _block &block )
350 {
351 return block.depthSum / (double)( block.end - block.start + 1 ) ;
352 }
353
354 int BuildExonBlocks( Alignments &alignments )
355 {
356 int tag = 0 ;
357 while ( alignments.Next() )
358 {
359 int i, j, k ;
360 int segCnt = alignments.segCnt ;
361 struct _pair *segments = alignments.segments ;
362 int eid = 0 ; // the exonblock id that the segment update
363
364 // locate the first exonblock that beyond the start of the read.
365 while ( tag < (int)exonBlocks.size() && ( exonBlocks[tag].end < segments[0].a - 1
366 || exonBlocks[tag].chrId != alignments.GetChromId() ) )
367 {
368 ++tag ;
369 }
370 // due to the merge of exon blocks, we might need roll back
371 --tag ;
372 while ( tag >= 0 && ( exonBlocks[tag].end >= segments[0].a - 1
373 && exonBlocks[tag].chrId == alignments.GetChromId() ) )
374 {
375 --tag ;
376 }
377 ++tag ;
378
379 for ( i = 0 ; i < segCnt ; ++i )
380 {
381 //if ( i == 0 )
382 // printf( "hi %d %s %d %d\n", i, alignments.GetReadId(), segments[i].a, segments[i].b ) ;
383 for ( j = tag ; j < (int)exonBlocks.size() ; ++j )
384 {
385 if ( exonBlocks[j].end >= segments[i].a - 1 )
386 break ;
387 }
388 if ( j >= (int)exonBlocks.size() )
389 {
390 // Append a new block
391 struct _block newSeg ;
392 newSeg.chrId = alignments.GetChromId() ;
393 newSeg.start = segments[i].a ;
394 newSeg.end = segments[i].b ;
395
396 newSeg.leftSplice = -1 ;
397 newSeg.rightSplice = -1 ;
398 if ( i > 0 )
399 newSeg.leftSplice = segments[i].a ;
400 if ( i < segCnt - 1 )
401 newSeg.rightSplice = segments[i].b ;
402
403 exonBlocks.push_back( newSeg ) ;
404
405 eid = exonBlocks.size() - 1 ;
406 }
407 else if ( exonBlocks[j].end < segments[i].b ||
408 ( exonBlocks[j].start > segments[i].a && exonBlocks[j].start <= segments[i].b + 1 ) )
409 {
410 // If overlaps with a current exon block, so we extend it
411 if ( exonBlocks[j].end < segments[i].b )
412 {
413 // extends toward right
414 exonBlocks[j].end = segments[i].b ;
415 if ( i > 0 && ( exonBlocks[j].leftSplice == -1 || segments[i].a < exonBlocks[j].leftSplice ) )
416 exonBlocks[j].leftSplice = segments[i].a ;
417 if ( i < segCnt - 1 && segments[i].b > exonBlocks[j].rightSplice )
418 exonBlocks[j].rightSplice = segments[i].b ;
419 eid = j ;
420
421 // Merge with next few exon blocks
422 for ( k = j + 1 ; k < (int)exonBlocks.size() ; ++k )
423 {
424 if ( exonBlocks[k].start <= exonBlocks[j].end + 1 )
425 {
426 if ( exonBlocks[k].end > exonBlocks[j].end )
427 exonBlocks[j].end = exonBlocks[k].end ;
428
429 if ( exonBlocks[k].leftSplice != -1 && ( exonBlocks[j].leftSplice == -1 || exonBlocks[k].leftSplice < exonBlocks[j].leftSplice ) )
430 exonBlocks[j].leftSplice = exonBlocks[k].leftSplice ;
431 if ( exonBlocks[k].rightSplice != -1 && exonBlocks[k].rightSplice > exonBlocks[j].rightSplice )
432 exonBlocks[j].rightSplice = exonBlocks[k].rightSplice ;
433
434 }
435 else
436 break ;
437 }
438
439 if ( k > j + 1 )
440 {
441 // Remove the merged blocks
442 int a, b ;
443 for ( a = j + 1, b = k ; b < (int)exonBlocks.size() ; ++a, ++b )
444 exonBlocks[a] = exonBlocks[b] ;
445 for ( a = 0 ; a < k - ( j + 1 ) ; ++a )
446 exonBlocks.pop_back() ;
447 }
448 }
449 else if ( exonBlocks[j].start > segments[i].a && exonBlocks[j].start <= segments[i].b + 1 )
450 {
451 // extends toward left
452 exonBlocks[j].start = segments[i].a ;
453 if ( i > 0 && ( exonBlocks[j].leftSplice == -1 || segments[i].a < exonBlocks[j].leftSplice ) )
454 exonBlocks[j].leftSplice = segments[i].a ;
455 if ( i < segCnt - 1 && segments[i].b > exonBlocks[j].rightSplice )
456 exonBlocks[j].rightSplice = segments[i].b ;
457 eid = j ;
458
459 // Merge with few previous exon blocks
460 for ( k = j - 1 ; k >= 0 ; --k )
461 {
462 if ( exonBlocks[k].end >= exonBlocks[k + 1].start - 1 )
463 {
464 if ( exonBlocks[k + 1].start < exonBlocks[k].start )
465 {
466 exonBlocks[k].start = exonBlocks[k + 1].start ;
467 }
468
469 if ( exonBlocks[k].leftSplice != -1 && ( exonBlocks[j].leftSplice == -1 || exonBlocks[k].leftSplice < exonBlocks[j].leftSplice ) )
470 exonBlocks[j].leftSplice = exonBlocks[k].leftSplice ;
471 if ( exonBlocks[k].rightSplice != -1 && exonBlocks[k].rightSplice > exonBlocks[j].rightSplice )
472 exonBlocks[j].rightSplice = exonBlocks[k].rightSplice ;
473
474 }
475 else
476 break ;
477 }
478
479 if ( k < j - 1 )
480 {
481 int a, b ;
482 eid = k + 1 ;
483 for ( a = k + 2, b = j + 1 ; b < (int)exonBlocks.size() ; ++a, ++b )
484 exonBlocks[a] = exonBlocks[b] ;
485 for ( a = 0 ; a < ( j - 1 ) - k ; ++a )
486 exonBlocks.pop_back() ;
487
488 }
489 }
490 }
491 else if ( exonBlocks[j].start > segments[i].b + 1 )
492 {
493 int size = exonBlocks.size() ;
494 int a ;
495 // No overlap, insert a new block
496 struct _block newSeg ;
497 newSeg.chrId = alignments.GetChromId() ;
498 newSeg.start = segments[i].a ;
499 newSeg.end = segments[i].b ;
500
501 newSeg.leftSplice = -1 ;
502 newSeg.rightSplice = -1 ;
503 if ( i > 0 )
504 newSeg.leftSplice = segments[i].a ;
505 if ( i < segCnt - 1 )
506 newSeg.rightSplice = segments[i].b ;
507
508 // Insert at position j
509 exonBlocks.push_back( newSeg ) ;
510 for ( a = size ; a > j ; --a )
511 exonBlocks[a] = exonBlocks[a - 1] ;
512 exonBlocks[a] = newSeg ;
513
514 eid = a ;
515 }
516 else
517 {
518 // The segment is contained in j
519 eid = -1 ;
520 }
521
522 // Merge the block with the mate if the gap is very small
523 // Note that since reads are already sorted by coordinate,
524 // the previous exon blocks is built completely.
525 /*int64_t matePos ;
526 int mateChrId ;
527 alignments.GetMatePosition( mateChrId, matePos ) ;
528 if ( i == 0 && eid > 0 && mateChrId == exonBlocks[ eid ].chrId
529 && matePos < segments[0].a
530 && matePos >= exonBlocks[eid - 1].start
531 && matePos <= exonBlocks[eid - 1].end
532 && segments[0].a - matePos + 1 <= 500
533 && exonBlocks[eid-1].chrId == exonBlocks[eid].chrId
534 && exonBlocks[eid].start - exonBlocks[eid - 1].end - 1 <= 30 )
535 {
536 printf( "%d: (%d-%d) (%d-%d). %d %d\n", ( segments[0].a + alignments.readLen - 1 ) - matePos + 1, exonBlocks[eid - 1].start, exonBlocks[eid - 1].end,
537 exonBlocks[eid].start,
538 exonBlocks[eid].end, eid, exonBlocks.size() ) ;
539 exonBlocks[eid - 1].end = exonBlocks[eid].end ;
540
541 if ( exonBlocks[eid].leftSplice != -1 && ( exonBlocks[eid - 1].leftSplice == -1 || exonBlocks[eid].leftSplice < exonBlocks[eid - 1].leftSplice ) )
542 exonBlocks[eid - 1].leftSplice = exonBlocks[k].leftSplice ;
543 if ( exonBlocks[eid].rightSplice != -1 && exonBlocks[eid].rightSplice > exonBlocks[eid - 1].rightSplice )
544 exonBlocks[eid - 1].rightSplice = exonBlocks[eid].rightSplice ;
545
546 int size = exonBlocks.size() ;
547 for ( j = eid ; j < size - 1 ; ++j )
548 exonBlocks[j] = exonBlocks[j + 1] ;
549 exonBlocks.pop_back() ;
550 }*/
551 }
552 }
553 /*for ( int i = 0 ; i < (int)exonBlocks.size() ; ++i )
554 {
555 printf( "%d %d\n", exonBlocks[i].start, exonBlocks[i].end ) ;
556 }*/
557
558 if ( exonBlocks.size() > 0 )
559 {
560 BuildBlockChrIdOffset() ;
561 int cnt = exonBlocks.size() ;
562 for ( int i = 0 ; i < cnt ; ++i )
563 {
564 exonBlocks[i].contigId = exonBlocks[i].chrId ;
565
566 exonBlocks[i].leftType = 0 ;
567 exonBlocks[i].rightType = 0 ;
568 exonBlocks[i].depth = NULL ;
569 exonBlocks[i].nextCnt = 0 ;
570 exonBlocks[i].prevCnt = 0 ;
571 exonBlocks[i].leftStrand = '.' ;
572 exonBlocks[i].rightStrand = '.' ;
573 }
574 }
575 return exonBlocks.size() ;
576 }
577
578 void FilterSplitSitesInRegions( std::vector<struct _splitSite> &sites )
579 {
580 int i, j, k ;
581 int size = sites.size() ;
582 int bsize = exonBlocks.size() ;
583 int tag = 0 ;
584
585 for ( i = 0 ; i < bsize ; ++i )
586 {
587 while ( tag < size && ( sites[tag].chrId < exonBlocks[i].chrId ||
588 ( sites[tag].chrId == exonBlocks[i].chrId && sites[tag].pos < exonBlocks[i].start ) ) )
589 {
590 ++tag ;
591 }
592 if ( tag >= size )
593 break ;
594
595 if ( sites[tag].chrId != exonBlocks[i].chrId || sites[tag].pos > exonBlocks[i].end )
596 continue ;
597
598 for ( j = tag + 1 ; j < size ; ++j )
599 if ( sites[j].chrId != exonBlocks[i].chrId || sites[j].pos > exonBlocks[i].end )
600 break ;
601
602 // [tag,j) holds the split sites in this region.
603 int leftStrandSupport[2] = {0, 0} ;
604 int rightStrandSupport[2] = {0, 0} ;
605 int strandCnt[2] = { 0, 0 } ;
606 for ( k = tag ; k < j ; ++k )
607 {
608 if ( sites[k].strand == '-' )
609 {
610 ++strandCnt[0] ;
611 if ( sites[k].oppositePos < exonBlocks[i].start )
612 leftStrandSupport[0] += sites[k].support ;
613 else if ( sites[k].oppositePos > exonBlocks[i].end )
614 rightStrandSupport[0] += sites[k].support ;
615 }
616 else if ( sites[k].strand == '+' )
617 {
618 ++strandCnt[1] ;
619 if ( sites[k].oppositePos < exonBlocks[i].start )
620 leftStrandSupport[1] += sites[k].support ;
621 else if ( sites[k].oppositePos > exonBlocks[i].end )
622 rightStrandSupport[1] += sites[k].support ;
623 }
624
625 }
626
627
628 if ( leftStrandSupport[0] == 0 && rightStrandSupport[0] == 0
629 && leftStrandSupport[1] != 0 && rightStrandSupport[1] != 0 && strandCnt[0] == 2 )
630 {
631 // check whether a different strand accidentally show up in this region.
632 bool remove = false ;
633 for ( k = tag ; k < j ; ++k )
634 {
635 if ( sites[k].strand == '-' &&
636 sites[k].oppositePos >= sites[k].pos && sites[k].oppositePos <= exonBlocks[i].end &&
637 sites[k].support <= 2 )
638 {
639 remove = true ;
640 break ;
641 }
642 }
643
644 if ( remove )
645 for ( k = tag ; k < j ; ++k )
646 if ( sites[k].strand == '-' )
647 sites[k].support = -1 ;
648 }
649 else if ( leftStrandSupport[1] == 0 && rightStrandSupport[1] == 0
650 && leftStrandSupport[0] != 0 && rightStrandSupport[0] != 0 && strandCnt[1] == 2 )
651 {
652 // check whether a different strand accidentally show up in this region.
653 bool remove = false ;
654 for ( k = tag ; k < j ; ++k )
655 {
656 if ( sites[k].strand == '+' &&
657 sites[k].oppositePos >= sites[k].pos && sites[k].oppositePos <= exonBlocks[i].end &&
658 sites[k].support <= 2 )
659 {
660 remove = true ;
661 break ;
662 }
663 }
664
665 if ( remove )
666 for ( k = tag ; k < j ; ++k )
667 if ( sites[k].strand == '+' )
668 sites[k].support = -1 ;
669 }
670
671 tag = j ;
672 }
673
674 k = 0 ;
675 for ( i = 0 ; i < size ; ++i )
676 if ( sites[i].support > 0 )
677 {
678 sites[k] = sites[i] ;
679 ++k ;
680 }
681 sites.resize( k ) ;
682 }
683
684 // Filter the pair of split sites that is likely merge two genes.
685 // We filter the case like [..]------------------------[..]
686 // [..]--[..] [..]-[...]
687 void FilterGeneMergeSplitSites( std::vector<struct _splitSite> &sites )
688 {
689 int i, j, k ;
690 int bsize = exonBlocks.size() ;
691 int ssize = sites.size() ;
692
693 struct _pair32 *siteToBlock = new struct _pair32[ ssize ] ;
694 struct _adj *adj = new struct _adj[ ssize / 2 + bsize ] ;
695 bool *visited = new bool[bsize] ;
696
697 int adjCnt = 0 ;
698 for ( i = 0 ; i < bsize ; ++i )
699 {
700 adj[i].info = 0 ; // in the header, info means the number of next block
701 adj[i].support = 0 ;
702 adj[i].next = -1 ;
703 }
704 adjCnt = i ;
705 memset( siteToBlock, -1, sizeof( struct _pair32 ) * ssize ) ;
706 memset( visited, false, sizeof( bool ) * bsize ) ;
707
708 // Build the graph.
709 k = 0 ;
710 for ( i = 0 ; i < bsize ; ++i )
711 {
712 for ( ; k < ssize ; ++k )
713 {
714 if ( sites[k].oppositePos < sites[k].pos )
715 continue ;
716 if ( sites[k].chrId < exonBlocks[i].chrId
717 || ( sites[k].chrId == exonBlocks[i].chrId && sites[k].pos < exonBlocks[i].start ) )
718 continue ;
719 break ;
720 }
721
722 for ( ; k < ssize ; ++k )
723 {
724 if ( sites[k].chrId > exonBlocks[i].chrId
725 || sites[k].pos > exonBlocks[i].end )
726 break ;
727
728 if ( sites[k].oppositePos <= exonBlocks[i].end ) // ignore self-loop
729 continue ;
730
731 for ( j = i + 1 ; j < bsize ; ++j )
732 if ( sites[k].oppositePos >= exonBlocks[j].start && sites[k].oppositePos <= exonBlocks[j].end )
733 break ;
734 if ( sites[k].oppositePos >= exonBlocks[j].start && sites[k].oppositePos <= exonBlocks[j].end )
735 {
736 int p ;
737 p = adj[i].next ;
738 while ( p != -1 )
739 {
740 if ( adj[p].ind == j )
741 {
742 ++adj[p].info ;
743 adj[p].support += sites[k].uniqSupport ;
744 break ;
745 }
746 p = adj[p].next ;
747 }
748 if ( p == -1 )
749 {
750 adj[ adjCnt ].ind = j ;
751 adj[ adjCnt ].info = 1 ;
752 adj[ adjCnt ].support = sites[k].uniqSupport ;
753 adj[ adjCnt ].next = adj[i].next ;
754 adj[i].next = adjCnt ;
755 ++adj[i].info ;
756 ++adjCnt ;
757 }
758
759 siteToBlock[k].a = i ;
760 siteToBlock[k].b = j ;
761 }
762 }
763 }
764 for ( k = 0 ; k < ssize ; ++k )
765 {
766 if ( sites[k].oppositePos - sites[k].pos + 1 < 20000 || sites[k].uniqSupport >= 30 )
767 continue ;
768
769 int from = siteToBlock[k].a ;
770 int to = siteToBlock[k].b ;
771 if ( to - from - 1 < 2 || adj[from].info <= 1 )
772 continue ;
773
774 memset( &visited[from], false, sizeof( bool ) * ( to - from + 1 ) ) ;
775
776 int cnt = 0 ;
777 int p = adj[from].next ;
778 int max = -1 ;
779 int maxP = 0 ;
780 while ( p != -1 )
781 {
782 if ( adj[p].support >= 10 && adj[p].ind <= to )
783 ++cnt ;
784 if ( adj[p].support > max || ( adj[p].support == max && adj[p].ind == to ) )
785 {
786 max = adj[p].support ;
787 maxP = p ;
788 }
789
790 if ( adj[p].ind == to && adj[p].info > 1 )
791 {
792 cnt = 0 ;
793 break ;
794 }
795 p = adj[p].next ;
796 }
797 if ( cnt <= 1 )
798 continue ;
799
800 if ( max != -1 && adj[maxP].ind == to )
801 continue ;
802
803 // No other path can reach "to"
804 p = adj[from].next ;
805 while ( p != -1 )
806 {
807 if ( adj[p].ind != to )
808 {
809 //if ( sites[k].pos ==43917158 )
810 // printf( "hi %d %d. (%d %d)\n", from, adj[p].ind, exonBlocks[ adj[p].ind ].start, exonBlocks[ adj[p].ind ].end ) ;
811 if ( CanReach( adj[p].ind, to, adj, visited ) )
812 break ;
813 }
814 p = adj[p].next ;
815 }
816 if ( p != -1 )
817 continue ;
818
819 //if ( sites[k].pos == 34073267 )
820 // printf( "hi %d %d. (%d %d)\n", from, to, exonBlocks[to].start, exonBlocks[to].end ) ;
821
822 // There are some blocks between that can reach "to"
823 for ( i = from + 1 ; i < to ; ++i )
824 {
825 p = adj[i].next ;
826 while ( p != -1 )
827 {
828 if ( adj[p].ind == to && adj[p].support >= 10 )
829 break ;
830 p = adj[p].next ;
831 }
832 if ( p != -1 )
833 break ;
834 }
835 if ( i >= to )
836 continue ;
837
838 // Filter the site
839 //printf( "filtered: %d %d\n", sites[k].pos, sites[k].oppositePos ) ;
840 sites[k].support = -1 ;
841 for ( j = k + 1 ; j < ssize ; ++j )
842 {
843 if ( sites[j].pos == sites[k].oppositePos && sites[j].oppositePos == sites[k].pos )
844 sites[j].support = -1 ;
845 if ( sites[j].pos > sites[k].oppositePos )
846 break ;
847 }
848 }
849
850 k = 0 ;
851 for ( i = 0 ; i < ssize ; ++i )
852 if ( sites[i].support > 0 )
853 {
854 sites[k] = sites[i] ;
855 ++k ;
856 }
857 sites.resize( k ) ;
858
859 // Filter the intron on the different strand of a gene.
860 ssize = k ;
861 k = 0 ;
862 for ( i = 0 ; i < bsize ; )
863 {
864 int farthest = exonBlocks[i].end ;
865
866 for ( j = i ; j < bsize ; ++j )
867 {
868 if ( exonBlocks[j].start > farthest || exonBlocks[i].chrId != exonBlocks[j].chrId )
869 break ;
870 int p = adj[i].next ;
871 while ( p != -1 )
872 {
873 if ( exonBlocks[ adj[p].ind ].end > farthest )
874 farthest = exonBlocks[ adj[p].ind ].end ;
875 p = adj[p].next ;
876 }
877 }
878
879 for ( ; k < ssize ; ++k )
880 {
881 if ( sites[k].chrId < exonBlocks[i].chrId ||
882 ( sites[k].chrId == exonBlocks[i].chrId && sites[k].pos < exonBlocks[i].start ) )
883 continue ;
884 break ;
885 }
886
887 if ( sites[k].chrId > exonBlocks[i].chrId || sites[k].pos > farthest )
888 {
889 i = j ;
890 continue ;
891 }
892 //printf( "%d %d. %d %d. %d %d\n", i, j, exonBlocks[i].start, farthest, k, sites[k].pos ) ;
893
894
895 int from = k ;
896 int to ;
897 for ( to = k ; to < ssize ; ++to )
898 if ( sites[to].chrId != exonBlocks[i].chrId || sites[to].pos > farthest )
899 break ;
900
901 int strandSupport[2] = {0, 0};
902 int strandCount[2] = {0, 0};
903 for ( k = from ; k < to ; ++k )
904 {
905 if ( sites[k].oppositePos <= sites[k].pos )
906 continue ;
907
908 int tag = ( sites[k].strand == '+' ? 1 : 0 ) ;
909 strandSupport[tag] += sites[k].support ;
910 ++strandCount[tag] ;
911 }
912
913 // Not mixtured.
914 if ( strandCount[0] * strandCount[1] == 0 )
915 {
916 i = j ;
917 k = to ;
918 continue ;
919 }
920
921 char removeStrand = 0 ;
922 if ( strandCount[0] == 1 && strandCount[1] >= 3
923 && strandSupport[1] >= 20 * strandSupport[0] && strandSupport[0] <= 5 )
924 {
925 removeStrand = '-' ;
926 }
927 else if ( strandCount[1] == 1 && strandCount[0] >= 3
928 && strandSupport[0] >= 20 * strandSupport[1] && strandSupport[1] <= 5 )
929 {
930 removeStrand = '+' ;
931 }
932
933 if ( removeStrand == 0 )
934 {
935 i = j ; k = to ;
936 continue ;
937 }
938
939 for ( k = from ; k < to ; ++k )
940 {
941 if ( sites[k].strand == removeStrand && sites[k].oppositePos >= exonBlocks[i].start && sites[k].oppositePos <= farthest )
942 {
943 //printf( "filtered: %d %d\n", sites[k].pos, sites[k].oppositePos ) ;
944 sites[k].support = -1 ;
945 }
946 }
947
948 i = j ;
949 k = to ;
950 }
951
952 k = 0 ;
953 for ( i = 0 ; i < ssize ; ++i )
954 if ( sites[i].support > 0 )
955 {
956 sites[k] = sites[i] ;
957 ++k ;
958 }
959 sites.resize( k ) ;
960
961 delete[] visited ;
962 delete[] siteToBlock ;
963 delete[] adj ;
964 }
965
966 void SplitBlocks( Alignments &alignments, std::vector< struct _splitSite > &splitSites )
967 {
968 std::vector<struct _block> rawExonBlocks = exonBlocks ;
969 int i, j ;
970 int tag = 0 ;
971 int bsize = rawExonBlocks.size() ;
972 int ssize = splitSites.size() ;
973
974 // Make sure not overflow
975 struct _splitSite tmp ;
976 tmp.pos = -1 ;
977 splitSites.push_back( tmp ) ;
978
979 exonBlocks.clear() ;
980 for ( i = 0 ; i < bsize ; ++i )
981 {
982 while ( tag < ssize && ( splitSites[tag].chrId < rawExonBlocks[i].chrId ||
983 ( splitSites[tag].chrId == rawExonBlocks[i].chrId && splitSites[tag].pos < rawExonBlocks[i].start ) ) )
984 ++tag ;
985
986 int l ;
987 for ( l = tag ; l < ssize ; ++l )
988 if ( splitSites[l].chrId != rawExonBlocks[i].chrId || splitSites[l].pos > rawExonBlocks[i].end )
989 break ;
990 int64_t start = rawExonBlocks[i].start ;
991 int64_t end = rawExonBlocks[i].end ;
992 //printf( "%lld %lld\n", start, end ) ;
993 //printf( "%s %s: %d %d\n", alignments.GetChromName( splitSites[tag].chrId ), alignments.GetChromName( rawExonBlocks[i].chrId ), tag, l ) ;
994 for ( j = tag ; j <= l ; ++j )
995 {
996 int leftType = 0 ;
997 int rightType = 0 ;
998 if ( j > tag )
999 {
1000 start = splitSites[j - 1].pos ; // end is from previous stage
1001 leftType = splitSites[j - 1].type ;
1002 }
1003 else
1004 start = rawExonBlocks[i].start ;
1005
1006 if ( j <= l - 1 )
1007 {
1008 end = splitSites[j].pos ;
1009 rightType = splitSites[j].type ;
1010 }
1011 else
1012 end = rawExonBlocks[i].end ;
1013
1014 if ( leftType == 2 )
1015 ++start ;
1016 if ( rightType == 1 )
1017 --end ;
1018
1019 struct _block tmpB ;
1020 tmpB = rawExonBlocks[i] ;
1021 tmpB.start = start ;
1022 tmpB.end = end ;
1023 tmpB.depthSum = 0 ;
1024 tmpB.ratio = 0 ;
1025 //printf( "\t%lld %lld\n", start, end ) ;
1026 tmpB.leftType = leftType ;
1027 tmpB.rightType = rightType ;
1028 tmpB.depth = NULL ;
1029
1030 exonBlocks.push_back( tmpB ) ;
1031 // handle the soft boundary is the same as the hard boundary case
1032 // or adjacent splice sites
1033 /*if ( j == tag && start == end )
1034 {
1035 exonBlocks.pop_back() ;
1036 --end ;
1037 }
1038 else if ( j == l && start > end )
1039 {
1040 exonBlocks.pop_back() ;
1041 }*/
1042 if ( start > end
1043 //|| ( tmpB.start == rawExonBlocks[i].start && tmpB.end != rawExonBlocks[i].end && tmpB.end - tmpB.start + 1 <= 7 )
1044 //|| ( tmpB.end == rawExonBlocks[i].end && tmpB.start != rawExonBlocks[i].start && tmpB.end - tmpB.start + 1 <= 7 )) // the case of too short overhang
1045 || ( tmpB.leftType == 0 && tmpB.rightType == 2 && tmpB.end - tmpB.start + 1 <= 9 )
1046 || ( tmpB.leftType == 1 && tmpB.rightType == 0 && tmpB.end - tmpB.start + 1 <= 9 ) )
1047 {
1048 exonBlocks.pop_back() ;
1049 }
1050 /*else if ( start == end )
1051 {
1052
1053 }*/
1054 }
1055 }
1056 BuildBlockChrIdOffset() ;
1057 }
1058
1059 void ComputeDepth( Alignments &alignments )
1060 {
1061 // Go through the alignment list again to fill in the depthSum;
1062 int i ;
1063 int j ;
1064 int tag = 0 ;
1065 int blockCnt = exonBlocks.size() ;
1066
1067 std::vector<struct _block> newExonBlocks ;
1068 while ( alignments.Next() )
1069 {
1070 int segCnt = alignments.segCnt ;
1071 struct _pair *segments = alignments.segments ;
1072
1073 while ( tag < blockCnt && ( exonBlocks[tag].chrId < alignments.GetChromId() ||
1074 ( exonBlocks[tag].chrId == alignments.GetChromId() && exonBlocks[tag].end < segments[0].a ) ) )
1075 {
1076 AdjustAndCreateExonBlocks( tag, newExonBlocks ) ;
1077 ++tag ;
1078 }
1079 for ( i = 0 ; i < segCnt ; ++i )
1080 {
1081 for ( j = tag ; j < blockCnt && ( exonBlocks[j].chrId == alignments.GetChromId() && exonBlocks[j].start <= segments[i].b ) ; ++j )
1082 {
1083 int64_t s = -1, e = -1 ;
1084 exonBlocks[j].depthSum += Overlap( segments[i].a, segments[i].b, exonBlocks[j].start, exonBlocks[j].end, s, e ) ;
1085 if ( s == -1 )
1086 continue ;
1087
1088 if ( exonBlocks[j].depth == NULL )
1089 {
1090 int len = exonBlocks[j].end - exonBlocks[j].start + 1 ;
1091 exonBlocks[j].depth = new int[len + 1] ;
1092 memset( exonBlocks[j].depth, 0, sizeof( int ) * ( len + 1 ) ) ;
1093 exonBlocks[j].leftSplice = exonBlocks[j].start ;
1094 exonBlocks[j].rightSplice = exonBlocks[j].end ;
1095 }
1096 int *depth = exonBlocks[j].depth ;
1097 ++depth[s - exonBlocks[j].start] ;
1098 --depth[e - exonBlocks[j].start + 1] ; // notice the +1 here, since the decrease of coverage actually happens at next base.
1099
1100 // record the longest alignment stretch support the splice site.
1101 if ( exonBlocks[j].leftType == 1 && segments[i].a == exonBlocks[j].start
1102 && e > exonBlocks[j].leftSplice )
1103 {
1104 exonBlocks[j].leftSplice = e ;
1105 }
1106 if ( exonBlocks[j].rightType == 2 && segments[i].b == exonBlocks[j].end
1107 && s < exonBlocks[j].rightSplice )
1108 {
1109 exonBlocks[j].rightSplice = s ;
1110 }
1111 }
1112 }
1113 }
1114
1115 for ( ; tag < blockCnt ; ++tag )
1116 AdjustAndCreateExonBlocks( tag, newExonBlocks ) ;
1117 exonBlocks.clear() ;
1118
1119 // Due to multi-alignment, we may skip some alignments that determines leftSplice and
1120 // rightSplice, hence filtered some subexons. As a result, some subexons' anchor subexon will disapper
1121 // and we need to change its boundary type.
1122 blockCnt = newExonBlocks.size() ;
1123 for ( i = 0 ; i < blockCnt ; ++i )
1124 {
1125 if ( ( i == 0 && newExonBlocks[i].leftType == 2 ) ||
1126 ( i > 0 && newExonBlocks[i].leftType == 2 && newExonBlocks[i - 1].end + 1 != newExonBlocks[i].start ) )
1127 {
1128 newExonBlocks[i].leftType = 0 ;
1129 }
1130
1131 if ( ( i == blockCnt - 1 && newExonBlocks[i].rightType == 1 ) ||
1132 ( i < blockCnt - 1 && newExonBlocks[i].rightType == 1 &&
1133 newExonBlocks[i].end + 1 != newExonBlocks[i + 1].start ) )
1134 newExonBlocks[i].rightType = 0 ;
1135 }
1136 exonBlocks = newExonBlocks ;
1137 }
1138
1139 // If two blocks whose soft boundary are close to each other, we can merge them.
1140 void MergeNearBlocks()
1141 {
1142 std::vector<struct _block> rawExonBlocks = exonBlocks ;
1143 int i, k ;
1144 int bsize = rawExonBlocks.size() ;
1145 int *newIdx = new int[bsize] ; // used for adjust prev and next. Note that by merging, it won't change the number of prevCnt or nextCnt.
1146
1147 exonBlocks.clear() ;
1148 exonBlocks.push_back( rawExonBlocks[0] ) ;
1149 k = 0 ;
1150 newIdx[0] = 0 ;
1151 for ( i = 1 ; i < bsize ; ++i )
1152 {
1153 if ( rawExonBlocks[i].chrId == exonBlocks[k].chrId
1154 && rawExonBlocks[i].leftType == 0 && exonBlocks[k].rightType == 0
1155 && rawExonBlocks[i].start - exonBlocks[k].end - 1 <= 50
1156 && rawExonBlocks[i].depthSum / double( rawExonBlocks[i].end - rawExonBlocks[i].start + 1 ) < 3.0
1157 && exonBlocks[k].depthSum / double( exonBlocks[k].end - exonBlocks[i].start + 1 ) < 3.0 )
1158 {
1159 exonBlocks[k].end = rawExonBlocks[i].end ;
1160 exonBlocks[k].rightType = rawExonBlocks[i].rightType ;
1161 exonBlocks[k].rightStrand = rawExonBlocks[i].rightStrand ;
1162 exonBlocks[k].depthSum += rawExonBlocks[i].depthSum ;
1163 if ( rawExonBlocks[i].rightSplice != -1 )
1164 exonBlocks[k].rightSplice = rawExonBlocks[i].rightSplice ;
1165
1166 if ( exonBlocks[k].nextCnt > 0 )
1167 delete[] exonBlocks[k].next ;
1168 exonBlocks[k].nextCnt = rawExonBlocks[i].nextCnt ;
1169 exonBlocks[k].next = rawExonBlocks[i].next ;
1170 }
1171 else
1172 {
1173 exonBlocks.push_back( rawExonBlocks[i] ) ;
1174 ++k ;
1175 }
1176 newIdx[i] = k ;
1177 }
1178 BuildBlockChrIdOffset() ;
1179 bsize = exonBlocks.size() ;
1180 for ( i = 0 ; i < bsize ; ++i )
1181 {
1182 int cnt = exonBlocks[i].prevCnt ;
1183 for ( k = 0 ; k < cnt ; ++k )
1184 exonBlocks[i].prev[k] = newIdx[ exonBlocks[i].prev[k] ] ;
1185
1186 cnt = exonBlocks[i].nextCnt ;
1187 for ( k = 0 ; k < cnt ; ++k )
1188 exonBlocks[i].next[k] = newIdx[ exonBlocks[i].next[k] ] ;
1189 }
1190 delete[] newIdx ;
1191 }
1192
1193 void AddIntronInformation( std::vector<struct _splitSite> &sites, Alignments &alignments )
1194 {
1195 // Add the connection information, support information and the strand information.
1196 int i, j, k, tag ;
1197 tag = 0 ;
1198 int scnt = sites.size() ;
1199 int exonBlockCnt = exonBlocks.size() ;
1200 bool flag ;
1201
1202 for ( i = 0 ; i < exonBlockCnt ; ++i )
1203 {
1204 exonBlocks[i].prevCnt = exonBlocks[i].nextCnt = 0 ;
1205 exonBlocks[i].prev = exonBlocks[i].next = NULL ;
1206 exonBlocks[i].leftStrand = exonBlocks[i].rightStrand = '.' ;
1207 exonBlocks[i].leftSplice = exonBlocks[i].rightSplice = 0 ;
1208 }
1209
1210 // Now, we add the region id and compute the length of each region, here the region does not contain possible IR event.
1211 int regionId = 0 ;
1212 for ( i = 0 ; i < exonBlockCnt ; )
1213 {
1214 if ( exonBlocks[i].leftType == 2 && exonBlocks[i].rightType == 1 )
1215 {
1216 j = i + 1 ;
1217 }
1218 else
1219 for ( j = i + 1 ; j < exonBlockCnt ; ++j )
1220 if ( exonBlocks[j].start > exonBlocks[j - 1].end + 1 || ( exonBlocks[j].leftType == 2 && exonBlocks[j].rightType == 1 ) )
1221 break ;
1222 for ( k = i ; k < j ; ++k )
1223 exonBlocks[k].contigId = regionId ;
1224 ++regionId ;
1225 i = j ;
1226 }
1227 int *regionLength = new int[regionId] ;
1228 memset( regionLength, 0, sizeof( int ) * regionId ) ;
1229 for ( i = 0 ; i < exonBlockCnt ; ++i )
1230 {
1231 regionLength[ exonBlocks[i].contigId ] += exonBlocks[i].end - exonBlocks[i].start + 1 ;
1232 }
1233
1234 for ( i = 0 ; i < scnt ; )
1235 {
1236 for ( j = i ; j < scnt ; ++j )
1237 {
1238 if ( sites[j].chrId != sites[i].chrId ||
1239 sites[j].pos != sites[i].pos ||
1240 sites[j].type != sites[i].type )
1241 break ;
1242 }
1243 // [i,j-1] are the indices of the sites with same coordinate
1244 // It is possible a sites corresponds to 2 strands, then we should only keep one of them.
1245 char strand = '.' ;
1246 int strandSupport[2] = {0, 0};
1247 for ( k = i ; k < j ; ++k )
1248 {
1249 if ( sites[k].strand == '-' )
1250 strandSupport[0] += sites[k].support ;
1251 else if ( sites[k].strand == '+' )
1252 strandSupport[1] += sites[k].support ;
1253 }
1254 if ( strandSupport[0] > strandSupport[1] )
1255 strand = '-' ;
1256 else if ( strandSupport[1] > strandSupport[0] )
1257 strand = '+' ;
1258
1259
1260 int cnt = j - i ;
1261 // Locate the first subexon that can overlap with this site
1262 while ( tag < exonBlockCnt )
1263 {
1264 if ( ( exonBlocks[tag].chrId == sites[i].chrId && exonBlocks[tag].end >= sites[i].pos )
1265 || exonBlocks[tag].chrId > sites[i].chrId )
1266 break ;
1267 ++tag ;
1268 }
1269 flag = false ;
1270 for ( k = tag; k < exonBlockCnt ; ++k )
1271 {
1272 if ( exonBlocks[tag].start > sites[i].pos ||
1273 exonBlocks[tag].chrId > sites[i].chrId )
1274 break ;
1275
1276 if ( exonBlocks[tag].start == sites[i].pos ||
1277 exonBlocks[tag].end == sites[i].pos )
1278 {
1279 flag = true ;
1280 break ;
1281 }
1282 }
1283
1284 if ( !flag )
1285 {
1286 i = j ;
1287 continue ;
1288 }
1289
1290 if ( sites[i].type == 1 && exonBlocks[tag].start == sites[i].pos )
1291 {
1292 exonBlocks[tag].prevCnt = 0 ;
1293 exonBlocks[tag].prev = new int[cnt] ;
1294 exonBlocks[tag].leftStrand = strand ;
1295
1296 // And we also need to put the "next" here.
1297 // Here we assume the oppositePos sorted in increasing order
1298 k = tag - 1 ;
1299 for ( int l = j - 1 ; l >= i ; --l )
1300 {
1301 for ( ; k >= 0 ; --k )
1302 {
1303 if ( exonBlocks[k].end < sites[l].oppositePos || exonBlocks[k].chrId != sites[l].chrId )
1304 break ;
1305 if ( exonBlocks[k].end == sites[l].oppositePos ) //&& exonBlocks[k].rightType == sites[l].type )
1306 {
1307 if ( sites[l].strand != strand ||
1308 sites[l].strand != exonBlocks[k].rightStrand )
1309 break ;
1310 exonBlocks[k].next[ exonBlocks[k].nextCnt ] = tag ;
1311 exonBlocks[tag].prev[ exonBlocks[tag].prevCnt] = k ; // Note that the prev are sorted in decreasing order
1312 ++exonBlocks[k].nextCnt ;
1313 ++exonBlocks[tag].prevCnt ;
1314 double factor = 1.0 ;
1315 if ( regionLength[ exonBlocks[k].contigId ] < alignments.readLen )
1316 {
1317 int left ;
1318 for ( left = k ; left >= 0 ; --left )
1319 if ( exonBlocks[left].contigId != exonBlocks[k].contigId )
1320 break ;
1321 ++left ;
1322
1323 int prevType = 0 ;
1324 // only consider the portion upto k.
1325 for ( int m = left ; m <= k ; ++m )
1326 if ( exonBlocks[m].leftType == 1 )
1327 ++prevType ;
1328
1329 if ( prevType == 0 )
1330 factor = 2 * (double)alignments.readLen / regionLength[ exonBlocks[k].contigId ] ;
1331 }
1332 if ( regionLength[ exonBlocks[tag].contigId ] < alignments.readLen )
1333 {
1334 int right ;
1335 for ( right = tag ; right < exonBlockCnt ; ++right )
1336 if ( exonBlocks[right].contigId != exonBlocks[tag].contigId )
1337 break ;
1338 --right ;
1339
1340 int nextType = 0 ;
1341 for ( int m = tag ; m <= right ; ++m )
1342 if ( exonBlocks[m].rightType == 2 )
1343 ++nextType ;
1344
1345 if ( nextType == 0 )
1346 factor = 2 * (double)alignments.readLen / regionLength[ exonBlocks[tag].contigId ] ;
1347 }
1348 if ( factor > 10 )
1349 factor = 10 ;
1350
1351 if ( exonBlocks[k].contigId != exonBlocks[tag].contigId ) // the support of introns between regions.
1352 {
1353 // Adjust the support if one of the anchor exon is too short.
1354 // This avoid the creation of alternative 3'/5' end due to low coverage from too short anchor.
1355 exonBlocks[tag].leftSplice += sites[l].support * factor ;
1356 exonBlocks[k].rightSplice += sites[l].support * factor ;
1357 }
1358 break ;
1359 }
1360 }
1361 }
1362 }
1363 else if ( sites[i].type == 2 && exonBlocks[tag].end == sites[i].pos )
1364 {
1365 exonBlocks[tag].nextCnt = 0 ; // cnt ; it should reach cnt after putting the ids in
1366 exonBlocks[tag].next = new int[cnt] ;
1367 exonBlocks[tag].rightStrand = strand ;
1368 }
1369
1370 i = j ;
1371 }
1372 // Reverse the prev list
1373 for ( i = 0 ; i < exonBlockCnt ; ++i )
1374 {
1375 for ( j = 0, k = exonBlocks[i].prevCnt - 1 ; j < k ; ++j, --k )
1376 {
1377 int tmp = exonBlocks[i].prev[j] ;
1378 exonBlocks[i].prev[j] = exonBlocks[i].prev[k] ;
1379 exonBlocks[i].prev[k] = tmp ;
1380 }
1381 }
1382
1383 // If all of the subexon anchored the intron are filtered, we need to change the type of the other anchor.
1384 for ( i = 0 ; i < exonBlockCnt ; ++i )
1385 {
1386 if ( exonBlocks[i].leftType == 1 && exonBlocks[i].prevCnt == 0 )
1387 {
1388 exonBlocks[i].leftType = 0 ;
1389 exonBlocks[i].leftStrand = '.' ;
1390 if ( i > 0 && exonBlocks[i - 1].rightType == 1 )
1391 exonBlocks[i - 1].rightType = 0 ;
1392 }
1393 if ( exonBlocks[i].rightType == 2 && exonBlocks[i].nextCnt == 0 )
1394 {
1395 exonBlocks[i].rightType = 0 ;
1396 exonBlocks[i].rightStrand = '.' ;
1397 if ( i < exonBlockCnt - 1 && exonBlocks[i + 1].leftType == 2 )
1398 exonBlocks[i + 1].leftType = 0 ;
1399 }
1400 }
1401 MergeNearBlocks() ;
1402
1403 delete[] regionLength ;
1404 }
1405
1406 double PickLeftAndRightRatio( double l, double r )
1407 {
1408 if ( l >= 0 && r >= 0 )
1409 return l < r ? l : r ;
1410 else if ( l < 0 && r < 0 )
1411 return -1 ;
1412 else if ( l < 0 )
1413 return r ;
1414 else
1415 return l ;
1416 }
1417
1418 double PickLeftAndRightRatio( const struct _block &b )
1419 {
1420 return PickLeftAndRightRatio( b.leftRatio, b.rightRatio ) ;
1421 }
1422
1423 void ComputeRatios()
1424 {
1425 int i, j ;
1426 int exonBlockCnt = exonBlocks.size() ;
1427 for ( i = 0 ; i < exonBlockCnt ; ++i )
1428 {
1429 exonBlocks[i].leftRatio = -1 ;
1430 exonBlocks[i].rightRatio = -1 ;
1431 if ( exonBlocks[i].leftType == 2 && exonBlocks[i].rightType == 1 )
1432 {
1433 // ]...[
1434 double anchorAvg = 0 ;
1435 double avgDepth = GetAvgDepth( exonBlocks[i] ) ;
1436 if ( i >= 1 && exonBlocks[i - 1].chrId == exonBlocks[i].chrId )
1437 {
1438 anchorAvg = GetAvgDepth( exonBlocks[i - 1] ) ;
1439 if ( anchorAvg > 1 && avgDepth > 1 )
1440 exonBlocks[i].leftRatio = ( avgDepth - 1 ) / ( anchorAvg - 1 ) ;
1441 }
1442 if ( i < exonBlockCnt - 1 && exonBlocks[i + 1].chrId == exonBlocks[i].chrId )
1443 {
1444 anchorAvg = GetAvgDepth( exonBlocks[i + 1] ) ;
1445 if ( anchorAvg > 1 && avgDepth > 1 )
1446 exonBlocks[i].rightRatio = ( avgDepth - 1 ) / ( anchorAvg - 1 ) ;
1447 }
1448 }
1449
1450 if ( exonBlocks[i].leftType == 0 && exonBlocks[i].rightType == 1 )
1451 {
1452 // (..[ , overhang subexon.
1453 double avgDepth = GetAvgDepth( exonBlocks[i] ) ;
1454 double anchorAvg = GetAvgDepth( exonBlocks[i + 1] ) ;
1455 if ( avgDepth > 1 && anchorAvg > 1 )
1456 exonBlocks[i].rightRatio = ( avgDepth - 1 ) / ( anchorAvg - 1 ) ;
1457 }
1458
1459 /*if ( exonBlocks[i].leftType == 1 )
1460 {
1461 // For the case (...[, the leftRatio is actuall the leftratio of the subexon on its right.
1462 int len = 0 ;
1463 double depthSum = 0 ;
1464 int tag = i ;
1465
1466 for ( j = 0 ; j < exonBlocks[tag].prevCnt ; ++j )
1467 {
1468 int k = exonBlocks[tag].prev[j] ;
1469 len += ( exonBlocks[k].end - exonBlocks[k].start + 1 ) ;
1470 depthSum += exonBlocks[k].depthSum ;
1471 }
1472 double otherAvgDepth = depthSum / len ;
1473 double avgDepth = GetAvgDepth( exonBlocks[tag] ) ;
1474
1475 // adjust avgDepth by looking into the following exon blocks(subexon) in the same region whose left side is not hard end
1476 for ( j = tag + 1 ; j < exonBlockCnt ; ++j )
1477 {
1478 if ( exonBlocks[j].start != exonBlocks[j - 1].end + 1 || exonBlocks[j].leftType == 1 )
1479 break ;
1480 double tmp = GetAvgDepth( exonBlocks[j] ) ;
1481 if ( tmp > avgDepth )
1482 avgDepth = tmp ;
1483 }
1484
1485 if ( avgDepth < 1 )
1486 avgDepth = 1 ;
1487 if ( otherAvgDepth < 1 )
1488 otherAvgDepth = 1 ;
1489
1490 //exonBlocks[i].leftRatio = pow( log( avgDepth ) / log(2.0), 2.0 / 3.0 ) - pow( log( otherAvgDepth ) / log( 2.0 ), 2.0 / 3.0 );
1491 exonBlocks[i].leftRatio = sqrt( avgDepth ) - log( avgDepth ) - ( sqrt( otherAvgDepth ) + log( otherAvgDepth ) ) ;
1492
1493 }*/
1494 if ( exonBlocks[i].rightType == 0 && exonBlocks[i].leftType == 2 )
1495 {
1496 // for overhang subexon.
1497 double avgDepth = GetAvgDepth( exonBlocks[i] ) ;
1498 double anchorAvg = GetAvgDepth( exonBlocks[i - 1] ) ;
1499 if ( avgDepth > 1 && anchorAvg > 1 )
1500 exonBlocks[i].leftRatio = ( avgDepth - 1 ) / ( anchorAvg - 1 ) ;
1501
1502 }
1503 /*if ( exonBlocks[i].rightType == 2 )
1504 {
1505 int len = 0 ;
1506 double depthSum = 0 ;
1507 int tag = i ;
1508
1509 for ( j = 0 ; j < exonBlocks[tag].nextCnt ; ++j )
1510 {
1511 int k = exonBlocks[tag].next[j] ;
1512 len += ( exonBlocks[k].end - exonBlocks[k].start + 1 ) ;
1513 depthSum += exonBlocks[k].depthSum ;
1514 }
1515 double otherAvgDepth = depthSum / len ;
1516 double avgDepth = GetAvgDepth( exonBlocks[tag] ) ;
1517
1518 // adjust avgDepth by looking into the earlier exon blocks(subexon) in the same region whose right side is not hard end
1519 for ( j = tag - 1 ; j >= 0 ; --j )
1520 {
1521 if ( exonBlocks[j].end + 1 != exonBlocks[j + 1].start || exonBlocks[j].rightType == 2 )
1522 break ;
1523 double tmp = GetAvgDepth( exonBlocks[j] ) ;
1524 if ( tmp > avgDepth )
1525 avgDepth = tmp ;
1526 }
1527
1528 if ( avgDepth < 1 )
1529 avgDepth = 1 ;
1530 if ( otherAvgDepth < 1 )
1531 otherAvgDepth = 1 ;
1532
1533 exonBlocks[i].rightRatio = sqrt( avgDepth ) - log( avgDepth ) - ( sqrt( otherAvgDepth ) + log( otherAvgDepth ) ) ;
1534 //pow( log( avgDepth ) / log(2.0), 2.0 / 3.0 ) - pow( log( otherAvgDepth ) / log(2.0), 2.0 / 3.0 );
1535 }*/
1536 // The remaining case the islands, (...)
1537 }
1538
1539 // go through each region of subexons, and only compute the ratio for the first and last hard boundary
1540 for ( i = 0 ; i < exonBlockCnt ; )
1541 {
1542 // the blocks from [i,j) corresponds to a region, namely consecutive subexons.
1543 for ( j = i + 1 ; j < exonBlockCnt ; ++j )
1544 if ( exonBlocks[j].contigId != exonBlocks[i].contigId )
1545 break ;
1546
1547 int leftSupport = 0 ;
1548 int rightSupport = 0 ;
1549 int leftTag = j, rightTag = i - 1 ;
1550 for ( int k = i ; k < j ; ++k )
1551 {
1552 if ( exonBlocks[k].leftType == 1 && exonBlocks[k].leftSplice > 0 )
1553 {
1554 leftSupport += exonBlocks[k].leftSplice ;
1555 if ( k < leftTag )
1556 leftTag = k ;
1557 }
1558 if ( exonBlocks[k].rightType == 2 && exonBlocks[k].rightSplice > 0 )
1559 {
1560 rightSupport += exonBlocks[k].rightSplice ;
1561 if ( k > rightTag )
1562 rightTag = k ;
1563 }
1564 }
1565
1566 if ( leftSupport > 0 && rightSupport > 0 )
1567 {
1568 // when the right splice support is much greater than that of the left side, there should be a soft boundary for the left side.
1569 // Wether we want to include this soft boundary or not will be determined in class, when it looked at whether the overhang block
1570 // should be kept or not.
1571 exonBlocks[ leftTag ].leftRatio = sqrt( rightSupport ) - log( ( rightSupport + leftSupport ) * 0.5 ) - ( sqrt( leftSupport ) ) ; //+ log( leftSupport ) ) ;
1572 exonBlocks[ rightTag ].rightRatio = sqrt( leftSupport ) - log( ( rightSupport + leftSupport ) * 0.5 ) - ( sqrt( rightSupport ) ) ; //+ log( rightSupport ) ) ;
1573 }
1574
1575 i = j ; // don't forget this.
1576 }
1577 }
1578 } ;
1579
1580 #endif