Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/blocks.hpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 // The class manage the blocks | |
2 // Li Song | |
3 | |
4 #ifndef _LSONG_CLASSES_BLOCKS_HEADER | |
5 #define _LSONG_CLASSES_BLOCKS_HEADER | |
6 | |
7 #include <stdlib.h> | |
8 #include <vector> | |
9 #include <map> | |
10 #include <assert.h> | |
11 #include <math.h> | |
12 #include <set> | |
13 #include <inttypes.h> | |
14 | |
15 #include "defs.h" | |
16 | |
17 extern bool VERBOSE ; | |
18 extern FILE *fpOut ; | |
19 | |
20 extern int gMinDepth ; | |
21 | |
22 | |
23 struct _splitSite // means the next position belongs to another block | |
24 { | |
25 int64_t pos ; | |
26 char strand ; | |
27 int chrId ; | |
28 int type ; // 1-start of an exon. 2-end of an exon. | |
29 | |
30 int support ; | |
31 int uniqSupport ; | |
32 | |
33 int mismatchSum ; | |
34 | |
35 int64_t oppositePos ; // the position of the other sites to form the intron. | |
36 } ; | |
37 | |
38 struct _block | |
39 { | |
40 int chrId ; | |
41 int contigId ; | |
42 int64_t start, end ; | |
43 int64_t leftSplice, rightSplice ; // Some information about the left splice site and right splice site. | |
44 // record the leftmost and rightmost coordinates of a splice site within this block | |
45 //or the length of read alignment support the splice sites. | |
46 //Or the number of support. | |
47 int64_t depthSum ; | |
48 | |
49 int leftType, rightType ; //0-soft boundary, 1-start of an exon, 2-end of an exon. | |
50 | |
51 double leftRatio, rightRatio ; // the adjusted ratio-like value to the left and right anchor subexons. | |
52 double ratio ; | |
53 | |
54 char leftStrand, rightStrand ; | |
55 | |
56 int *depth ; | |
57 int prevCnt, nextCnt ; // Some connection information for the subexons. | |
58 int *prev ; | |
59 int *next ; | |
60 } ; | |
61 | |
62 // adjacent graph | |
63 struct _adj | |
64 { | |
65 int ind ; | |
66 int info ; | |
67 int support ; | |
68 int next ; | |
69 } ; | |
70 | |
71 class Blocks | |
72 { | |
73 private: | |
74 std::map<int, int> exonBlocksChrIdOffset ; | |
75 | |
76 int64_t Overlap( int64_t s0, int64_t e0, int64_t s1, int64_t e1, int64_t &s, int64_t &e ) | |
77 { | |
78 s = e = -1 ; | |
79 if ( e0 < s1 || s0 > e1 ) | |
80 return 0 ; | |
81 s = s0 > s1 ? s0 : s1 ; | |
82 e = e0 < e1 ? e0 : e1 ; | |
83 return e - s + 1 ; | |
84 } | |
85 | |
86 void Split( const char *s, char delimit, std::vector<std::string> &fields ) | |
87 { | |
88 int i ; | |
89 fields.clear() ; | |
90 if ( s == NULL ) | |
91 return ; | |
92 | |
93 std::string f ; | |
94 for ( i = 0 ; s[i] ; ++i ) | |
95 { | |
96 if ( s[i] == delimit || s[i] == '\n' ) | |
97 { | |
98 fields.push_back( f ) ; | |
99 f.clear() ; | |
100 } | |
101 else | |
102 f.append( 1, s[i] ) ; | |
103 } | |
104 fields.push_back( f ) ; | |
105 f.clear() ; | |
106 } | |
107 | |
108 void BuildBlockChrIdOffset() | |
109 { | |
110 // Build the map for the offsets of chr id in the exonBlock list. | |
111 exonBlocksChrIdOffset[ exonBlocks[0].chrId] = 0 ; | |
112 int cnt = exonBlocks.size() ; | |
113 for ( int i = 1 ; i < cnt ; ++i ) | |
114 { | |
115 if ( exonBlocks[i].chrId != exonBlocks[i - 1].chrId ) | |
116 exonBlocksChrIdOffset[ exonBlocks[i].chrId ] = i ; | |
117 } | |
118 } | |
119 | |
120 bool CanReach( int from, int to, struct _adj *adj, bool *visited ) | |
121 { | |
122 if ( visited[from] ) | |
123 return false ; | |
124 visited[from] = true ; | |
125 int p ; | |
126 p = adj[from].next ; | |
127 while ( p != -1 ) | |
128 { | |
129 if ( adj[p].ind == to ) | |
130 return true ; | |
131 if ( adj[p].ind < to | |
132 && CanReach( adj[p].ind, to, adj, visited ) ) | |
133 return true ; | |
134 p = adj[p].next ; | |
135 } | |
136 return false ; | |
137 } | |
138 | |
139 void AdjustAndCreateExonBlocks( int tag, std::vector<struct _block> &newExonBlocks ) | |
140 { | |
141 int i, j ; | |
142 if ( exonBlocks[tag].depth != NULL ) | |
143 { | |
144 // Convert the increment and decrement into actual depth. | |
145 int len = exonBlocks[tag].end - exonBlocks[tag].start + 1 ; | |
146 int *depth = exonBlocks[tag].depth ; | |
147 for ( i = 1 ; i < len ; ++i ) | |
148 depth[i] = depth[i - 1] + depth[i] ; | |
149 | |
150 struct _block island ; // the portion created by the hollow. | |
151 island.start = island.end = -1 ; | |
152 island.depthSum = 0 ; | |
153 | |
154 /*if ( exonBlocks[tag].start == 1562344 ) | |
155 { | |
156 for ( i = 0 ; i < len ; ++i ) | |
157 printf( "%d\n", depth[i] ) ; | |
158 }*/ | |
159 // Adjust boundary accordingly. | |
160 int64_t adjustStart = exonBlocks[tag].start ; | |
161 int64_t adjustEnd = exonBlocks[tag].end ; | |
162 | |
163 if ( exonBlocks[tag].leftType == 0 && exonBlocks[tag].rightType != 0 ) | |
164 { | |
165 for ( i = len - 1 ; i >= 0 ; --i ) | |
166 if ( depth[i] < gMinDepth ) | |
167 break ; | |
168 ++i ; | |
169 if ( exonBlocks[tag].rightType == 2 && i + exonBlocks[tag].start > exonBlocks[tag].rightSplice ) | |
170 i = exonBlocks[tag].rightSplice - exonBlocks[tag].start ; | |
171 adjustStart = i + exonBlocks[tag].start ; | |
172 | |
173 if ( adjustStart > exonBlocks[tag].start ) | |
174 { | |
175 int s, e ; | |
176 // firstly [s,e] is the range of depth array. | |
177 for ( s = 0 ; s < i ; ++s ) | |
178 if ( depth[s] >= gMinDepth ) | |
179 break ; | |
180 for ( e = i - 1 ; e >= s ; -- e ) | |
181 if ( depth[e] >= gMinDepth ) | |
182 break ; | |
183 if ( e >= s ) | |
184 { | |
185 island = exonBlocks[tag] ; | |
186 island.depthSum = 0 ; | |
187 island.leftType = island.rightType = 0 ; | |
188 for ( j = s ; j <= e ; ++j ) | |
189 island.depthSum += depth[j] ; | |
190 island.start = s + exonBlocks[tag].start ; // offset the coordinate. | |
191 island.end = e + exonBlocks[tag].start ; | |
192 } | |
193 } | |
194 } | |
195 else if ( exonBlocks[tag].leftType != 0 && exonBlocks[tag].rightType == 0 ) | |
196 { | |
197 for ( i = 0 ; i < len ; ++i ) | |
198 if ( depth[i] < gMinDepth ) | |
199 break ; | |
200 --i ; | |
201 if ( exonBlocks[tag].leftType == 1 && i + exonBlocks[tag].start < exonBlocks[tag].leftSplice ) | |
202 i = exonBlocks[tag].leftSplice - exonBlocks[tag].start ; | |
203 adjustEnd = i + exonBlocks[tag].start ; | |
204 | |
205 if ( adjustEnd < exonBlocks[tag].end ) | |
206 { | |
207 int s, e ; | |
208 // firstly [s,e] is the range of depth array. | |
209 for ( s = i + 1 ; s < len ; ++s ) | |
210 if ( depth[s] >= gMinDepth ) | |
211 break ; | |
212 for ( e = len - 1 ; e >= s ; --e ) | |
213 if ( depth[e] >= gMinDepth ) | |
214 break ; | |
215 if ( e >= s ) | |
216 { | |
217 island = exonBlocks[tag] ; | |
218 island.depthSum = 0 ; | |
219 island.leftType = island.rightType = 0 ; | |
220 for ( j = s ; j <= e ; ++j ) | |
221 island.depthSum += depth[j] ; | |
222 island.start = s + exonBlocks[tag].start ; // offset the coordinate. | |
223 island.end = e + exonBlocks[tag].start ; | |
224 } | |
225 } | |
226 } | |
227 else if ( exonBlocks[tag].leftType == 0 && exonBlocks[tag].rightType == 0 ) | |
228 { | |
229 for ( i = 0 ; i < len ; ++i ) | |
230 if ( depth[i] >= gMinDepth ) | |
231 break ; | |
232 adjustStart = i + exonBlocks[tag].start ; | |
233 | |
234 for ( i = len - 1 ; i >= 0 ; --i ) | |
235 if ( depth[i] >= gMinDepth ) | |
236 break ; | |
237 adjustEnd = i + exonBlocks[tag].start ; | |
238 } | |
239 else if ( exonBlocks[tag].leftType == 1 && exonBlocks[tag].rightType == 2 | |
240 && exonBlocks[tag].end - exonBlocks[tag].start + 1 >= 1000 ) | |
241 { | |
242 // The possible merge of two genes or merge of UTRs, if we don't break the low coverage part. | |
243 // If we decide to cut, I'll reuse the variable "island" to represent the subexon on right hand side. | |
244 int l ; | |
245 int gapSize = 30 ; | |
246 for ( i = 0 ; i <= len - ( gapSize + 1 ) ; ++i ) | |
247 { | |
248 if ( depth[i] < gMinDepth ) | |
249 { | |
250 for ( l = i ; l < i + ( gapSize + 1 ) ; ++l ) | |
251 if ( depth[l] >= gMinDepth ) | |
252 break ; | |
253 if ( l < i + ( gapSize + 1 ) ) | |
254 { | |
255 i = l - 1 ; | |
256 continue ; | |
257 } | |
258 else | |
259 break ; | |
260 } | |
261 } | |
262 | |
263 for ( j = len - 1 ; j >= ( gapSize + 1 ) - 1 ; --j ) | |
264 { | |
265 if ( depth[j] < gMinDepth ) | |
266 { | |
267 for ( l = j ; l >= j - ( gapSize + 1 ) + 1 ; --l ) | |
268 if ( depth[l] >= gMinDepth ) | |
269 break ; | |
270 | |
271 if ( l >= j - ( gapSize + 1 ) + 1 ) | |
272 { | |
273 j = l + 1 ; | |
274 continue ; | |
275 } | |
276 else | |
277 break ; | |
278 } | |
279 } | |
280 | |
281 if ( j - i + 1 > gapSize ) | |
282 { | |
283 // we break. | |
284 --i ; ++j ; | |
285 | |
286 adjustEnd = i + exonBlocks[tag].start ; | |
287 | |
288 if ( j < len - 1 ) | |
289 { | |
290 island = exonBlocks[tag] ; | |
291 island.depthSum = 0 ; | |
292 island.leftType = 0 ; | |
293 for ( l = j ; l <= len - 1 ; ++l ) | |
294 island.depthSum += depth[j] ; | |
295 island.start = j + exonBlocks[tag].start ; // offset the coordinate. | |
296 island.end = exonBlocks[tag].end ; | |
297 } | |
298 | |
299 exonBlocks[tag].rightType = 0 ; // put it here, so the island can get the right info | |
300 //fprintf( stderr, "break %d %d %d %d.\n", (int)exonBlocks[tag].start, (int)adjustEnd, i + (int)exonBlocks[tag].start, j + (int)exonBlocks[tag].start ) ; | |
301 } | |
302 } | |
303 | |
304 int lostDepthSum = 0 ; | |
305 for ( i = exonBlocks[tag].start ; i < adjustStart ; ++i ) | |
306 lostDepthSum += depth[i - exonBlocks[tag].start ] ; | |
307 for ( i = adjustEnd + 1 ; i < exonBlocks[tag].end ; ++i ) | |
308 lostDepthSum += depth[i - exonBlocks[tag].start ] ; | |
309 exonBlocks[tag].depthSum -= lostDepthSum ; | |
310 | |
311 delete[] exonBlocks[tag].depth ; | |
312 exonBlocks[tag].depth = NULL ; | |
313 | |
314 //if ( exonBlocks[tag].start == 1562344 ) | |
315 // printf( "%d %d\n", adjustStart, adjustEnd ) ; | |
316 if ( ( len > 1 && adjustEnd - adjustStart + 1 <= 1 ) || ( adjustEnd - adjustStart + 1 <= 0 ) ) | |
317 return ; | |
318 exonBlocks[tag].start = adjustStart ; | |
319 exonBlocks[tag].end = adjustEnd ; | |
320 | |
321 if ( island.start != -1 && island.end < exonBlocks[tag].start ) | |
322 newExonBlocks.push_back( island ) ; | |
323 | |
324 newExonBlocks.push_back( exonBlocks[tag] ) ; | |
325 if ( island.start != -1 && island.start > exonBlocks[tag].end ) | |
326 newExonBlocks.push_back( island ) ; | |
327 } | |
328 } | |
329 public: | |
330 std::vector<struct _block> exonBlocks ; | |
331 | |
332 Blocks() { } | |
333 ~Blocks() | |
334 { | |
335 int blockCnt = exonBlocks.size() ; | |
336 for ( int i = 0 ; i < blockCnt ; ++i ) | |
337 { | |
338 if ( exonBlocks[i].next != NULL ) | |
339 { | |
340 delete[] exonBlocks[i].next ; | |
341 } | |
342 if ( exonBlocks[i].prev != NULL ) | |
343 { | |
344 delete[] exonBlocks[i].prev ; | |
345 } | |
346 } | |
347 } | |
348 | |
349 double GetAvgDepth( const struct _block &block ) | |
350 { | |
351 return block.depthSum / (double)( block.end - block.start + 1 ) ; | |
352 } | |
353 | |
354 int BuildExonBlocks( Alignments &alignments ) | |
355 { | |
356 int tag = 0 ; | |
357 while ( alignments.Next() ) | |
358 { | |
359 int i, j, k ; | |
360 int segCnt = alignments.segCnt ; | |
361 struct _pair *segments = alignments.segments ; | |
362 int eid = 0 ; // the exonblock id that the segment update | |
363 | |
364 // locate the first exonblock that beyond the start of the read. | |
365 while ( tag < (int)exonBlocks.size() && ( exonBlocks[tag].end < segments[0].a - 1 | |
366 || exonBlocks[tag].chrId != alignments.GetChromId() ) ) | |
367 { | |
368 ++tag ; | |
369 } | |
370 // due to the merge of exon blocks, we might need roll back | |
371 --tag ; | |
372 while ( tag >= 0 && ( exonBlocks[tag].end >= segments[0].a - 1 | |
373 && exonBlocks[tag].chrId == alignments.GetChromId() ) ) | |
374 { | |
375 --tag ; | |
376 } | |
377 ++tag ; | |
378 | |
379 for ( i = 0 ; i < segCnt ; ++i ) | |
380 { | |
381 //if ( i == 0 ) | |
382 // printf( "hi %d %s %d %d\n", i, alignments.GetReadId(), segments[i].a, segments[i].b ) ; | |
383 for ( j = tag ; j < (int)exonBlocks.size() ; ++j ) | |
384 { | |
385 if ( exonBlocks[j].end >= segments[i].a - 1 ) | |
386 break ; | |
387 } | |
388 if ( j >= (int)exonBlocks.size() ) | |
389 { | |
390 // Append a new block | |
391 struct _block newSeg ; | |
392 newSeg.chrId = alignments.GetChromId() ; | |
393 newSeg.start = segments[i].a ; | |
394 newSeg.end = segments[i].b ; | |
395 | |
396 newSeg.leftSplice = -1 ; | |
397 newSeg.rightSplice = -1 ; | |
398 if ( i > 0 ) | |
399 newSeg.leftSplice = segments[i].a ; | |
400 if ( i < segCnt - 1 ) | |
401 newSeg.rightSplice = segments[i].b ; | |
402 | |
403 exonBlocks.push_back( newSeg ) ; | |
404 | |
405 eid = exonBlocks.size() - 1 ; | |
406 } | |
407 else if ( exonBlocks[j].end < segments[i].b || | |
408 ( exonBlocks[j].start > segments[i].a && exonBlocks[j].start <= segments[i].b + 1 ) ) | |
409 { | |
410 // If overlaps with a current exon block, so we extend it | |
411 if ( exonBlocks[j].end < segments[i].b ) | |
412 { | |
413 // extends toward right | |
414 exonBlocks[j].end = segments[i].b ; | |
415 if ( i > 0 && ( exonBlocks[j].leftSplice == -1 || segments[i].a < exonBlocks[j].leftSplice ) ) | |
416 exonBlocks[j].leftSplice = segments[i].a ; | |
417 if ( i < segCnt - 1 && segments[i].b > exonBlocks[j].rightSplice ) | |
418 exonBlocks[j].rightSplice = segments[i].b ; | |
419 eid = j ; | |
420 | |
421 // Merge with next few exon blocks | |
422 for ( k = j + 1 ; k < (int)exonBlocks.size() ; ++k ) | |
423 { | |
424 if ( exonBlocks[k].start <= exonBlocks[j].end + 1 ) | |
425 { | |
426 if ( exonBlocks[k].end > exonBlocks[j].end ) | |
427 exonBlocks[j].end = exonBlocks[k].end ; | |
428 | |
429 if ( exonBlocks[k].leftSplice != -1 && ( exonBlocks[j].leftSplice == -1 || exonBlocks[k].leftSplice < exonBlocks[j].leftSplice ) ) | |
430 exonBlocks[j].leftSplice = exonBlocks[k].leftSplice ; | |
431 if ( exonBlocks[k].rightSplice != -1 && exonBlocks[k].rightSplice > exonBlocks[j].rightSplice ) | |
432 exonBlocks[j].rightSplice = exonBlocks[k].rightSplice ; | |
433 | |
434 } | |
435 else | |
436 break ; | |
437 } | |
438 | |
439 if ( k > j + 1 ) | |
440 { | |
441 // Remove the merged blocks | |
442 int a, b ; | |
443 for ( a = j + 1, b = k ; b < (int)exonBlocks.size() ; ++a, ++b ) | |
444 exonBlocks[a] = exonBlocks[b] ; | |
445 for ( a = 0 ; a < k - ( j + 1 ) ; ++a ) | |
446 exonBlocks.pop_back() ; | |
447 } | |
448 } | |
449 else if ( exonBlocks[j].start > segments[i].a && exonBlocks[j].start <= segments[i].b + 1 ) | |
450 { | |
451 // extends toward left | |
452 exonBlocks[j].start = segments[i].a ; | |
453 if ( i > 0 && ( exonBlocks[j].leftSplice == -1 || segments[i].a < exonBlocks[j].leftSplice ) ) | |
454 exonBlocks[j].leftSplice = segments[i].a ; | |
455 if ( i < segCnt - 1 && segments[i].b > exonBlocks[j].rightSplice ) | |
456 exonBlocks[j].rightSplice = segments[i].b ; | |
457 eid = j ; | |
458 | |
459 // Merge with few previous exon blocks | |
460 for ( k = j - 1 ; k >= 0 ; --k ) | |
461 { | |
462 if ( exonBlocks[k].end >= exonBlocks[k + 1].start - 1 ) | |
463 { | |
464 if ( exonBlocks[k + 1].start < exonBlocks[k].start ) | |
465 { | |
466 exonBlocks[k].start = exonBlocks[k + 1].start ; | |
467 } | |
468 | |
469 if ( exonBlocks[k].leftSplice != -1 && ( exonBlocks[j].leftSplice == -1 || exonBlocks[k].leftSplice < exonBlocks[j].leftSplice ) ) | |
470 exonBlocks[j].leftSplice = exonBlocks[k].leftSplice ; | |
471 if ( exonBlocks[k].rightSplice != -1 && exonBlocks[k].rightSplice > exonBlocks[j].rightSplice ) | |
472 exonBlocks[j].rightSplice = exonBlocks[k].rightSplice ; | |
473 | |
474 } | |
475 else | |
476 break ; | |
477 } | |
478 | |
479 if ( k < j - 1 ) | |
480 { | |
481 int a, b ; | |
482 eid = k + 1 ; | |
483 for ( a = k + 2, b = j + 1 ; b < (int)exonBlocks.size() ; ++a, ++b ) | |
484 exonBlocks[a] = exonBlocks[b] ; | |
485 for ( a = 0 ; a < ( j - 1 ) - k ; ++a ) | |
486 exonBlocks.pop_back() ; | |
487 | |
488 } | |
489 } | |
490 } | |
491 else if ( exonBlocks[j].start > segments[i].b + 1 ) | |
492 { | |
493 int size = exonBlocks.size() ; | |
494 int a ; | |
495 // No overlap, insert a new block | |
496 struct _block newSeg ; | |
497 newSeg.chrId = alignments.GetChromId() ; | |
498 newSeg.start = segments[i].a ; | |
499 newSeg.end = segments[i].b ; | |
500 | |
501 newSeg.leftSplice = -1 ; | |
502 newSeg.rightSplice = -1 ; | |
503 if ( i > 0 ) | |
504 newSeg.leftSplice = segments[i].a ; | |
505 if ( i < segCnt - 1 ) | |
506 newSeg.rightSplice = segments[i].b ; | |
507 | |
508 // Insert at position j | |
509 exonBlocks.push_back( newSeg ) ; | |
510 for ( a = size ; a > j ; --a ) | |
511 exonBlocks[a] = exonBlocks[a - 1] ; | |
512 exonBlocks[a] = newSeg ; | |
513 | |
514 eid = a ; | |
515 } | |
516 else | |
517 { | |
518 // The segment is contained in j | |
519 eid = -1 ; | |
520 } | |
521 | |
522 // Merge the block with the mate if the gap is very small | |
523 // Note that since reads are already sorted by coordinate, | |
524 // the previous exon blocks is built completely. | |
525 /*int64_t matePos ; | |
526 int mateChrId ; | |
527 alignments.GetMatePosition( mateChrId, matePos ) ; | |
528 if ( i == 0 && eid > 0 && mateChrId == exonBlocks[ eid ].chrId | |
529 && matePos < segments[0].a | |
530 && matePos >= exonBlocks[eid - 1].start | |
531 && matePos <= exonBlocks[eid - 1].end | |
532 && segments[0].a - matePos + 1 <= 500 | |
533 && exonBlocks[eid-1].chrId == exonBlocks[eid].chrId | |
534 && exonBlocks[eid].start - exonBlocks[eid - 1].end - 1 <= 30 ) | |
535 { | |
536 printf( "%d: (%d-%d) (%d-%d). %d %d\n", ( segments[0].a + alignments.readLen - 1 ) - matePos + 1, exonBlocks[eid - 1].start, exonBlocks[eid - 1].end, | |
537 exonBlocks[eid].start, | |
538 exonBlocks[eid].end, eid, exonBlocks.size() ) ; | |
539 exonBlocks[eid - 1].end = exonBlocks[eid].end ; | |
540 | |
541 if ( exonBlocks[eid].leftSplice != -1 && ( exonBlocks[eid - 1].leftSplice == -1 || exonBlocks[eid].leftSplice < exonBlocks[eid - 1].leftSplice ) ) | |
542 exonBlocks[eid - 1].leftSplice = exonBlocks[k].leftSplice ; | |
543 if ( exonBlocks[eid].rightSplice != -1 && exonBlocks[eid].rightSplice > exonBlocks[eid - 1].rightSplice ) | |
544 exonBlocks[eid - 1].rightSplice = exonBlocks[eid].rightSplice ; | |
545 | |
546 int size = exonBlocks.size() ; | |
547 for ( j = eid ; j < size - 1 ; ++j ) | |
548 exonBlocks[j] = exonBlocks[j + 1] ; | |
549 exonBlocks.pop_back() ; | |
550 }*/ | |
551 } | |
552 } | |
553 /*for ( int i = 0 ; i < (int)exonBlocks.size() ; ++i ) | |
554 { | |
555 printf( "%d %d\n", exonBlocks[i].start, exonBlocks[i].end ) ; | |
556 }*/ | |
557 | |
558 if ( exonBlocks.size() > 0 ) | |
559 { | |
560 BuildBlockChrIdOffset() ; | |
561 int cnt = exonBlocks.size() ; | |
562 for ( int i = 0 ; i < cnt ; ++i ) | |
563 { | |
564 exonBlocks[i].contigId = exonBlocks[i].chrId ; | |
565 | |
566 exonBlocks[i].leftType = 0 ; | |
567 exonBlocks[i].rightType = 0 ; | |
568 exonBlocks[i].depth = NULL ; | |
569 exonBlocks[i].nextCnt = 0 ; | |
570 exonBlocks[i].prevCnt = 0 ; | |
571 exonBlocks[i].leftStrand = '.' ; | |
572 exonBlocks[i].rightStrand = '.' ; | |
573 } | |
574 } | |
575 return exonBlocks.size() ; | |
576 } | |
577 | |
578 void FilterSplitSitesInRegions( std::vector<struct _splitSite> &sites ) | |
579 { | |
580 int i, j, k ; | |
581 int size = sites.size() ; | |
582 int bsize = exonBlocks.size() ; | |
583 int tag = 0 ; | |
584 | |
585 for ( i = 0 ; i < bsize ; ++i ) | |
586 { | |
587 while ( tag < size && ( sites[tag].chrId < exonBlocks[i].chrId || | |
588 ( sites[tag].chrId == exonBlocks[i].chrId && sites[tag].pos < exonBlocks[i].start ) ) ) | |
589 { | |
590 ++tag ; | |
591 } | |
592 if ( tag >= size ) | |
593 break ; | |
594 | |
595 if ( sites[tag].chrId != exonBlocks[i].chrId || sites[tag].pos > exonBlocks[i].end ) | |
596 continue ; | |
597 | |
598 for ( j = tag + 1 ; j < size ; ++j ) | |
599 if ( sites[j].chrId != exonBlocks[i].chrId || sites[j].pos > exonBlocks[i].end ) | |
600 break ; | |
601 | |
602 // [tag,j) holds the split sites in this region. | |
603 int leftStrandSupport[2] = {0, 0} ; | |
604 int rightStrandSupport[2] = {0, 0} ; | |
605 int strandCnt[2] = { 0, 0 } ; | |
606 for ( k = tag ; k < j ; ++k ) | |
607 { | |
608 if ( sites[k].strand == '-' ) | |
609 { | |
610 ++strandCnt[0] ; | |
611 if ( sites[k].oppositePos < exonBlocks[i].start ) | |
612 leftStrandSupport[0] += sites[k].support ; | |
613 else if ( sites[k].oppositePos > exonBlocks[i].end ) | |
614 rightStrandSupport[0] += sites[k].support ; | |
615 } | |
616 else if ( sites[k].strand == '+' ) | |
617 { | |
618 ++strandCnt[1] ; | |
619 if ( sites[k].oppositePos < exonBlocks[i].start ) | |
620 leftStrandSupport[1] += sites[k].support ; | |
621 else if ( sites[k].oppositePos > exonBlocks[i].end ) | |
622 rightStrandSupport[1] += sites[k].support ; | |
623 } | |
624 | |
625 } | |
626 | |
627 | |
628 if ( leftStrandSupport[0] == 0 && rightStrandSupport[0] == 0 | |
629 && leftStrandSupport[1] != 0 && rightStrandSupport[1] != 0 && strandCnt[0] == 2 ) | |
630 { | |
631 // check whether a different strand accidentally show up in this region. | |
632 bool remove = false ; | |
633 for ( k = tag ; k < j ; ++k ) | |
634 { | |
635 if ( sites[k].strand == '-' && | |
636 sites[k].oppositePos >= sites[k].pos && sites[k].oppositePos <= exonBlocks[i].end && | |
637 sites[k].support <= 2 ) | |
638 { | |
639 remove = true ; | |
640 break ; | |
641 } | |
642 } | |
643 | |
644 if ( remove ) | |
645 for ( k = tag ; k < j ; ++k ) | |
646 if ( sites[k].strand == '-' ) | |
647 sites[k].support = -1 ; | |
648 } | |
649 else if ( leftStrandSupport[1] == 0 && rightStrandSupport[1] == 0 | |
650 && leftStrandSupport[0] != 0 && rightStrandSupport[0] != 0 && strandCnt[1] == 2 ) | |
651 { | |
652 // check whether a different strand accidentally show up in this region. | |
653 bool remove = false ; | |
654 for ( k = tag ; k < j ; ++k ) | |
655 { | |
656 if ( sites[k].strand == '+' && | |
657 sites[k].oppositePos >= sites[k].pos && sites[k].oppositePos <= exonBlocks[i].end && | |
658 sites[k].support <= 2 ) | |
659 { | |
660 remove = true ; | |
661 break ; | |
662 } | |
663 } | |
664 | |
665 if ( remove ) | |
666 for ( k = tag ; k < j ; ++k ) | |
667 if ( sites[k].strand == '+' ) | |
668 sites[k].support = -1 ; | |
669 } | |
670 | |
671 tag = j ; | |
672 } | |
673 | |
674 k = 0 ; | |
675 for ( i = 0 ; i < size ; ++i ) | |
676 if ( sites[i].support > 0 ) | |
677 { | |
678 sites[k] = sites[i] ; | |
679 ++k ; | |
680 } | |
681 sites.resize( k ) ; | |
682 } | |
683 | |
684 // Filter the pair of split sites that is likely merge two genes. | |
685 // We filter the case like [..]------------------------[..] | |
686 // [..]--[..] [..]-[...] | |
687 void FilterGeneMergeSplitSites( std::vector<struct _splitSite> &sites ) | |
688 { | |
689 int i, j, k ; | |
690 int bsize = exonBlocks.size() ; | |
691 int ssize = sites.size() ; | |
692 | |
693 struct _pair32 *siteToBlock = new struct _pair32[ ssize ] ; | |
694 struct _adj *adj = new struct _adj[ ssize / 2 + bsize ] ; | |
695 bool *visited = new bool[bsize] ; | |
696 | |
697 int adjCnt = 0 ; | |
698 for ( i = 0 ; i < bsize ; ++i ) | |
699 { | |
700 adj[i].info = 0 ; // in the header, info means the number of next block | |
701 adj[i].support = 0 ; | |
702 adj[i].next = -1 ; | |
703 } | |
704 adjCnt = i ; | |
705 memset( siteToBlock, -1, sizeof( struct _pair32 ) * ssize ) ; | |
706 memset( visited, false, sizeof( bool ) * bsize ) ; | |
707 | |
708 // Build the graph. | |
709 k = 0 ; | |
710 for ( i = 0 ; i < bsize ; ++i ) | |
711 { | |
712 for ( ; k < ssize ; ++k ) | |
713 { | |
714 if ( sites[k].oppositePos < sites[k].pos ) | |
715 continue ; | |
716 if ( sites[k].chrId < exonBlocks[i].chrId | |
717 || ( sites[k].chrId == exonBlocks[i].chrId && sites[k].pos < exonBlocks[i].start ) ) | |
718 continue ; | |
719 break ; | |
720 } | |
721 | |
722 for ( ; k < ssize ; ++k ) | |
723 { | |
724 if ( sites[k].chrId > exonBlocks[i].chrId | |
725 || sites[k].pos > exonBlocks[i].end ) | |
726 break ; | |
727 | |
728 if ( sites[k].oppositePos <= exonBlocks[i].end ) // ignore self-loop | |
729 continue ; | |
730 | |
731 for ( j = i + 1 ; j < bsize ; ++j ) | |
732 if ( sites[k].oppositePos >= exonBlocks[j].start && sites[k].oppositePos <= exonBlocks[j].end ) | |
733 break ; | |
734 if ( sites[k].oppositePos >= exonBlocks[j].start && sites[k].oppositePos <= exonBlocks[j].end ) | |
735 { | |
736 int p ; | |
737 p = adj[i].next ; | |
738 while ( p != -1 ) | |
739 { | |
740 if ( adj[p].ind == j ) | |
741 { | |
742 ++adj[p].info ; | |
743 adj[p].support += sites[k].uniqSupport ; | |
744 break ; | |
745 } | |
746 p = adj[p].next ; | |
747 } | |
748 if ( p == -1 ) | |
749 { | |
750 adj[ adjCnt ].ind = j ; | |
751 adj[ adjCnt ].info = 1 ; | |
752 adj[ adjCnt ].support = sites[k].uniqSupport ; | |
753 adj[ adjCnt ].next = adj[i].next ; | |
754 adj[i].next = adjCnt ; | |
755 ++adj[i].info ; | |
756 ++adjCnt ; | |
757 } | |
758 | |
759 siteToBlock[k].a = i ; | |
760 siteToBlock[k].b = j ; | |
761 } | |
762 } | |
763 } | |
764 for ( k = 0 ; k < ssize ; ++k ) | |
765 { | |
766 if ( sites[k].oppositePos - sites[k].pos + 1 < 20000 || sites[k].uniqSupport >= 30 ) | |
767 continue ; | |
768 | |
769 int from = siteToBlock[k].a ; | |
770 int to = siteToBlock[k].b ; | |
771 if ( to - from - 1 < 2 || adj[from].info <= 1 ) | |
772 continue ; | |
773 | |
774 memset( &visited[from], false, sizeof( bool ) * ( to - from + 1 ) ) ; | |
775 | |
776 int cnt = 0 ; | |
777 int p = adj[from].next ; | |
778 int max = -1 ; | |
779 int maxP = 0 ; | |
780 while ( p != -1 ) | |
781 { | |
782 if ( adj[p].support >= 10 && adj[p].ind <= to ) | |
783 ++cnt ; | |
784 if ( adj[p].support > max || ( adj[p].support == max && adj[p].ind == to ) ) | |
785 { | |
786 max = adj[p].support ; | |
787 maxP = p ; | |
788 } | |
789 | |
790 if ( adj[p].ind == to && adj[p].info > 1 ) | |
791 { | |
792 cnt = 0 ; | |
793 break ; | |
794 } | |
795 p = adj[p].next ; | |
796 } | |
797 if ( cnt <= 1 ) | |
798 continue ; | |
799 | |
800 if ( max != -1 && adj[maxP].ind == to ) | |
801 continue ; | |
802 | |
803 // No other path can reach "to" | |
804 p = adj[from].next ; | |
805 while ( p != -1 ) | |
806 { | |
807 if ( adj[p].ind != to ) | |
808 { | |
809 //if ( sites[k].pos ==43917158 ) | |
810 // printf( "hi %d %d. (%d %d)\n", from, adj[p].ind, exonBlocks[ adj[p].ind ].start, exonBlocks[ adj[p].ind ].end ) ; | |
811 if ( CanReach( adj[p].ind, to, adj, visited ) ) | |
812 break ; | |
813 } | |
814 p = adj[p].next ; | |
815 } | |
816 if ( p != -1 ) | |
817 continue ; | |
818 | |
819 //if ( sites[k].pos == 34073267 ) | |
820 // printf( "hi %d %d. (%d %d)\n", from, to, exonBlocks[to].start, exonBlocks[to].end ) ; | |
821 | |
822 // There are some blocks between that can reach "to" | |
823 for ( i = from + 1 ; i < to ; ++i ) | |
824 { | |
825 p = adj[i].next ; | |
826 while ( p != -1 ) | |
827 { | |
828 if ( adj[p].ind == to && adj[p].support >= 10 ) | |
829 break ; | |
830 p = adj[p].next ; | |
831 } | |
832 if ( p != -1 ) | |
833 break ; | |
834 } | |
835 if ( i >= to ) | |
836 continue ; | |
837 | |
838 // Filter the site | |
839 //printf( "filtered: %d %d\n", sites[k].pos, sites[k].oppositePos ) ; | |
840 sites[k].support = -1 ; | |
841 for ( j = k + 1 ; j < ssize ; ++j ) | |
842 { | |
843 if ( sites[j].pos == sites[k].oppositePos && sites[j].oppositePos == sites[k].pos ) | |
844 sites[j].support = -1 ; | |
845 if ( sites[j].pos > sites[k].oppositePos ) | |
846 break ; | |
847 } | |
848 } | |
849 | |
850 k = 0 ; | |
851 for ( i = 0 ; i < ssize ; ++i ) | |
852 if ( sites[i].support > 0 ) | |
853 { | |
854 sites[k] = sites[i] ; | |
855 ++k ; | |
856 } | |
857 sites.resize( k ) ; | |
858 | |
859 // Filter the intron on the different strand of a gene. | |
860 ssize = k ; | |
861 k = 0 ; | |
862 for ( i = 0 ; i < bsize ; ) | |
863 { | |
864 int farthest = exonBlocks[i].end ; | |
865 | |
866 for ( j = i ; j < bsize ; ++j ) | |
867 { | |
868 if ( exonBlocks[j].start > farthest || exonBlocks[i].chrId != exonBlocks[j].chrId ) | |
869 break ; | |
870 int p = adj[i].next ; | |
871 while ( p != -1 ) | |
872 { | |
873 if ( exonBlocks[ adj[p].ind ].end > farthest ) | |
874 farthest = exonBlocks[ adj[p].ind ].end ; | |
875 p = adj[p].next ; | |
876 } | |
877 } | |
878 | |
879 for ( ; k < ssize ; ++k ) | |
880 { | |
881 if ( sites[k].chrId < exonBlocks[i].chrId || | |
882 ( sites[k].chrId == exonBlocks[i].chrId && sites[k].pos < exonBlocks[i].start ) ) | |
883 continue ; | |
884 break ; | |
885 } | |
886 | |
887 if ( sites[k].chrId > exonBlocks[i].chrId || sites[k].pos > farthest ) | |
888 { | |
889 i = j ; | |
890 continue ; | |
891 } | |
892 //printf( "%d %d. %d %d. %d %d\n", i, j, exonBlocks[i].start, farthest, k, sites[k].pos ) ; | |
893 | |
894 | |
895 int from = k ; | |
896 int to ; | |
897 for ( to = k ; to < ssize ; ++to ) | |
898 if ( sites[to].chrId != exonBlocks[i].chrId || sites[to].pos > farthest ) | |
899 break ; | |
900 | |
901 int strandSupport[2] = {0, 0}; | |
902 int strandCount[2] = {0, 0}; | |
903 for ( k = from ; k < to ; ++k ) | |
904 { | |
905 if ( sites[k].oppositePos <= sites[k].pos ) | |
906 continue ; | |
907 | |
908 int tag = ( sites[k].strand == '+' ? 1 : 0 ) ; | |
909 strandSupport[tag] += sites[k].support ; | |
910 ++strandCount[tag] ; | |
911 } | |
912 | |
913 // Not mixtured. | |
914 if ( strandCount[0] * strandCount[1] == 0 ) | |
915 { | |
916 i = j ; | |
917 k = to ; | |
918 continue ; | |
919 } | |
920 | |
921 char removeStrand = 0 ; | |
922 if ( strandCount[0] == 1 && strandCount[1] >= 3 | |
923 && strandSupport[1] >= 20 * strandSupport[0] && strandSupport[0] <= 5 ) | |
924 { | |
925 removeStrand = '-' ; | |
926 } | |
927 else if ( strandCount[1] == 1 && strandCount[0] >= 3 | |
928 && strandSupport[0] >= 20 * strandSupport[1] && strandSupport[1] <= 5 ) | |
929 { | |
930 removeStrand = '+' ; | |
931 } | |
932 | |
933 if ( removeStrand == 0 ) | |
934 { | |
935 i = j ; k = to ; | |
936 continue ; | |
937 } | |
938 | |
939 for ( k = from ; k < to ; ++k ) | |
940 { | |
941 if ( sites[k].strand == removeStrand && sites[k].oppositePos >= exonBlocks[i].start && sites[k].oppositePos <= farthest ) | |
942 { | |
943 //printf( "filtered: %d %d\n", sites[k].pos, sites[k].oppositePos ) ; | |
944 sites[k].support = -1 ; | |
945 } | |
946 } | |
947 | |
948 i = j ; | |
949 k = to ; | |
950 } | |
951 | |
952 k = 0 ; | |
953 for ( i = 0 ; i < ssize ; ++i ) | |
954 if ( sites[i].support > 0 ) | |
955 { | |
956 sites[k] = sites[i] ; | |
957 ++k ; | |
958 } | |
959 sites.resize( k ) ; | |
960 | |
961 delete[] visited ; | |
962 delete[] siteToBlock ; | |
963 delete[] adj ; | |
964 } | |
965 | |
966 void SplitBlocks( Alignments &alignments, std::vector< struct _splitSite > &splitSites ) | |
967 { | |
968 std::vector<struct _block> rawExonBlocks = exonBlocks ; | |
969 int i, j ; | |
970 int tag = 0 ; | |
971 int bsize = rawExonBlocks.size() ; | |
972 int ssize = splitSites.size() ; | |
973 | |
974 // Make sure not overflow | |
975 struct _splitSite tmp ; | |
976 tmp.pos = -1 ; | |
977 splitSites.push_back( tmp ) ; | |
978 | |
979 exonBlocks.clear() ; | |
980 for ( i = 0 ; i < bsize ; ++i ) | |
981 { | |
982 while ( tag < ssize && ( splitSites[tag].chrId < rawExonBlocks[i].chrId || | |
983 ( splitSites[tag].chrId == rawExonBlocks[i].chrId && splitSites[tag].pos < rawExonBlocks[i].start ) ) ) | |
984 ++tag ; | |
985 | |
986 int l ; | |
987 for ( l = tag ; l < ssize ; ++l ) | |
988 if ( splitSites[l].chrId != rawExonBlocks[i].chrId || splitSites[l].pos > rawExonBlocks[i].end ) | |
989 break ; | |
990 int64_t start = rawExonBlocks[i].start ; | |
991 int64_t end = rawExonBlocks[i].end ; | |
992 //printf( "%lld %lld\n", start, end ) ; | |
993 //printf( "%s %s: %d %d\n", alignments.GetChromName( splitSites[tag].chrId ), alignments.GetChromName( rawExonBlocks[i].chrId ), tag, l ) ; | |
994 for ( j = tag ; j <= l ; ++j ) | |
995 { | |
996 int leftType = 0 ; | |
997 int rightType = 0 ; | |
998 if ( j > tag ) | |
999 { | |
1000 start = splitSites[j - 1].pos ; // end is from previous stage | |
1001 leftType = splitSites[j - 1].type ; | |
1002 } | |
1003 else | |
1004 start = rawExonBlocks[i].start ; | |
1005 | |
1006 if ( j <= l - 1 ) | |
1007 { | |
1008 end = splitSites[j].pos ; | |
1009 rightType = splitSites[j].type ; | |
1010 } | |
1011 else | |
1012 end = rawExonBlocks[i].end ; | |
1013 | |
1014 if ( leftType == 2 ) | |
1015 ++start ; | |
1016 if ( rightType == 1 ) | |
1017 --end ; | |
1018 | |
1019 struct _block tmpB ; | |
1020 tmpB = rawExonBlocks[i] ; | |
1021 tmpB.start = start ; | |
1022 tmpB.end = end ; | |
1023 tmpB.depthSum = 0 ; | |
1024 tmpB.ratio = 0 ; | |
1025 //printf( "\t%lld %lld\n", start, end ) ; | |
1026 tmpB.leftType = leftType ; | |
1027 tmpB.rightType = rightType ; | |
1028 tmpB.depth = NULL ; | |
1029 | |
1030 exonBlocks.push_back( tmpB ) ; | |
1031 // handle the soft boundary is the same as the hard boundary case | |
1032 // or adjacent splice sites | |
1033 /*if ( j == tag && start == end ) | |
1034 { | |
1035 exonBlocks.pop_back() ; | |
1036 --end ; | |
1037 } | |
1038 else if ( j == l && start > end ) | |
1039 { | |
1040 exonBlocks.pop_back() ; | |
1041 }*/ | |
1042 if ( start > end | |
1043 //|| ( tmpB.start == rawExonBlocks[i].start && tmpB.end != rawExonBlocks[i].end && tmpB.end - tmpB.start + 1 <= 7 ) | |
1044 //|| ( tmpB.end == rawExonBlocks[i].end && tmpB.start != rawExonBlocks[i].start && tmpB.end - tmpB.start + 1 <= 7 )) // the case of too short overhang | |
1045 || ( tmpB.leftType == 0 && tmpB.rightType == 2 && tmpB.end - tmpB.start + 1 <= 9 ) | |
1046 || ( tmpB.leftType == 1 && tmpB.rightType == 0 && tmpB.end - tmpB.start + 1 <= 9 ) ) | |
1047 { | |
1048 exonBlocks.pop_back() ; | |
1049 } | |
1050 /*else if ( start == end ) | |
1051 { | |
1052 | |
1053 }*/ | |
1054 } | |
1055 } | |
1056 BuildBlockChrIdOffset() ; | |
1057 } | |
1058 | |
1059 void ComputeDepth( Alignments &alignments ) | |
1060 { | |
1061 // Go through the alignment list again to fill in the depthSum; | |
1062 int i ; | |
1063 int j ; | |
1064 int tag = 0 ; | |
1065 int blockCnt = exonBlocks.size() ; | |
1066 | |
1067 std::vector<struct _block> newExonBlocks ; | |
1068 while ( alignments.Next() ) | |
1069 { | |
1070 int segCnt = alignments.segCnt ; | |
1071 struct _pair *segments = alignments.segments ; | |
1072 | |
1073 while ( tag < blockCnt && ( exonBlocks[tag].chrId < alignments.GetChromId() || | |
1074 ( exonBlocks[tag].chrId == alignments.GetChromId() && exonBlocks[tag].end < segments[0].a ) ) ) | |
1075 { | |
1076 AdjustAndCreateExonBlocks( tag, newExonBlocks ) ; | |
1077 ++tag ; | |
1078 } | |
1079 for ( i = 0 ; i < segCnt ; ++i ) | |
1080 { | |
1081 for ( j = tag ; j < blockCnt && ( exonBlocks[j].chrId == alignments.GetChromId() && exonBlocks[j].start <= segments[i].b ) ; ++j ) | |
1082 { | |
1083 int64_t s = -1, e = -1 ; | |
1084 exonBlocks[j].depthSum += Overlap( segments[i].a, segments[i].b, exonBlocks[j].start, exonBlocks[j].end, s, e ) ; | |
1085 if ( s == -1 ) | |
1086 continue ; | |
1087 | |
1088 if ( exonBlocks[j].depth == NULL ) | |
1089 { | |
1090 int len = exonBlocks[j].end - exonBlocks[j].start + 1 ; | |
1091 exonBlocks[j].depth = new int[len + 1] ; | |
1092 memset( exonBlocks[j].depth, 0, sizeof( int ) * ( len + 1 ) ) ; | |
1093 exonBlocks[j].leftSplice = exonBlocks[j].start ; | |
1094 exonBlocks[j].rightSplice = exonBlocks[j].end ; | |
1095 } | |
1096 int *depth = exonBlocks[j].depth ; | |
1097 ++depth[s - exonBlocks[j].start] ; | |
1098 --depth[e - exonBlocks[j].start + 1] ; // notice the +1 here, since the decrease of coverage actually happens at next base. | |
1099 | |
1100 // record the longest alignment stretch support the splice site. | |
1101 if ( exonBlocks[j].leftType == 1 && segments[i].a == exonBlocks[j].start | |
1102 && e > exonBlocks[j].leftSplice ) | |
1103 { | |
1104 exonBlocks[j].leftSplice = e ; | |
1105 } | |
1106 if ( exonBlocks[j].rightType == 2 && segments[i].b == exonBlocks[j].end | |
1107 && s < exonBlocks[j].rightSplice ) | |
1108 { | |
1109 exonBlocks[j].rightSplice = s ; | |
1110 } | |
1111 } | |
1112 } | |
1113 } | |
1114 | |
1115 for ( ; tag < blockCnt ; ++tag ) | |
1116 AdjustAndCreateExonBlocks( tag, newExonBlocks ) ; | |
1117 exonBlocks.clear() ; | |
1118 | |
1119 // Due to multi-alignment, we may skip some alignments that determines leftSplice and | |
1120 // rightSplice, hence filtered some subexons. As a result, some subexons' anchor subexon will disapper | |
1121 // and we need to change its boundary type. | |
1122 blockCnt = newExonBlocks.size() ; | |
1123 for ( i = 0 ; i < blockCnt ; ++i ) | |
1124 { | |
1125 if ( ( i == 0 && newExonBlocks[i].leftType == 2 ) || | |
1126 ( i > 0 && newExonBlocks[i].leftType == 2 && newExonBlocks[i - 1].end + 1 != newExonBlocks[i].start ) ) | |
1127 { | |
1128 newExonBlocks[i].leftType = 0 ; | |
1129 } | |
1130 | |
1131 if ( ( i == blockCnt - 1 && newExonBlocks[i].rightType == 1 ) || | |
1132 ( i < blockCnt - 1 && newExonBlocks[i].rightType == 1 && | |
1133 newExonBlocks[i].end + 1 != newExonBlocks[i + 1].start ) ) | |
1134 newExonBlocks[i].rightType = 0 ; | |
1135 } | |
1136 exonBlocks = newExonBlocks ; | |
1137 } | |
1138 | |
1139 // If two blocks whose soft boundary are close to each other, we can merge them. | |
1140 void MergeNearBlocks() | |
1141 { | |
1142 std::vector<struct _block> rawExonBlocks = exonBlocks ; | |
1143 int i, k ; | |
1144 int bsize = rawExonBlocks.size() ; | |
1145 int *newIdx = new int[bsize] ; // used for adjust prev and next. Note that by merging, it won't change the number of prevCnt or nextCnt. | |
1146 | |
1147 exonBlocks.clear() ; | |
1148 exonBlocks.push_back( rawExonBlocks[0] ) ; | |
1149 k = 0 ; | |
1150 newIdx[0] = 0 ; | |
1151 for ( i = 1 ; i < bsize ; ++i ) | |
1152 { | |
1153 if ( rawExonBlocks[i].chrId == exonBlocks[k].chrId | |
1154 && rawExonBlocks[i].leftType == 0 && exonBlocks[k].rightType == 0 | |
1155 && rawExonBlocks[i].start - exonBlocks[k].end - 1 <= 50 | |
1156 && rawExonBlocks[i].depthSum / double( rawExonBlocks[i].end - rawExonBlocks[i].start + 1 ) < 3.0 | |
1157 && exonBlocks[k].depthSum / double( exonBlocks[k].end - exonBlocks[i].start + 1 ) < 3.0 ) | |
1158 { | |
1159 exonBlocks[k].end = rawExonBlocks[i].end ; | |
1160 exonBlocks[k].rightType = rawExonBlocks[i].rightType ; | |
1161 exonBlocks[k].rightStrand = rawExonBlocks[i].rightStrand ; | |
1162 exonBlocks[k].depthSum += rawExonBlocks[i].depthSum ; | |
1163 if ( rawExonBlocks[i].rightSplice != -1 ) | |
1164 exonBlocks[k].rightSplice = rawExonBlocks[i].rightSplice ; | |
1165 | |
1166 if ( exonBlocks[k].nextCnt > 0 ) | |
1167 delete[] exonBlocks[k].next ; | |
1168 exonBlocks[k].nextCnt = rawExonBlocks[i].nextCnt ; | |
1169 exonBlocks[k].next = rawExonBlocks[i].next ; | |
1170 } | |
1171 else | |
1172 { | |
1173 exonBlocks.push_back( rawExonBlocks[i] ) ; | |
1174 ++k ; | |
1175 } | |
1176 newIdx[i] = k ; | |
1177 } | |
1178 BuildBlockChrIdOffset() ; | |
1179 bsize = exonBlocks.size() ; | |
1180 for ( i = 0 ; i < bsize ; ++i ) | |
1181 { | |
1182 int cnt = exonBlocks[i].prevCnt ; | |
1183 for ( k = 0 ; k < cnt ; ++k ) | |
1184 exonBlocks[i].prev[k] = newIdx[ exonBlocks[i].prev[k] ] ; | |
1185 | |
1186 cnt = exonBlocks[i].nextCnt ; | |
1187 for ( k = 0 ; k < cnt ; ++k ) | |
1188 exonBlocks[i].next[k] = newIdx[ exonBlocks[i].next[k] ] ; | |
1189 } | |
1190 delete[] newIdx ; | |
1191 } | |
1192 | |
1193 void AddIntronInformation( std::vector<struct _splitSite> &sites, Alignments &alignments ) | |
1194 { | |
1195 // Add the connection information, support information and the strand information. | |
1196 int i, j, k, tag ; | |
1197 tag = 0 ; | |
1198 int scnt = sites.size() ; | |
1199 int exonBlockCnt = exonBlocks.size() ; | |
1200 bool flag ; | |
1201 | |
1202 for ( i = 0 ; i < exonBlockCnt ; ++i ) | |
1203 { | |
1204 exonBlocks[i].prevCnt = exonBlocks[i].nextCnt = 0 ; | |
1205 exonBlocks[i].prev = exonBlocks[i].next = NULL ; | |
1206 exonBlocks[i].leftStrand = exonBlocks[i].rightStrand = '.' ; | |
1207 exonBlocks[i].leftSplice = exonBlocks[i].rightSplice = 0 ; | |
1208 } | |
1209 | |
1210 // Now, we add the region id and compute the length of each region, here the region does not contain possible IR event. | |
1211 int regionId = 0 ; | |
1212 for ( i = 0 ; i < exonBlockCnt ; ) | |
1213 { | |
1214 if ( exonBlocks[i].leftType == 2 && exonBlocks[i].rightType == 1 ) | |
1215 { | |
1216 j = i + 1 ; | |
1217 } | |
1218 else | |
1219 for ( j = i + 1 ; j < exonBlockCnt ; ++j ) | |
1220 if ( exonBlocks[j].start > exonBlocks[j - 1].end + 1 || ( exonBlocks[j].leftType == 2 && exonBlocks[j].rightType == 1 ) ) | |
1221 break ; | |
1222 for ( k = i ; k < j ; ++k ) | |
1223 exonBlocks[k].contigId = regionId ; | |
1224 ++regionId ; | |
1225 i = j ; | |
1226 } | |
1227 int *regionLength = new int[regionId] ; | |
1228 memset( regionLength, 0, sizeof( int ) * regionId ) ; | |
1229 for ( i = 0 ; i < exonBlockCnt ; ++i ) | |
1230 { | |
1231 regionLength[ exonBlocks[i].contigId ] += exonBlocks[i].end - exonBlocks[i].start + 1 ; | |
1232 } | |
1233 | |
1234 for ( i = 0 ; i < scnt ; ) | |
1235 { | |
1236 for ( j = i ; j < scnt ; ++j ) | |
1237 { | |
1238 if ( sites[j].chrId != sites[i].chrId || | |
1239 sites[j].pos != sites[i].pos || | |
1240 sites[j].type != sites[i].type ) | |
1241 break ; | |
1242 } | |
1243 // [i,j-1] are the indices of the sites with same coordinate | |
1244 // It is possible a sites corresponds to 2 strands, then we should only keep one of them. | |
1245 char strand = '.' ; | |
1246 int strandSupport[2] = {0, 0}; | |
1247 for ( k = i ; k < j ; ++k ) | |
1248 { | |
1249 if ( sites[k].strand == '-' ) | |
1250 strandSupport[0] += sites[k].support ; | |
1251 else if ( sites[k].strand == '+' ) | |
1252 strandSupport[1] += sites[k].support ; | |
1253 } | |
1254 if ( strandSupport[0] > strandSupport[1] ) | |
1255 strand = '-' ; | |
1256 else if ( strandSupport[1] > strandSupport[0] ) | |
1257 strand = '+' ; | |
1258 | |
1259 | |
1260 int cnt = j - i ; | |
1261 // Locate the first subexon that can overlap with this site | |
1262 while ( tag < exonBlockCnt ) | |
1263 { | |
1264 if ( ( exonBlocks[tag].chrId == sites[i].chrId && exonBlocks[tag].end >= sites[i].pos ) | |
1265 || exonBlocks[tag].chrId > sites[i].chrId ) | |
1266 break ; | |
1267 ++tag ; | |
1268 } | |
1269 flag = false ; | |
1270 for ( k = tag; k < exonBlockCnt ; ++k ) | |
1271 { | |
1272 if ( exonBlocks[tag].start > sites[i].pos || | |
1273 exonBlocks[tag].chrId > sites[i].chrId ) | |
1274 break ; | |
1275 | |
1276 if ( exonBlocks[tag].start == sites[i].pos || | |
1277 exonBlocks[tag].end == sites[i].pos ) | |
1278 { | |
1279 flag = true ; | |
1280 break ; | |
1281 } | |
1282 } | |
1283 | |
1284 if ( !flag ) | |
1285 { | |
1286 i = j ; | |
1287 continue ; | |
1288 } | |
1289 | |
1290 if ( sites[i].type == 1 && exonBlocks[tag].start == sites[i].pos ) | |
1291 { | |
1292 exonBlocks[tag].prevCnt = 0 ; | |
1293 exonBlocks[tag].prev = new int[cnt] ; | |
1294 exonBlocks[tag].leftStrand = strand ; | |
1295 | |
1296 // And we also need to put the "next" here. | |
1297 // Here we assume the oppositePos sorted in increasing order | |
1298 k = tag - 1 ; | |
1299 for ( int l = j - 1 ; l >= i ; --l ) | |
1300 { | |
1301 for ( ; k >= 0 ; --k ) | |
1302 { | |
1303 if ( exonBlocks[k].end < sites[l].oppositePos || exonBlocks[k].chrId != sites[l].chrId ) | |
1304 break ; | |
1305 if ( exonBlocks[k].end == sites[l].oppositePos ) //&& exonBlocks[k].rightType == sites[l].type ) | |
1306 { | |
1307 if ( sites[l].strand != strand || | |
1308 sites[l].strand != exonBlocks[k].rightStrand ) | |
1309 break ; | |
1310 exonBlocks[k].next[ exonBlocks[k].nextCnt ] = tag ; | |
1311 exonBlocks[tag].prev[ exonBlocks[tag].prevCnt] = k ; // Note that the prev are sorted in decreasing order | |
1312 ++exonBlocks[k].nextCnt ; | |
1313 ++exonBlocks[tag].prevCnt ; | |
1314 double factor = 1.0 ; | |
1315 if ( regionLength[ exonBlocks[k].contigId ] < alignments.readLen ) | |
1316 { | |
1317 int left ; | |
1318 for ( left = k ; left >= 0 ; --left ) | |
1319 if ( exonBlocks[left].contigId != exonBlocks[k].contigId ) | |
1320 break ; | |
1321 ++left ; | |
1322 | |
1323 int prevType = 0 ; | |
1324 // only consider the portion upto k. | |
1325 for ( int m = left ; m <= k ; ++m ) | |
1326 if ( exonBlocks[m].leftType == 1 ) | |
1327 ++prevType ; | |
1328 | |
1329 if ( prevType == 0 ) | |
1330 factor = 2 * (double)alignments.readLen / regionLength[ exonBlocks[k].contigId ] ; | |
1331 } | |
1332 if ( regionLength[ exonBlocks[tag].contigId ] < alignments.readLen ) | |
1333 { | |
1334 int right ; | |
1335 for ( right = tag ; right < exonBlockCnt ; ++right ) | |
1336 if ( exonBlocks[right].contigId != exonBlocks[tag].contigId ) | |
1337 break ; | |
1338 --right ; | |
1339 | |
1340 int nextType = 0 ; | |
1341 for ( int m = tag ; m <= right ; ++m ) | |
1342 if ( exonBlocks[m].rightType == 2 ) | |
1343 ++nextType ; | |
1344 | |
1345 if ( nextType == 0 ) | |
1346 factor = 2 * (double)alignments.readLen / regionLength[ exonBlocks[tag].contigId ] ; | |
1347 } | |
1348 if ( factor > 10 ) | |
1349 factor = 10 ; | |
1350 | |
1351 if ( exonBlocks[k].contigId != exonBlocks[tag].contigId ) // the support of introns between regions. | |
1352 { | |
1353 // Adjust the support if one of the anchor exon is too short. | |
1354 // This avoid the creation of alternative 3'/5' end due to low coverage from too short anchor. | |
1355 exonBlocks[tag].leftSplice += sites[l].support * factor ; | |
1356 exonBlocks[k].rightSplice += sites[l].support * factor ; | |
1357 } | |
1358 break ; | |
1359 } | |
1360 } | |
1361 } | |
1362 } | |
1363 else if ( sites[i].type == 2 && exonBlocks[tag].end == sites[i].pos ) | |
1364 { | |
1365 exonBlocks[tag].nextCnt = 0 ; // cnt ; it should reach cnt after putting the ids in | |
1366 exonBlocks[tag].next = new int[cnt] ; | |
1367 exonBlocks[tag].rightStrand = strand ; | |
1368 } | |
1369 | |
1370 i = j ; | |
1371 } | |
1372 // Reverse the prev list | |
1373 for ( i = 0 ; i < exonBlockCnt ; ++i ) | |
1374 { | |
1375 for ( j = 0, k = exonBlocks[i].prevCnt - 1 ; j < k ; ++j, --k ) | |
1376 { | |
1377 int tmp = exonBlocks[i].prev[j] ; | |
1378 exonBlocks[i].prev[j] = exonBlocks[i].prev[k] ; | |
1379 exonBlocks[i].prev[k] = tmp ; | |
1380 } | |
1381 } | |
1382 | |
1383 // If all of the subexon anchored the intron are filtered, we need to change the type of the other anchor. | |
1384 for ( i = 0 ; i < exonBlockCnt ; ++i ) | |
1385 { | |
1386 if ( exonBlocks[i].leftType == 1 && exonBlocks[i].prevCnt == 0 ) | |
1387 { | |
1388 exonBlocks[i].leftType = 0 ; | |
1389 exonBlocks[i].leftStrand = '.' ; | |
1390 if ( i > 0 && exonBlocks[i - 1].rightType == 1 ) | |
1391 exonBlocks[i - 1].rightType = 0 ; | |
1392 } | |
1393 if ( exonBlocks[i].rightType == 2 && exonBlocks[i].nextCnt == 0 ) | |
1394 { | |
1395 exonBlocks[i].rightType = 0 ; | |
1396 exonBlocks[i].rightStrand = '.' ; | |
1397 if ( i < exonBlockCnt - 1 && exonBlocks[i + 1].leftType == 2 ) | |
1398 exonBlocks[i + 1].leftType = 0 ; | |
1399 } | |
1400 } | |
1401 MergeNearBlocks() ; | |
1402 | |
1403 delete[] regionLength ; | |
1404 } | |
1405 | |
1406 double PickLeftAndRightRatio( double l, double r ) | |
1407 { | |
1408 if ( l >= 0 && r >= 0 ) | |
1409 return l < r ? l : r ; | |
1410 else if ( l < 0 && r < 0 ) | |
1411 return -1 ; | |
1412 else if ( l < 0 ) | |
1413 return r ; | |
1414 else | |
1415 return l ; | |
1416 } | |
1417 | |
1418 double PickLeftAndRightRatio( const struct _block &b ) | |
1419 { | |
1420 return PickLeftAndRightRatio( b.leftRatio, b.rightRatio ) ; | |
1421 } | |
1422 | |
1423 void ComputeRatios() | |
1424 { | |
1425 int i, j ; | |
1426 int exonBlockCnt = exonBlocks.size() ; | |
1427 for ( i = 0 ; i < exonBlockCnt ; ++i ) | |
1428 { | |
1429 exonBlocks[i].leftRatio = -1 ; | |
1430 exonBlocks[i].rightRatio = -1 ; | |
1431 if ( exonBlocks[i].leftType == 2 && exonBlocks[i].rightType == 1 ) | |
1432 { | |
1433 // ]...[ | |
1434 double anchorAvg = 0 ; | |
1435 double avgDepth = GetAvgDepth( exonBlocks[i] ) ; | |
1436 if ( i >= 1 && exonBlocks[i - 1].chrId == exonBlocks[i].chrId ) | |
1437 { | |
1438 anchorAvg = GetAvgDepth( exonBlocks[i - 1] ) ; | |
1439 if ( anchorAvg > 1 && avgDepth > 1 ) | |
1440 exonBlocks[i].leftRatio = ( avgDepth - 1 ) / ( anchorAvg - 1 ) ; | |
1441 } | |
1442 if ( i < exonBlockCnt - 1 && exonBlocks[i + 1].chrId == exonBlocks[i].chrId ) | |
1443 { | |
1444 anchorAvg = GetAvgDepth( exonBlocks[i + 1] ) ; | |
1445 if ( anchorAvg > 1 && avgDepth > 1 ) | |
1446 exonBlocks[i].rightRatio = ( avgDepth - 1 ) / ( anchorAvg - 1 ) ; | |
1447 } | |
1448 } | |
1449 | |
1450 if ( exonBlocks[i].leftType == 0 && exonBlocks[i].rightType == 1 ) | |
1451 { | |
1452 // (..[ , overhang subexon. | |
1453 double avgDepth = GetAvgDepth( exonBlocks[i] ) ; | |
1454 double anchorAvg = GetAvgDepth( exonBlocks[i + 1] ) ; | |
1455 if ( avgDepth > 1 && anchorAvg > 1 ) | |
1456 exonBlocks[i].rightRatio = ( avgDepth - 1 ) / ( anchorAvg - 1 ) ; | |
1457 } | |
1458 | |
1459 /*if ( exonBlocks[i].leftType == 1 ) | |
1460 { | |
1461 // For the case (...[, the leftRatio is actuall the leftratio of the subexon on its right. | |
1462 int len = 0 ; | |
1463 double depthSum = 0 ; | |
1464 int tag = i ; | |
1465 | |
1466 for ( j = 0 ; j < exonBlocks[tag].prevCnt ; ++j ) | |
1467 { | |
1468 int k = exonBlocks[tag].prev[j] ; | |
1469 len += ( exonBlocks[k].end - exonBlocks[k].start + 1 ) ; | |
1470 depthSum += exonBlocks[k].depthSum ; | |
1471 } | |
1472 double otherAvgDepth = depthSum / len ; | |
1473 double avgDepth = GetAvgDepth( exonBlocks[tag] ) ; | |
1474 | |
1475 // adjust avgDepth by looking into the following exon blocks(subexon) in the same region whose left side is not hard end | |
1476 for ( j = tag + 1 ; j < exonBlockCnt ; ++j ) | |
1477 { | |
1478 if ( exonBlocks[j].start != exonBlocks[j - 1].end + 1 || exonBlocks[j].leftType == 1 ) | |
1479 break ; | |
1480 double tmp = GetAvgDepth( exonBlocks[j] ) ; | |
1481 if ( tmp > avgDepth ) | |
1482 avgDepth = tmp ; | |
1483 } | |
1484 | |
1485 if ( avgDepth < 1 ) | |
1486 avgDepth = 1 ; | |
1487 if ( otherAvgDepth < 1 ) | |
1488 otherAvgDepth = 1 ; | |
1489 | |
1490 //exonBlocks[i].leftRatio = pow( log( avgDepth ) / log(2.0), 2.0 / 3.0 ) - pow( log( otherAvgDepth ) / log( 2.0 ), 2.0 / 3.0 ); | |
1491 exonBlocks[i].leftRatio = sqrt( avgDepth ) - log( avgDepth ) - ( sqrt( otherAvgDepth ) + log( otherAvgDepth ) ) ; | |
1492 | |
1493 }*/ | |
1494 if ( exonBlocks[i].rightType == 0 && exonBlocks[i].leftType == 2 ) | |
1495 { | |
1496 // for overhang subexon. | |
1497 double avgDepth = GetAvgDepth( exonBlocks[i] ) ; | |
1498 double anchorAvg = GetAvgDepth( exonBlocks[i - 1] ) ; | |
1499 if ( avgDepth > 1 && anchorAvg > 1 ) | |
1500 exonBlocks[i].leftRatio = ( avgDepth - 1 ) / ( anchorAvg - 1 ) ; | |
1501 | |
1502 } | |
1503 /*if ( exonBlocks[i].rightType == 2 ) | |
1504 { | |
1505 int len = 0 ; | |
1506 double depthSum = 0 ; | |
1507 int tag = i ; | |
1508 | |
1509 for ( j = 0 ; j < exonBlocks[tag].nextCnt ; ++j ) | |
1510 { | |
1511 int k = exonBlocks[tag].next[j] ; | |
1512 len += ( exonBlocks[k].end - exonBlocks[k].start + 1 ) ; | |
1513 depthSum += exonBlocks[k].depthSum ; | |
1514 } | |
1515 double otherAvgDepth = depthSum / len ; | |
1516 double avgDepth = GetAvgDepth( exonBlocks[tag] ) ; | |
1517 | |
1518 // adjust avgDepth by looking into the earlier exon blocks(subexon) in the same region whose right side is not hard end | |
1519 for ( j = tag - 1 ; j >= 0 ; --j ) | |
1520 { | |
1521 if ( exonBlocks[j].end + 1 != exonBlocks[j + 1].start || exonBlocks[j].rightType == 2 ) | |
1522 break ; | |
1523 double tmp = GetAvgDepth( exonBlocks[j] ) ; | |
1524 if ( tmp > avgDepth ) | |
1525 avgDepth = tmp ; | |
1526 } | |
1527 | |
1528 if ( avgDepth < 1 ) | |
1529 avgDepth = 1 ; | |
1530 if ( otherAvgDepth < 1 ) | |
1531 otherAvgDepth = 1 ; | |
1532 | |
1533 exonBlocks[i].rightRatio = sqrt( avgDepth ) - log( avgDepth ) - ( sqrt( otherAvgDepth ) + log( otherAvgDepth ) ) ; | |
1534 //pow( log( avgDepth ) / log(2.0), 2.0 / 3.0 ) - pow( log( otherAvgDepth ) / log(2.0), 2.0 / 3.0 ); | |
1535 }*/ | |
1536 // The remaining case the islands, (...) | |
1537 } | |
1538 | |
1539 // go through each region of subexons, and only compute the ratio for the first and last hard boundary | |
1540 for ( i = 0 ; i < exonBlockCnt ; ) | |
1541 { | |
1542 // the blocks from [i,j) corresponds to a region, namely consecutive subexons. | |
1543 for ( j = i + 1 ; j < exonBlockCnt ; ++j ) | |
1544 if ( exonBlocks[j].contigId != exonBlocks[i].contigId ) | |
1545 break ; | |
1546 | |
1547 int leftSupport = 0 ; | |
1548 int rightSupport = 0 ; | |
1549 int leftTag = j, rightTag = i - 1 ; | |
1550 for ( int k = i ; k < j ; ++k ) | |
1551 { | |
1552 if ( exonBlocks[k].leftType == 1 && exonBlocks[k].leftSplice > 0 ) | |
1553 { | |
1554 leftSupport += exonBlocks[k].leftSplice ; | |
1555 if ( k < leftTag ) | |
1556 leftTag = k ; | |
1557 } | |
1558 if ( exonBlocks[k].rightType == 2 && exonBlocks[k].rightSplice > 0 ) | |
1559 { | |
1560 rightSupport += exonBlocks[k].rightSplice ; | |
1561 if ( k > rightTag ) | |
1562 rightTag = k ; | |
1563 } | |
1564 } | |
1565 | |
1566 if ( leftSupport > 0 && rightSupport > 0 ) | |
1567 { | |
1568 // when the right splice support is much greater than that of the left side, there should be a soft boundary for the left side. | |
1569 // Wether we want to include this soft boundary or not will be determined in class, when it looked at whether the overhang block | |
1570 // should be kept or not. | |
1571 exonBlocks[ leftTag ].leftRatio = sqrt( rightSupport ) - log( ( rightSupport + leftSupport ) * 0.5 ) - ( sqrt( leftSupport ) ) ; //+ log( leftSupport ) ) ; | |
1572 exonBlocks[ rightTag ].rightRatio = sqrt( leftSupport ) - log( ( rightSupport + leftSupport ) * 0.5 ) - ( sqrt( rightSupport ) ) ; //+ log( rightSupport ) ) ; | |
1573 } | |
1574 | |
1575 i = j ; // don't forget this. | |
1576 } | |
1577 } | |
1578 } ; | |
1579 | |
1580 #endif |