#include "TranscriptDecider.hpp"

void TranscriptDecider::OutputTranscript( int sampleId, struct _subexon *subexons, struct _transcript &transcript )
{
	int i, j ;
	// Collect the subexon indices used by this transcript.
	std::vector<int> subexonInd ;
	transcript.seVector.GetOnesIndices( subexonInd ) ;

	// Determine the strand
	char strand[2] = "." ;
	int size = subexonInd.size() ;
	if ( size > 1 )
	{
		// locate the introns that show up in this transcript.
		for ( i = 0 ; i < size - 1 ; ++i )
		{
			/*int nextCnt = subexons[ subexonInd[i] ].nextCnt ;
			if ( nextCnt == 0 )
				continue ;

			for ( j = 0 ; j < nextCnt ; ++j )
			{
				int a = subexons[ subexonInd[i] ].next[j] ;
				if ( subexonInd[i + 1] == a
					&& subexons[ subexonInd[i] ].end + 1 < subexons[a].start ) // avoid the case like ..(...[...
				{
					break ;
				}
			}
			if ( j < nextCnt )*/

			if ( subexons[ subexonInd[i] ].end + 1 < subexons[ subexonInd[i + 1] ].start )
			{
				if ( subexons[ subexonInd[i] ].rightStrand == 1 )
					strand[0] = '+' ;
				else if ( subexons[ subexonInd[i] ].rightStrand == -1 )
					strand[0] = '-' ;
				break ;
			}
		}
	}

	// TODO: transcript_id
	char *chrom = alignments.GetChromName( subexons[0].chrId ) ;
	char prefix[10] = "" ;
	struct _subexon *catSubexons = new struct _subexon[ size + 1 ] ;
	// Concatenate adjacent subexons
	catSubexons[0] = subexons[ subexonInd[0] ] ;
	j = 1 ;
	for ( i = 1 ; i < size ; ++i )
	{
		if ( subexons[ subexonInd[i] ].start == catSubexons[j - 1].end + 1 )
		{
			catSubexons[j - 1].end = subexons[ subexonInd[i] ].end ;
		}
		else
		{
			catSubexons[j] = subexons[ subexonInd[i] ] ;
			++j ;
		}
	}
	size = j ;

	int gid = GetTranscriptGeneId( subexonInd, subexons ) ;
	if ( 0 ) //numThreads <= 1 )
	{
		fprintf( outputFPs[sampleId], "%s\tCLASSES\ttranscript\t%d\t%d\t1000\t%s\t.\tgene_id \"%s%s.%d\"; transcript_id \"%s%s.%d.%d\"; Abundance \"%.6lf\";\n",
			chrom, catSubexons[0].start + 1, catSubexons[size - 1].end + 1, strand,
			prefix, chrom, gid,
			prefix, chrom, gid, transcriptId[ gid - baseGeneId ], transcript.FPKM ) ;
		for ( i = 0 ; i < size ; ++i )
		{
			fprintf( outputFPs[ sampleId ], "%s\tCLASSES\texon\t%d\t%d\t1000\t%s\t.\tgene_id \"%s%s.%d\"; "
				"transcript_id \"%s%s.%d.%d\"; exon_number \"%d\"; Abundance \"%.6lf\"\n",
				chrom, catSubexons[i].start + 1, catSubexons[i].end + 1, strand,
				prefix, chrom, gid,
				prefix, chrom, gid, transcriptId[ gid - baseGeneId ],
				i + 1, transcript.FPKM ) ;
		}
	}
	else
	{
		struct _outputTranscript t ;
		int len = 0 ;
		t.chrId = subexons[0].chrId ;
		t.geneId = gid ;
		t.transcriptId = transcriptId[ gid - baseGeneId ] ;
		t.FPKM = transcript.FPKM ;
		t.sampleId = sampleId ;
		t.exons = new struct _pair32[size] ;
		for ( i = 0 ; i < size ; ++i )
		{
			t.exons[i].a = catSubexons[i].start + 1 ;
			t.exons[i].b = catSubexons[i].end + 1 ;
			len += t.exons[i].b - t.exons[i].a + 1 ;
		}
		t.cov = transcript.abundance * alignments.readLen / len ;
		t.ecnt = size ;
		t.strand = strand[0] ;
		//printf( "%lf\n", transcript.correlationScore ) ;

		if ( numThreads > 1 )
			outputHandler->Add( t ) ;
		else
			outputHandler->Add_SingleThread( t ) ;
	}
	++transcriptId[ gid - baseGeneId ] ;

	delete[] catSubexons ;
}

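// Union-find "find" with path compression: return the root of f and flatten
// the chain so later lookups are amortized O(1).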
int TranscriptDecider::GetFather( int f, int *father )
{
	if ( father[f] != f )
		return father[f] = GetFather( father[f], father ) ;
	return f ;
}

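// Determine the gene id of a transcript from the subexons it uses.
// A geneId of -2 marks a subexon in a mixture region that has not been
// assigned to a gene yet.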
int TranscriptDecider::GetTranscriptGeneId( std::vector<int> &subexonInd, struct _subexon *subexons )
{
	int i ;
	int size = subexonInd.size() ;

	for ( i = 0 ; i < size ; ++i )
		if ( subexons[ subexonInd[i] ].geneId != -2 )
			return subexons[ subexonInd[i] ].geneId ;

	// An extreme case, where all the regions are mixture regions.
	for ( i = 0 ; i < size - 1 ; ++i )
		if ( subexons[ subexonInd[i] ].end + 1 < subexons[ subexonInd[i + 1] ].start )
		{
			return defaultGeneId[ ( subexons[ subexonInd[i] ].rightStrand + 1 ) / 2 ] ;
		}
	return defaultGeneId[0] ;
}

int TranscriptDecider::GetTranscriptGeneId( struct _transcript &t, struct _subexon *subexons )
{
	if ( subexons[ t.first ].geneId != -2 )
		return subexons[ t.first ].geneId ;
	if ( subexons[ t.last ].geneId != -2 )
		return subexons[ t.last ].geneId ;
	std::vector<int> subexonInd ;
	t.seVector.GetOnesIndices( subexonInd ) ;
	return GetTranscriptGeneId( subexonInd, subexons ) ;
}

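// Reset the per-gene transcript id counters.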
void TranscriptDecider::InitTranscriptId()
{
	int i ;
	for ( i = 0 ; i < usedGeneId - baseGeneId ; ++i )
		transcriptId[i] = 0 ;
}

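// Check whether subexon "tag" starts a contiguous block of subexons whose
// outgoing (right) introns use a strand that is absent from the block's
// incoming (left) introns while the opposite strand is present on the left,
// i.e. whether the block begins a mixture-strand region.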
bool TranscriptDecider::IsStartOfMixtureStrandRegion( int tag, struct _subexon *subexons, int seCnt )
{
	int j, k ;
	int leftStrandCnt[2] = {0, 0}, rightStrandCnt[2] = {0, 0} ;
	for ( j = tag + 1 ; j < seCnt ; ++j )
		if ( subexons[j].start > subexons[j - 1].end + 1 )
			break ;

	for ( k = tag ; k < j ; ++k )
	{
		if ( subexons[k].leftStrand != 0 )
			++leftStrandCnt[ ( subexons[k].leftStrand + 1 ) / 2 ] ;
		if ( subexons[k].rightStrand != 0 )
			++rightStrandCnt[ ( subexons[k].rightStrand + 1 ) / 2 ] ;
	}

	if ( rightStrandCnt[0] > 0 && leftStrandCnt[0] == 0 && leftStrandCnt[1] > 0 )
		return true ;
	if ( rightStrandCnt[1] > 0 && leftStrandCnt[1] == 0 && leftStrandCnt[0] > 0 )
		return true ;
	return false ;
}

// Return 0 - incompatible or does not overlap at all. 1 - fully compatible. 2 - head of the constraint compatible with the tail of the transcript.
// The partially compatible case (return 2) most likely happens in DP where we have a partial transcript.
int TranscriptDecider::IsConstraintInTranscript( struct _transcript transcript, struct _constraint &c )
{
	//printf( "%d %d, %d %d\n", c.first, c.last, transcript.first, transcript.last ) ;
	if ( c.first < transcript.first || c.first > transcript.last
		|| !transcript.seVector.Test( c.first )
		|| ( !transcript.partial && !transcript.seVector.Test( c.last ) ) ) // no overlap, starts too early, or some chosen subexon is not compatible
		return 0 ;

	// Extract the subexons we should focus on.
	int s, e ;
	s = c.first ;
	e = c.last ;
	bool returnPartial = false ;
	if ( e > transcript.last ) // the constraint ends after the transcript.
	{
		if ( transcript.partial )
		{
			e = transcript.last ;
			returnPartial = true ;
		}
		else
			return 0 ;
	}
	/*printf( "%s: %d %d: (%d %d) (%d %d)\n", __func__, s, e,
		transcript.seVector.Test(0), transcript.seVector.Test(1),
		c.vector.Test(0), c.vector.Test(1) ) ;*/

	compatibleTestVectorT.Assign( transcript.seVector ) ;
	//compatibleTestVectorT.MaskRegionOutsideInRange( s, e, transcript.first, transcript.last ) ;
	compatibleTestVectorT.MaskRegionOutside( s, e ) ;

	compatibleTestVectorC.Assign( c.vector ) ;
	if ( c.last > transcript.last )
	{
		//compatibleTestVectorC.MaskRegionOutsideInRange( s, e, c.first, c.last ) ;
		//compatibleTestVectorC.MaskRegionOutside( s, e ) ;
		compatibleTestVectorC.MaskRegionOutside( 0, e ) ; // Because the bits before s are already all 0s in C.
	}
	/*printf( "after masking %d %d. %d %d %d %d:\n", s, e, transcript.first, transcript.last, c.first, c.last ) ;
	compatibleTestVectorT.Print() ;
	compatibleTestVectorC.Print() ; */
	// Test compatibility.
	int ret = 0 ;
	if ( compatibleTestVectorT.IsEqual( compatibleTestVectorC ) )
	{
		if ( returnPartial )
			ret = 2 ;
		else
			ret = 1 ;
	}

	return ret ;
}

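// Debug variant of IsConstraintInTranscript that prints the masked bit vectors
// and the return value; it skips the seVector pre-checks of the regular version.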
int TranscriptDecider::IsConstraintInTranscriptDebug( struct _transcript transcript, struct _constraint &c )
{
	//printf( "%d %d, %d %d\n", c.first, c.last, transcript.first, transcript.last ) ;
	if ( c.first < transcript.first || c.first > transcript.last ) // no overlap or starts too early.
		return 0 ;
	printf( "hi\n" ) ;
	// Extract the subexons we should focus on.
	int s, e ;
	s = c.first ;
	e = c.last ;
	bool returnPartial = false ;
	if ( e > transcript.last ) // the constraint ends after the transcript.
	{
		if ( transcript.partial )
		{
			e = transcript.last ;
			returnPartial = true ;
		}
		else
			return 0 ;
	}
	/*printf( "%s: %d %d: (%d %d) (%d %d)\n", __func__, s, e,
		transcript.seVector.Test(0), transcript.seVector.Test(1),
		c.vector.Test(0), c.vector.Test(1) ) ;*/

	compatibleTestVectorT.Assign( transcript.seVector ) ;
	compatibleTestVectorT.MaskRegionOutside( s, e ) ;

	compatibleTestVectorC.Assign( c.vector ) ;
	if ( e > transcript.last )
		compatibleTestVectorC.MaskRegionOutside( s, e ) ;
	/*printf( "after masking: (%d %d) (%d %d)\n",
		compatibleTestVectorT.Test(0), compatibleTestVectorT.Test(1),
		compatibleTestVectorC.Test(0), compatibleTestVectorC.Test(1) ) ;*/

	// Test compatibility.
	int ret = 0 ;
	if ( compatibleTestVectorT.IsEqual( compatibleTestVectorC ) )
	{
		if ( returnPartial )
			ret = 2 ;
		else
			ret = 1 ;
	}
	compatibleTestVectorT.Print() ;
	compatibleTestVectorC.Print() ;
	printf( "ret=%d\n", ret ) ;
	return ret ;
}
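// Count the number of distinct (sub)transcripts that can start at subexon
// "tag" by following next[] edges; results are memoized in f[].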
int TranscriptDecider::SubTranscriptCount( int tag, struct _subexon *subexons, int *f )
{
	if ( f[tag] != -1 )
		return f[tag] ;

	int ret = 0 ;
	int i ;
	if ( subexons[tag].canBeEnd )
		ret = 1 ;
	for ( i = 0 ; i < subexons[tag].nextCnt ; ++i )
	{
		ret += SubTranscriptCount( subexons[tag].next[i], subexons, f ) ;
	}

	if ( ret == 0 )
		ret = 1 ;
	return f[tag] = ret ;
}

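// Merge transcripts that use exactly the same set of subexons, summing their
// abundances and releasing the duplicates' bit vectors.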
void TranscriptDecider::CoalesceSameTranscripts( std::vector<struct _transcript> &t )
{
	int i, k ;
	if ( t.size() == 0 )
		return ;

	std::sort( t.begin(), t.end(), CompSortTranscripts ) ;

	int size = t.size() ;
	k = 0 ;
	for ( i = 1 ; i < size ; ++i )
	{
		if ( t[k].seVector.IsEqual( t[i].seVector ) )
		{
			t[k].abundance += t[i].abundance ;
			t[i].seVector.Release() ;
		}
		else
		{
			++k ;
			if ( i != k )
				t[k] = t[i] ;
		}
	}
	t.resize( k + 1 ) ;
}

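// Depth-first enumeration of all full transcripts reachable from subexon
// "tag". "visit" holds the current path, minCor tracks the smallest pairwise
// subexon correlation along the path, and every subexon that canBeEnd emits
// one transcript into alltranscripts.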
void TranscriptDecider::EnumerateTranscript( int tag, int strand, int visit[], int vcnt, struct _subexon *subexons, SubexonCorrelation &correlation, double correlationScore, std::vector<struct _transcript> &alltranscripts, int &atcnt )
{
	int i ;
	visit[ vcnt ] = tag ;
	//printf( "%s: %d %d %d %d. %d %d\n", __func__, vcnt, tag, subexons[tag].nextCnt, strand, subexons[tag].start, subexons[tag].end ) ;
	// Compute the correlation score
	double minCor = correlationScore ;
	for ( i = 0 ; i < vcnt ; ++i )
	{
		double tmp = correlation.Query( visit[i], visit[vcnt] ) ;
		if ( tmp < minCor )
			minCor = tmp ;
	}

	if ( subexons[tag].canBeEnd )
	{
		struct _transcript &txpt = alltranscripts[atcnt] ;
		for ( i = 0 ; i <= vcnt ; ++i )
			txpt.seVector.Set( visit[i] ) ;

		txpt.first = visit[0] ;
		txpt.last = visit[vcnt] ;
		txpt.partial = false ;
		txpt.correlationScore = minCor ;

		//printf( "%lf %d %d ", txpt.correlationScore, vcnt, visit[0] ) ;
		//txpt.seVector.Print() ;
		++atcnt ;
	}

	for ( i = 0 ; i < subexons[tag].nextCnt ; ++i )
	{
		int a = subexons[tag].next[i] ;
		if ( !SubexonGraph::IsSameStrand( subexons[tag].rightStrand, strand )
			&& subexons[a].start > subexons[tag].end + 1 )
			continue ;
		int backupStrand = strand ;
		if ( subexons[a].start > subexons[tag].end + 1 && strand == 0 )
			strand = subexons[tag].rightStrand ;
		EnumerateTranscript( subexons[tag].next[i], strand, visit, vcnt + 1, subexons, correlation, minCor, alltranscripts, atcnt ) ;
		strand = backupStrand ;
	}
}

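// One extension step of the DP: append subexon "tag" to the chain "visit" that
// hangs off the already-fixed prefix "parents". When no constraint in
// "extends" reaches past "tag" anymore, the remainder is solved independently
// through SolveSubTranscript(); otherwise the search keeps following next[]
// edges. The best result (cover, subexon set, boundaries, strand) is folded
// into pdp.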
void TranscriptDecider::SearchSubTranscript( int tag, int strand, int parents[], int pcnt, struct _dp &pdp, int visit[], int vcnt, int extends[], int extendCnt,
	std::vector<struct _constraint> &tc, int tcStartInd, struct _dpAttribute &attr )
{
	int i ;
	int size ;
	double cover ;
	bool keepSearch = true ;
	bool belowMin = false ;

	struct _subexon *subexons = attr.subexons ;

	visit[vcnt] = tag ;
	++vcnt ;
	struct _dp visitdp ;

	visitdp.cover = -1 ;

	struct _transcript &subTxpt = attr.bufferTxpt ;
	subTxpt.seVector.Reset() ;
	for ( i = 0 ; i < pcnt ; ++i )
		subTxpt.seVector.Set( parents[i] ) ;
	subTxpt.first = parents[0] ;
	subTxpt.last = parents[ pcnt - 1 ] ;
	for ( i = 0 ; i < vcnt ; ++i )
		subTxpt.seVector.Set( visit[i] ) ;
	subTxpt.last = visit[ vcnt - 1 ] ;
	subTxpt.partial = true ;

	// Adjust the extendCnt
	/*printf( "%s: %d %d %d\n", __func__, vcnt , extendCnt, extends[ extendCnt - 1] ) ;
	subTxpt.seVector.Print() ;
	tc[extends[extendCnt - 1]].vector.Print() ;
	printf( "Adjust extend:\n") ;*/
	for ( i = extendCnt - 1 ; i >= 0 ; --i )
	{
		if ( tc[ extends[i] ].last <= tag || ( tc[ extends[i] ].vector.Test( tag ) && IsConstraintInTranscript( subTxpt, tc[ extends[i] ] ) != 0 ) )
			break ;
	}
	extendCnt = i + 1 ;

	// If the extension ends.
	subTxpt.partial = false ;
	if ( subexons[tag].nextCnt > 0 && ( extendCnt == 0 || tag >= tc[ extends[ extendCnt - 1 ] ].last ) )
	{
		// Solve the subtranscript beginning with visit.
		// Now we get the optimal transcript for visit.
		visitdp = SolveSubTranscript( visit, vcnt, strand, tc, tcStartInd, attr ) ;
		keepSearch = false ;
	}
	//printf( "%s %d %d: visitdp.cover=%lf\n", __func__, parents[0], tag, visitdp.cover ) ;

	// the constraints across the parents and visit.
	size = tc.size() ;
	if ( visitdp.cover >= 0 )
	{
		cover = visitdp.cover ;
		// Reset the subTxpt, since its content is modified in the SolveSubTranscript call above.
		subTxpt.seVector.Reset() ;
		for ( i = 0 ; i < pcnt ; ++i )
			subTxpt.seVector.Set( parents[i] ) ;
		subTxpt.seVector.Or( visitdp.seVector ) ;
		subTxpt.first = parents[0] ;
		subTxpt.last = visitdp.last ;
		subTxpt.partial = false ;

		if ( !attr.forAbundance && attr.minAbundance > 0 )
		{
			for ( i = 0 ; i < pcnt - 1 ; ++i )
			{
				if ( attr.uncoveredPair.find( parents[i] * attr.seCnt + parents[i + 1] ) != attr.uncoveredPair.end() )
					belowMin = true ;
			}
			for ( i = -1 ; i < vcnt - 1 ; ++i )
			{
				if ( i == -1 && pcnt >= 1 )
				{
					if ( attr.uncoveredPair.find( parents[pcnt - 1] * attr.seCnt + visit[0] ) != attr.uncoveredPair.end() )
						belowMin = true ;
				}
				else
				{
					if ( attr.uncoveredPair.find( visit[i] * attr.seCnt + visit[i + 1] ) != attr.uncoveredPair.end() )
						belowMin = true ;
				}
			}
			if ( attr.forAbundance && belowMin )
				cover = 1e-6 ;
		}

		for ( i = tcStartInd ; i < size ; ++i )
		{
			if ( tc[i].first > parents[ pcnt - 1 ] )
				break ;

			if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 1 )
			{
				if ( tc[i].normAbund <= attr.minAbundance )
				{
					belowMin = true ;
					cover = -2 ;
					break ;
				}

				if ( tc[i].abundance <= 0 )
					continue ;

				if ( attr.forAbundance )
				{
					if ( tc[i].normAbund < cover || cover == 0 )
						cover = tc[i].normAbund ;
				}
				else
				{
					++cover ;
				}
			}
		}
		if ( belowMin && pdp.cover == -1 )
		{
			pdp.cover = -2 ;
			pdp.seVector.Assign( subTxpt.seVector ) ;
			pdp.first = subTxpt.first ;
			pdp.last = subTxpt.last ;
			pdp.strand = strand ;
		}
		else if ( cover > pdp.cover )
		{
			pdp.cover = cover ;
			pdp.seVector.Assign( subTxpt.seVector ) ;
			pdp.first = subTxpt.first ;
			pdp.last = subTxpt.last ;
			pdp.strand = strand ;
		}
	}
	else if ( visitdp.cover == -2 && pdp.cover == -1 ) // no valid extension from visit
	{
		subTxpt.seVector.Reset() ;
		for ( i = 0 ; i < pcnt ; ++i )
			subTxpt.seVector.Set( parents[i] ) ;
		subTxpt.seVector.Or( visitdp.seVector ) ;
		subTxpt.first = parents[0] ;
		subTxpt.last = visitdp.last ;

		pdp.cover = -2 ;
		pdp.seVector.Assign( subTxpt.seVector ) ;
		pdp.first = subTxpt.first ;
		pdp.last = subTxpt.last ;
		pdp.strand = strand ;
	}

	if ( subexons[tag].canBeEnd && ( visitdp.cover < 0 || attr.forAbundance ) )
		// This works because the extension always covers more constraints, so we only take this branch
		// if the extension does not work or if it violates minAbundance.
		// We also need to come here when we want to compute the maxAbundance transcript.
		// This part also works as the exit point of the recursive function.
	{
		bool belowMin = false ;
		subTxpt.seVector.Reset() ;
		for ( i = 0 ; i < pcnt ; ++i )
			subTxpt.seVector.Set( parents[i] ) ;
		for ( i = 0 ; i < vcnt ; ++i )
			subTxpt.seVector.Set( visit[i] ) ;
		subTxpt.first = parents[0] ;
		subTxpt.last = visit[ vcnt - 1 ] ;
		subTxpt.partial = false ;

		cover = 0 ;
		if ( attr.forAbundance || attr.minAbundance > 0 )
		{
			for ( i = 0 ; i < pcnt - 1 ; ++i )
			{
				if ( attr.uncoveredPair.find( parents[i] * attr.seCnt + parents[i + 1] ) != attr.uncoveredPair.end() )
					belowMin = true ;
			}
			for ( i = -1 ; i < vcnt - 1 ; ++i )
			{
				if ( i == -1 && pcnt >= 1 )
				{
					if ( attr.uncoveredPair.find( parents[pcnt - 1] * attr.seCnt + visit[0] ) != attr.uncoveredPair.end() )
						belowMin = true ;
				}
				else
				{
					if ( attr.uncoveredPair.find( visit[i] * attr.seCnt + visit[i + 1] ) != attr.uncoveredPair.end() )
						belowMin = true ;
				}
			}

			//if ( belowMin == true )
			//	printf( "turned belowMin. %d. %d %d: %d %d %d\n", attr.uncoveredPair.size(), pcnt, vcnt, parents[0], visit[0], visit[ vcnt - 1] ) ;

			if ( attr.forAbundance && belowMin )
				cover = 1e-6 ;
		}

		for ( i = tcStartInd ; i < size ; ++i )
		{
			// Note that the upper bound here is visit[ vcnt - 1 ] rather than parents[ pcnt - 1 ],
			// because in the branch above the "visit" part is handled by SolveSubTranscript( visit ).
			if ( tc[i].first > visit[ vcnt - 1 ] )
				break ;
			if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 1 )
			{
				if ( tc[i].normAbund <= attr.minAbundance )
				{
					belowMin = true ;
					cover = -2 ;
					break ;
				}

				if ( tc[i].abundance <= 0 )
					continue ;
				if ( attr.forAbundance )
				{
					if ( tc[i].normAbund < cover || cover == 0 )
						cover = tc[i].normAbund ;
				}
				else
				{
					++cover ;
				}
			}
		}

		if ( belowMin && pdp.cover == -1 )
		{
			pdp.cover = -2 ;
			pdp.seVector.Assign( subTxpt.seVector ) ;
			pdp.first = subTxpt.first ;
			pdp.last = subTxpt.last ;
			pdp.strand = strand ;
		}
		else if ( cover > pdp.cover )
		{
			pdp.cover = cover ;
			pdp.seVector.Assign( subTxpt.seVector ) ;
			pdp.first = subTxpt.first ;
			pdp.last = subTxpt.last ;
			pdp.strand = strand ;
		}
	}
	//printf( "%s %d: pdp.cover=%lf\n", __func__, tag, pdp.cover ) ;

	// keep searching.
	if ( keepSearch )
	{
		for ( i = 0 ; i < subexons[tag].nextCnt ; ++i )
		{
			int b = subexons[tag].next[i] ;
			if ( ( SubexonGraph::IsSameStrand( subexons[tag].rightStrand, strand )
				&& SubexonGraph::IsSameStrand( subexons[b].leftStrand, strand ) ) ||
				subexons[b].start == subexons[tag].end + 1 )
			{
				int backupStrand = strand ;
				if ( subexons[b].start > subexons[tag].end + 1 )
					strand = subexons[tag].rightStrand ;

				SearchSubTranscript( subexons[tag].next[i], strand, parents, pcnt, pdp, visit, vcnt,
					extends, extendCnt, tc, tcStartInd, attr ) ;
				strand = backupStrand ;
			}
		}

	}

	return ;
}

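// Memoized DP: find the best completion of a transcript whose most recent
// subexons are given in visit[0..vcnt-1]. States with one or two head subexons
// are cached in attr.f1 / attr.f2; longer heads go through the attr.hash table,
// keyed by a rolling hash of the head and validated against the stored subexon set.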
struct _dp TranscriptDecider::SolveSubTranscript( int visit[], int vcnt, int strand, std::vector<struct _constraint> &tc, int tcStartInd, struct _dpAttribute &attr )
{
	int i ;
	int size ;
	/*printf( "%s: ", __func__ ) ;
	for ( i = 0 ; i < vcnt ; ++i )
		printf( "%d ", visit[i] ) ;
	printf( ": %lf %d %d", attr.f1[ visit[0] ].cover, attr.f1[ visit[0] ].timeStamp, attr.timeStamp ) ;
	printf( "\n" ) ;*/
	// Test whether it is already stored in the dp structure.
	if ( vcnt == 1 )
	{
		if ( attr.f1[ visit[0] ].cover != -1 && attr.f1[ visit[0] ].strand == strand && ( attr.f1[ visit[0] ].timeStamp == attr.timeStamp ||
			( attr.f1[ visit[0] ].minAbundance < attr.minAbundance && attr.f1[visit[0]].cover == -2 ) ) ) // even given a lower minAbundance threshold, it fails
		{
			return attr.f1[ visit[0] ] ;
		}
	}
	else if ( vcnt == 2 && attr.f2 )
	{
		int a = visit[0] ;
		int b = visit[1] ;

		if ( attr.f2[a][b].cover != -1 && attr.f2[a][b].strand == strand && ( attr.f2[a][b].timeStamp == attr.timeStamp ||
			( attr.f2[a][b].minAbundance < attr.minAbundance && attr.f2[a][b].cover == -2 ) ) )
		{
			return attr.f2[a][b] ;
		}
	}
	else
	{
		int key = 0 ;
		for ( i = 0 ; i < vcnt ; ++i )
			key = ( key * attr.seCnt + visit[i] ) % hashMax ;
		if ( key < 0 )
			key += hashMax ;

		if ( attr.hash[key].cover != -1 && attr.hash[key].cnt == vcnt && attr.hash[key].strand == strand &&
			( attr.hash[key].first == visit[0] ) &&
			( attr.hash[key].timeStamp == attr.timeStamp ||
				( attr.hash[key].minAbundance < attr.minAbundance && attr.hash[key].cover == -2 ) ) )
		{
			struct _transcript subTxpt = attr.bufferTxpt ;
			subTxpt.seVector.Reset() ;
			for ( i = 0 ; i < vcnt ; ++i )
				subTxpt.seVector.Set( visit[i] ) ;
			//subTxpt.seVector.Print() ;
			//attr.hash[key].seVector.Print() ;
			subTxpt.seVector.Xor( attr.hash[key].seVector ) ;
			subTxpt.seVector.MaskRegionOutside( visit[0], visit[ vcnt - 1] ) ;
			//printf( "hash test: %d %d\n", key, subTxpt.seVector.IsAllZero() ) ;
			if ( subTxpt.seVector.IsAllZero() )
			{
				return attr.hash[key] ;
			}

			// Can't use the code below, because vcnt only covers the head subexons of the stored solution.
			/*for ( i = 0 ; i < vcnt ; ++i )
				if ( !attr.hash[key].seVector.Test( visit[i] ) )
					break ;
			if ( i >= vcnt )
				return attr.hash[key] ;*/

		}
	}
	// adjust tcStartInd
	size = tc.size() ;
	for ( i = tcStartInd ; i < size ; ++i )
		if ( tc[i].first >= visit[0] )
			break ;
	tcStartInd = i ;


	struct _subexon *subexons = attr.subexons ;
	struct _dp visitdp ;
	visitdp.seVector.Init( attr.seCnt ) ;
	visitdp.cover = -1 ;

	struct _transcript &subTxpt = attr.bufferTxpt ;
	// This happens when it is called from PickTranscriptsByDP; the first subexon might be the end.
	subTxpt.seVector.Reset() ;
	for ( i = 0 ; i < vcnt ; ++i )
		subTxpt.seVector.Set( visit[i] ) ;
	subTxpt.first = visit[0] ;
	subTxpt.last = visit[vcnt - 1] ;

	if ( subexons[ visit[vcnt - 1] ].canBeEnd )
	{
		subTxpt.partial = false ;
		double cover = 0 ;
		for ( i = tcStartInd ; i < size ; ++i )
		{
			if ( tc[i].first > subTxpt.last )
				break ;

			if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 1 )
			{
				if ( tc[i].normAbund <= attr.minAbundance )
				{
					cover = -2 ;
					break ;
				}

				if ( tc[i].abundance <= 0 )
					continue ;
				if ( attr.forAbundance )
				{
					if ( tc[i].normAbund < cover || cover == 0 )
						cover = tc[i].normAbund ;
				}
				else
					++cover ;
			}
		}

		visitdp.seVector.Assign( subTxpt.seVector ) ;
		visitdp.cover = cover ;
		visitdp.first = subTxpt.first ;
		visitdp.last = subTxpt.last ;
		visitdp.strand = strand ;
	}

	// Now we extend.
	size = tc.size() ;
	int *extends = new int[tc.size() - tcStartInd + 1] ;
	int extendCnt = 0 ;
	subTxpt.partial = true ;
	for ( i = tcStartInd ; i < size ; ++i )
	{
		if ( tc[i].first > subTxpt.last )
			break ;
		if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 2 )
		{
			extends[extendCnt] = i ;
			++extendCnt ;
		}
	}

	// Sort the extends by the index of their last subexon.
	if ( extendCnt > 0 )
	{
		struct _pair32 *extendsPairs = new struct _pair32[extendCnt] ;

		for ( i = 0 ; i < extendCnt ; ++i )
		{
			extendsPairs[i].a = extends[i] ;
			extendsPairs[i].b = tc[ extends[i] ].last ;
		}
		qsort( extendsPairs, extendCnt, sizeof( struct _pair32 ), CompPairsByB ) ;

		for ( i = 0 ; i < extendCnt ; ++i )
			extends[i] = extendsPairs[i].a ;

		delete[] extendsPairs ;
	}

	size = subexons[ visit[vcnt - 1] ].nextCnt ;
	int nextvCnt = 1 ;
	if ( extendCnt > 0 && tc[ extends[ extendCnt - 1 ] ].last - visit[ vcnt - 1 ] > 1 )
		nextvCnt = tc[ extends[ extendCnt - 1 ] ].last - visit[ vcnt - 1 ] ;
	int *nextv = new int[ nextvCnt ] ;
	for ( i = 0 ; i < size ; ++i )
	{
		int a = visit[vcnt - 1] ;
		int b = subexons[a].next[i] ;
		if ( ( SubexonGraph::IsSameStrand( subexons[a].rightStrand, strand )
			&& SubexonGraph::IsSameStrand( subexons[b].leftStrand, strand ) )
			||
			subexons[b].start == subexons[a].end + 1 )
		{
			int backupStrand = strand ;
			if ( subexons[b].start > subexons[a].end + 1 )
				strand = subexons[a].rightStrand ;
			SearchSubTranscript( subexons[ visit[vcnt - 1] ].next[i], strand, visit, vcnt, visitdp, nextv, 0, extends, extendCnt, tc, tcStartInd, attr ) ;
			strand = backupStrand ;

		}
	}
	//printf( "%s %d(%d) %d %d %d: %lf\n", __func__, visit[0], subexons[ visit[vcnt - 1] ].canBeEnd, size, extendCnt, strand, visitdp.cover ) ;
	delete[] nextv ;
	delete[] extends ;

	// Store the result in the dp structure.
	// We return the structure stored in dp to simplify the memory access pattern.
	// In other words, we assume the structure returned from this function always uses the memory from attr.dp.
	if ( vcnt == 1 )
	{
		SetDpContent( attr.f1[ visit[0] ], visitdp, attr ) ;
		visitdp.seVector.Release() ;
		return attr.f1[ visit[0] ] ;
	}
	else if ( vcnt == 2 && attr.f2 )
	{
		SetDpContent( attr.f2[ visit[0] ][ visit[1] ], visitdp, attr ) ;
		visitdp.seVector.Release() ;
		return attr.f2[ visit[0] ][ visit[1] ] ;
	}
	else
	{
		int key = 0 ;
		for ( i = 0 ; i < vcnt ; ++i )
			key = ( key * attr.seCnt + visit[i] ) % hashMax ;
		if ( key < 0 )
			key += hashMax ;

		//static int hashUsed = 0 ;
		//if ( attr.hash[key].cover == -1 )
		//	++hashUsed ;
		//printf( "%d/%d\n", hashUsed, HASH_MAX) ;
		//printf( "hash write: %d\n", key ) ;
		SetDpContent( attr.hash[key], visitdp, attr ) ;
		attr.hash[key].cnt = vcnt ;
		visitdp.seVector.Release() ;
		return attr.hash[key] ;
	}
}


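// Iterative transcript picking by dynamic programming ("quantitative set
// cover"): repeatedly find the candidate transcript that covers the most
// remaining constraints under a moving minAbundance filter, score it against
// the most abundant transcript, subtract the abundance it explains from its
// covered constraints, and emit it, until no candidate remains or iterBound
// iterations have been spent.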
void TranscriptDecider::PickTranscriptsByDP( struct _subexon *subexons, int seCnt, int iterBound, Constraints &constraints, SubexonCorrelation &correlation, struct _dpAttribute &attr, std::vector<struct _transcript> &alltranscripts )
{
	int i, j, k ;

	std::vector<struct _transcript> transcripts ;
	std::vector<struct _constraint> &tc = constraints.constraints ;
	int tcCnt = tc.size() ;
	int coalesceThreshold = 1024 ;

	//printf( "tcCnt=%d\n", tcCnt ) ;

	attr.timeStamp = 1 ;
	attr.bufferTxpt.seVector.Init( seCnt ) ;
	attr.subexons = subexons ;
	attr.seCnt = seCnt ;

	double maxAbundance = -1 ;
	// Initialize the dp data structure
	/*memset( attr.f1, -1, sizeof( struct _dp ) * seCnt ) ;
	for ( i = 0 ; i < seCnt ; ++i )
		memset( attr.f2[i], -1, sizeof( struct _dp ) * seCnt ) ;
	memset( attr.hash, -1, sizeof( struct _dp ) * HASH_MAX ) ;*/
	for ( i = 0 ; i < seCnt ; ++i )
		ResetDpContent( attr.f1[i] ) ;
	for ( i = 0 ; i < seCnt && attr.f2 ; ++i )
		for ( j = i ; j < seCnt ; ++j )
			ResetDpContent( attr.f2[i][j] ) ;
	for ( i = 0 ; i < hashMax ; ++i )
		ResetDpContent( attr.hash[i] ) ;

	// Set the uncovered pairs
	attr.uncoveredPair.clear() ;
	BitTable bufferTable( seCnt ) ;
	k = 0 ;
	for ( i = 0 ; i < seCnt ; ++i )
	{
		for ( ; k < tcCnt ; ++k )
		{
			if ( tc[k].last >= i )
				break ;
		}

		if ( k >= tcCnt || tc[k].first > i )
		{
			for ( j = 0 ; j < subexons[i].nextCnt ; ++j )
			{
				attr.uncoveredPair[ i * seCnt + subexons[i].next[j] ] = 1 ;
			}
			continue ;
		}

		for ( j = 0 ; j < subexons[i].nextCnt ; ++j )
		{
			bool covered = false ;
			int l, n ;

			n = subexons[i].next[j] ;
			for ( l = k ; l < tcCnt ; ++l )
			{
				if ( tc[l].first > i )
					break ;
				if ( tc[l].vector.Test( i ) && tc[l].vector.Test( n ) )
				{
					if ( n == i + 1 )
					{
						covered = true ;
						break ;
					}
					else
					{
						bufferTable.Assign( tc[l].vector ) ;
						bufferTable.MaskRegionOutside( i + 1, n - 1 ) ;
						if ( bufferTable.IsAllZero() )
						{
							covered = true ;
							break ;
						}
					}
				}
			}

			if ( !covered )
			{
				//printf( "set!: (%d: %d %d) (%d: %d %d)\n", i, subexons[i].start, subexons[i].end, n, subexons[n].start, subexons[n].end ) ;
				attr.uncoveredPair[ i * seCnt + n ] = 1 ;
			}
		}
	}
	bufferTable.Release() ;


	// Find the max abundance
	attr.forAbundance = true ;
	attr.minAbundance = 0 ;
	for ( i = 0 ; i < seCnt ; ++i )
	{
		if ( subexons[i].canBeStart )
		{
			int visit[1] = {i} ;
			struct _dp tmp ;

			tmp = SolveSubTranscript( visit, 1, 0, tc, 0, attr ) ;

			if ( tmp.cover > maxAbundance )
				maxAbundance = tmp.cover ;
		}
	}
	//PrintLog( "maxAbundance=%lf", maxAbundance ) ;
	//exit( 1 ) ;

	// Pick the transcripts: quantitative set cover.
	// Notice that by the logic in SearchSubTranscript and SolveSubTranscript, we don't need to reinitialize the data structure.
	attr.forAbundance = false ;
	int *coveredTc = new int[tcCnt] ;
	int coveredTcCnt ;
	struct _dp maxCoverDp ;
	struct _dp bestDp ;
	std::map<double, struct _dp> cachedCoverResult ;

	maxCoverDp.seVector.Init( seCnt ) ;
	bestDp.seVector.Init( seCnt ) ;
	int iterCnt = 0 ;

	while ( 1 )
	{
		double bestScore ;

		// iteratively assign constraints
		attr.minAbundance = 0 ;

		// Find the best candidate transcript.
		bestDp.cover = -1 ;
		bestScore = -1 ;
		while ( 1 )
		{
			// iterate over the changes of minAbundance
			if ( cachedCoverResult.find( attr.minAbundance ) != cachedCoverResult.end() )
			{
				struct _dp tmp = cachedCoverResult[ attr.minAbundance ] ;
				SetDpContent( maxCoverDp, tmp, attr ) ;
			}
			else
			{
				maxCoverDp.cover = -1 ;
				++attr.timeStamp ;
				for ( i = 0 ; i < seCnt ; ++i )
				{
					if ( subexons[i].canBeStart == false )
						continue ;
					int visit[1] = {i} ;
					struct _dp tmp ;
					tmp = SolveSubTranscript( visit, 1, 0, tc, 0, attr ) ;

					if ( tmp.cover > maxCoverDp.cover && tmp.cover > 0 )
					{
						SetDpContent( maxCoverDp, tmp, attr ) ;
					}
					//if ( subexons[i].start == 6870264 || subexons[i].start == 6872237 )
					//	printf( "%d: %lf\n", i, tmp.cover ) ;
				}

				if ( maxCoverDp.cover == -1 )
					break ;
				struct _dp ccr ;
				ccr.seVector.Init( seCnt ) ;
				SetDpContent( ccr, maxCoverDp, attr ) ;
				cachedCoverResult[ attr.minAbundance ] = ccr ;
			}
			// the abundance for the max-cover transcript.
			double min = -1 ;
			struct _transcript &subTxpt = attr.bufferTxpt ;
			subTxpt.seVector.Assign( maxCoverDp.seVector ) ;
			subTxpt.first = maxCoverDp.first ;
			subTxpt.last = maxCoverDp.last ;

			for ( i = 0 ; i < tcCnt ; ++i )
			{
				if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 1 )
				{
					if ( tc[i].normAbund < min || min == -1 )
						min = tc[i].normAbund ;
				}
			}

			if ( attr.minAbundance == 0 )
			{
				std::vector<int> subexonIdx ;
				maxCoverDp.seVector.GetOnesIndices( subexonIdx ) ;
				int size = subexonIdx.size() ;
				for ( i = 0 ; i < size - 1 ; ++i )
					if ( attr.uncoveredPair.find( subexonIdx[i] * seCnt + subexonIdx[i + 1] ) != attr.uncoveredPair.end() )
					{
						min = 1e-6 ;
						break ;
					}
			}

			double score = ComputeScore( maxCoverDp.cover, 1.0, min, maxAbundance, 0 ) ;
			if ( bestScore == -1 || score > bestScore )
			{
				bestScore = score ;
				SetDpContent( bestDp, maxCoverDp, attr ) ;
			}
			else if ( score < bestScore )
			{
				if ( ComputeScore( maxCoverDp.cover, 1.0, maxAbundance, maxAbundance, 0 ) < bestScore )
					break ;
			}
			//PrintLog( "normAbund=%lf maxCoverDp.cover=%lf score=%lf timeStamp=%d", min, maxCoverDp.cover, score, attr.timeStamp ) ;
			attr.minAbundance = min ;
		} // end of iteration for minAbundance.

		if ( bestDp.cover == -1 )
			break ;
		// Assign the constraints.
		coveredTcCnt = 0 ;
		double update = -1 ;
		struct _transcript &subTxpt = attr.bufferTxpt ;
		subTxpt.seVector.Assign( bestDp.seVector ) ;
		subTxpt.first = bestDp.first ;
		subTxpt.last = bestDp.last ;
		subTxpt.partial = false ;
		for ( i = 0 ; i < tcCnt ; ++i )
		{
			if ( IsConstraintInTranscript( subTxpt, tc[i] ) == 1 )
			{
				if ( tc[i].abundance > 0 &&
					( tc[i].abundance < update || update == -1 ) )
				{
					update = tc[i].abundance ;
				}
				coveredTc[ coveredTcCnt ] = i ;
				++coveredTcCnt ;
			}
			/*else
			{
				printf( "%d: ", i ) ;
				tc[i].vector.Print() ;
				if ( i == 127 )
				{
					printf( "begin debug:\n" ) ;
					IsConstraintInTranscriptDebug( subTxpt, tc[i] ) ;
				}
			}*/
		}
		update *= ( 1 + iterCnt / 50 ) ; //* ( 1 + iterCnt / 50 ) ;

		//PrintLog( "%d: update=%lf %d %d. %d %d %d", iterCnt, update, coveredTcCnt, tcCnt,
		//	bestDp.first, bestDp.last, subexons[ bestDp.first ].start ) ;
		//bestDp.seVector.Print() ;

		struct _transcript nt ;
		nt.seVector.Duplicate( bestDp.seVector ) ;
		nt.first = bestDp.first ;
		nt.last = bestDp.last ;
		nt.partial = false ;
		nt.abundance = 0 ;
		for ( i = 0 ; i < coveredTcCnt ; ++i )
		{
			j = coveredTc[i] ;
			if ( tc[j].abundance > 0 )
			{
				double tmp = ( tc[j].abundance > update ? update : tc[j].abundance ) ;
				tc[j].abundance -= tmp ;
				double factor = 1 ;

				nt.abundance += ( tc[j].support * update / tc[j].normAbund * factor ) ;

				if ( tc[j].abundance <= 0 )
				{
					std::vector<double> removeKey ;
					for ( std::map<double, struct _dp>::iterator it = cachedCoverResult.begin() ; it != cachedCoverResult.end() ; ++it )
					{
						subTxpt.seVector.Assign( it->second.seVector ) ;
						subTxpt.first = it->second.first ;
						subTxpt.last = it->second.last ;
						subTxpt.partial = false ;
						if ( IsConstraintInTranscript( subTxpt, tc[j] ) == 1 )
						{
							it->second.seVector.Release() ;
							removeKey.push_back( it->first ) ;
						}
					}
					int size = removeKey.size() ;
					int l ;
					for ( l = 0 ; l < size ; ++l )
						cachedCoverResult.erase( removeKey[l] ) ;
				}
			}

			if ( tc[j].abundance < 0 )
				tc[j].abundance = 0 ;
		}

		transcripts.push_back( nt ) ;
		if ( transcripts.size() >= transcripts.capacity() && (int)transcripts.size() >= coalesceThreshold )
		{
			CoalesceSameTranscripts( transcripts ) ;
			if ( transcripts.size() >= transcripts.capacity() / 2 )
				coalesceThreshold *= 2 ;
		}
		++iterCnt ;

		if ( iterCnt >= iterBound )
			break ;
	}
	CoalesceSameTranscripts( transcripts ) ;
	int size = transcripts.size() ;
	// Compute the correlation score
	for ( i = 0 ; i < size ; ++i )
	{
		std::vector<int> subexonInd ;
		transcripts[i].seVector.GetOnesIndices( subexonInd ) ;
		double cor = 2.0 ;
		int s = subexonInd.size() ;
		for ( j = 0 ; j < s ; ++j )
			for ( k = j + 1 ; k < s ; ++k )
			{
				double tmp = correlation.Query( subexonInd[j], subexonInd[k] ) ;
				if ( tmp < cor )
					cor = tmp ;
			}
		if ( cor > 1 )
			cor = 0 ;
		transcripts[i].correlationScore = cor ;
	}

	// store the result
	for ( i = 0 ; i < size ; ++i )
		alltranscripts.push_back( transcripts[i] ) ;

	// Release the memory
	for ( std::map<double, struct _dp>::iterator it = cachedCoverResult.begin() ; it != cachedCoverResult.end() ; ++it )
	{
		it->second.seVector.Release() ;
	}
	attr.bufferTxpt.seVector.Release() ;

	delete[] coveredTc ;
	maxCoverDp.seVector.Release() ;
	bestDp.seVector.Release() ;
}


// Add the prefix/suffix transcripts of the given transcripts to the list.
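// For every existing transcript this generates: (1) suffixes that begin at an
// internal subexon marked canBeStart, (2) prefixes that end at an internal
// subexon marked canBeEnd, and, when "extend" is set, (3) alternative endings
// reached by walking unambiguous next/prev chains out of branching subexons.
// It stops early once "limit" candidates exist and coalesces duplicates at the end.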
void TranscriptDecider::AugmentTranscripts( struct _subexon *subexons, std::vector<struct _transcript> &alltranscripts, int limit, bool extend )
{
	int i, j, k ;
	int size = alltranscripts.size() ;
	if ( size >= limit )
		return ;

	// Augment suffix, prefix transcripts
	for ( i = 0 ; i < size ; ++i )
	{
		std::vector<int> subexonIdx ;
		alltranscripts[i].seVector.GetOnesIndices( subexonIdx ) ;
		int seIdxCnt = subexonIdx.size() ;
		// suffix
		for ( j = 1 ; j < seIdxCnt ; ++j )
		{
			if ( subexons[ subexonIdx[j] ].canBeStart )
			{
				struct _transcript nt ;
				nt.first = subexonIdx[j] ;
				nt.last = alltranscripts[i].last ;
				nt.seVector.Duplicate( alltranscripts[i].seVector ) ;
				nt.seVector.MaskRegionOutside( nt.first, nt.last ) ;
				nt.partial = false ;
				nt.correlationScore = 0 ;
				nt.abundance = 0 ;
				nt.constraintsSupport = NULL ;

				alltranscripts.push_back( nt ) ;
				if ( alltranscripts.size() >= limit )
					return ;
			}
		}

		// prefix
		for ( j = 0 ; j < seIdxCnt - 1 ; ++j )
		{
			if ( subexons[ subexonIdx[j] ].canBeEnd )
			{
				struct _transcript nt ;
				nt.first = alltranscripts[i].first ;
				nt.last = subexonIdx[j] ;
				nt.seVector.Duplicate( alltranscripts[i].seVector ) ;
				nt.seVector.MaskRegionOutside( nt.first, nt.last ) ;
				nt.partial = false ;
				nt.correlationScore = 0 ;
				nt.abundance = 0 ;
				nt.constraintsSupport = NULL ;

				alltranscripts.push_back( nt ) ;
				if ( alltranscripts.size() >= limit )
					return ;
			}
		}

		if ( extend )
		{
			// Extension towards the right.
			for ( j = 0 ; j < seIdxCnt ; ++j )
			{
				if ( subexons[ subexonIdx[j] ].nextCnt > 1 )
				{
					for ( k = 0 ; k < subexons[ subexonIdx[j] ].nextCnt ; ++k )
					{
						int idx = subexons[ subexonIdx[j] ].next[k] ;

						if ( alltranscripts[i].seVector.Test( idx ) )
							continue ;
						int l ;
						std::vector<int> visited ;
						while ( 1 )
						{
							if ( subexons[idx].nextCnt > 1 || subexons[idx].prevCnt > 1 )
							{
								break ;
							}

							visited.push_back( idx ) ;
							if ( subexons[idx].canBeEnd && subexons[idx].nextCnt == 0 )
							{
								struct _transcript nt ;
								nt.first = alltranscripts[i].first ;
								nt.last = idx ;
								nt.seVector.Duplicate( alltranscripts[i].seVector ) ;
								nt.seVector.MaskRegionOutside( nt.first, subexonIdx[j] ) ;
								int visitedSize = visited.size() ;
								for ( l = 0 ; l < visitedSize ; ++l )
									nt.seVector.Set( visited[l] ) ;
								nt.partial = false ;
								nt.correlationScore = 0 ;
								nt.abundance = 0 ;
								nt.constraintsSupport = NULL ;

								alltranscripts.push_back( nt ) ;
								if ( alltranscripts.size() >= limit )
									return ;
							}

							if ( subexons[idx].nextCnt == 1 )
								idx = subexons[idx].next[0] ;
							else
								break ;
						}
					}
				}
			}

			// Extension towards the left.
			for ( j = 0 ; j < seIdxCnt ; ++j )
			{
				if ( subexons[ subexonIdx[j] ].prevCnt > 1 )
				{
					for ( k = 0 ; k < subexons[ subexonIdx[j] ].prevCnt ; ++k )
					{
						int idx = subexons[ subexonIdx[j] ].prev[k] ;

						if ( alltranscripts[i].seVector.Test( idx ) )
							continue ;
						int l ;
						std::vector<int> visited ;
						while ( 1 )
						{
							if ( subexons[idx].nextCnt > 1 || subexons[idx].prevCnt > 1 )
							{
								break ;
							}

							visited.push_back( idx ) ;
							if ( subexons[idx].canBeStart && subexons[idx].prevCnt == 0 )
							{
								struct _transcript nt ;
								nt.first = idx ;
								nt.last = alltranscripts[i].last ;
								nt.seVector.Duplicate( alltranscripts[i].seVector ) ;
								nt.seVector.MaskRegionOutside( subexonIdx[j], nt.last ) ;
								int visitedSize = visited.size() ;
								for ( l = 0 ; l < visitedSize ; ++l )
									nt.seVector.Set( visited[l] ) ;
								nt.partial = false ;
								nt.correlationScore = 0 ;
								nt.abundance = 0 ;
								nt.constraintsSupport = NULL ;

								alltranscripts.push_back( nt ) ;
								if ( alltranscripts.size() >= limit )
									return ;
							}

							if ( subexons[idx].prevCnt == 1 )
								idx = subexons[idx].prev[0] ;
							else
								break ;
						}
					}
				}
			}
		} // end of if ( extend )
	}

	CoalesceSameTranscripts( alltranscripts ) ;
}

// Pick the transcripts from the given candidate transcripts.
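// Greedy selection over an explicit candidate list: btable[i] marks which
// mate-pair constraints candidate i is compatible with, and a rough abundance
// estimate (transcriptAbundance[i]) is taken from the weakest compatible
// constraint, penalized when "together" subexon chains are broken or when an
// adjacent subexon pair of the candidate is not covered by any constraint.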
1368 void TranscriptDecider::PickTranscripts( struct _subexon *subexons, std::vector<struct _transcript> &alltranscripts, Constraints &constraints,
|
|
1369 SubexonCorrelation &seCorrelation, std::vector<struct _transcript> &transcripts )
|
|
1370 {
|
|
1371 int i, j, k ;
|
|
1372 std::vector<int> chosen ;
|
|
1373 std::vector<struct _matePairConstraint> &tc = constraints.matePairs ;
|
|
1374 int atcnt = alltranscripts.size() ;
|
|
1375 int tcCnt = tc.size() ; // transcript constraints
|
|
1376 int seCnt = 0 ;
|
|
1377
|
|
1378 if ( tcCnt == 0 )
|
|
1379 return ;
|
|
1380 if ( atcnt > 0 )
|
|
1381 seCnt = alltranscripts[0].seVector.GetSize() ;
|
|
1382 else
|
|
1383 return ;
|
|
1384
|
|
1385 double inf = -1 ; // infinity
|
|
1386 int coalesceThreshold = 1024 ;
|
|
1387 int *transcriptSeCnt = new int[ atcnt ] ;
|
|
1388 int *transcriptLength = new int[atcnt] ;
|
|
1389 double *transcriptAbundance = new double[atcnt] ; // the roughly estimated abundance based on constraints.
|
|
1390 double *avgTranscriptAbundance = new double[atcnt] ; // the average normAbund from the compatible constraints.
|
|
1391
|
|
1392 BitTable *btable = new BitTable[ atcnt ] ;
|
|
1393 //BitTable lowCovSubexon ; // force the abundance to 0 for the transcript contains the subexon.
|
|
1394 double *coveredPortion = new double[atcnt] ;
|
|
1395
|
|
1396 memset( avgTranscriptAbundance, 0 ,sizeof( double ) * atcnt ) ;
|
|
1397 for ( i = 0 ; i < atcnt ; ++i )
|
|
1398 btable[i].Init( tcCnt ) ;
|
|
1399 for ( j = 0 ; j < tcCnt ; ++j )
|
|
1400 {
|
|
1401 int a = constraints.matePairs[j].i ;
|
|
1402 int b = constraints.matePairs[j].j ;
|
|
1403
|
|
1404 if ( constraints.constraints[a].support > inf )
|
|
1405 inf = constraints.constraints[a].support ;
|
|
1406 if ( constraints.constraints[b].support > inf )
|
|
1407 inf = constraints.constraints[b].support ;
|
|
1408
|
|
1409 if ( tc[j].normAbund > inf )
|
|
1410 inf = tc[j].normAbund ;
|
|
1411
|
|
1412 tc[j].abundance = tc[j].normAbund ;
|
|
1413 }
|
|
1414 ++inf ;
|
|
1415 bool btableSet = false ;
|
|
1416 for ( i = 0 ; i < atcnt ; ++i )
|
|
1417 {
|
|
1418 //printf( "correlation %d: %lf\n", i, alltranscripts[i].correlationScore ) ;
|
|
1419 /*for ( int l = 0 ; l < subexonInd.size() ; ++l )
|
|
1420 {
|
|
1421 for ( int m = l ; m < subexonInd.size() ; ++m )
|
|
1422 printf( "%lf ", seCorrelation.Query( l, m ) ) ;
|
|
1423 printf( "\n" ) ;
|
|
1424 }*/
|
|
1425
|
|
1426 for ( j = 0 ; j < tcCnt ; ++j )
|
|
1427 {
|
|
1428 int a = tc[j].i ;
|
|
1429 int b = tc[j].j ;
|
|
1430
|
|
1431 //printf( "try set btble[ %d ].Set( %d ): %d %d\n", i, j, a, b ) ;
|
|
1432 //alltranscripts[i].seVector.Print() ;
|
|
1433 //constraints.constraints[a].vector.Print() ;
|
|
1434 //constraints.constraints[b].vector.Print() ;
|
|
1435 if ( IsConstraintInTranscript( alltranscripts[i], constraints.constraints[a] ) == 1
|
|
1436 && IsConstraintInTranscript( alltranscripts[i], constraints.constraints[b] ) == 1 )
|
|
1437 {
|
|
1438 //printf( "set btble[ %d ].Set( %d ): %d %d\n", i, j, a, b ) ;
|
|
1439 btable[i].Set( j ) ;
|
|
1440 btableSet = true ;
|
|
1441 }
|
|
1442 }
|
|
1443 transcriptSeCnt[i] = alltranscripts[i].seVector.Count() ;
|
|
1444 }
|
|
1445 if ( btableSet == false )
|
|
1446 {
|
|
1447 for ( i = 0 ; i < atcnt ; ++i )
|
|
1448 btable[i].Release() ;
|
|
1449 delete[] btable ;
|
|
1450 return ;
|
|
1451 }
|
|
1452
|
|
1453 double maxAbundance = -1 ; // The abundance of the most-abundant transcript
|
|
1454 double *adjustScore = new double[atcnt] ;
|
|
1455 memset( adjustScore, 0, sizeof( double ) * atcnt ) ;
|
|
1456 if ( atcnt > 0 /*&& alltranscripts[0].abundance == -1*/ )
|
|
1457 {
|
|
1458 struct _pair32 *chain = new struct _pair32[seCnt] ;
|
|
1459 bool *covered = new bool[seCnt] ;
|
|
1460 bool *usedConstraints = new bool[constraints.constraints.size() ] ;
|
|
1461 std::vector<BitTable> togetherChain ; // those subexons is more likely to show up in the same transcript, like an IR with overhang, should be together to represent a 3'/5'-end
|
|
1462
|
|
1463 /*lowCovSubexon.Init( seCnt ) ;
|
|
1464 double *avgDepth = new double[seCnt ] ;
|
|
1465
|
|
1466 memset( avgDepth, 0, sizeof( double ) * seCnt ) ;
|
|
1467 int size = constraints.constraints.size() ;
|
|
1468 for ( i = 0 ; i < size ; ++i )
|
|
1469 {
|
|
1470 std::vector<int> subexonIdx ;
|
|
1471 constraints.constraints[i].GetOnesIndices( subexonIdx ) ;
|
|
1472 int seIdxCnt = subexonidx.size() ;
|
|
1473 for ( j = 0 ; j < seIdxCnt ; ++j )
|
|
1474 avgDepth[ subexonidx[j] ] += constraints.constraints[i].support ;
|
|
1475 }
|
|
1476 for ( i = 0 ; i < seCnt ; ++i )
|
|
1477 {
|
|
1478 if ( avgDepth[i] * alignments.readLen / (double)( subexons[i].end - subexons[i].start + 1 ) < 1 )
|
|
1479 }*/
|
|
1480
|
|
1481 struct _pair32 firstRegion, lastRegion ;
|
|
1482
|
|
1483 for ( i = 0 ; i < seCnt ; )
|
|
1484 {
|
|
1485 for ( j = i + 1 ; j < seCnt ; ++j )
|
|
1486 {
|
|
1487 if ( subexons[j].start > subexons[j - 1].end + 1 )
|
|
1488 break ;
|
|
1489 }
|
|
1490
|
|
1491
|
|
1492 int cnt = 0 ;
|
|
1493 for ( k = i ; k < j ; ++k )
|
|
1494 {
|
|
1495 if ( ( subexons[k].leftType == 2 && subexons[k].rightType == 1 )
|
|
1496 || ( subexons[k].leftType == 0 && subexons[k].rightType == 1 )
|
|
1497 || ( subexons[k].leftType == 2 && subexons[k].rightType == 0 ) )
|
|
1498 ++cnt ;
|
|
1499 }
|
|
1500
|
|
1501 if ( cnt <= 1 )
|
|
1502 {
|
|
1503 i = j ;
|
|
1504 continue ;
|
|
1505 }
|
|
1506
|
|
1507 BitTable tmpTable( seCnt ) ;
|
|
1508 for ( k = i ; k < j ; ++k )
|
|
1509 {
|
|
1510 if ( ( subexons[k].leftType == 2 && subexons[k].rightType == 1 )
|
|
1511 || ( subexons[k].leftType == 0 && subexons[k].rightType == 1 )
|
|
1512 || ( subexons[k].leftType == 2 && subexons[k].rightType == 0 ) )
|
|
1513 tmpTable.Set( k ) ;
|
|
1514 }
|
|
1515 togetherChain.push_back( tmpTable ) ;
|
|
1516 i = j ;
|
|
1517 }
|
|
1518
|
|
1519 for ( i = 0 ; i < atcnt ; ++i )
|
|
1520 {
|
|
1521 double value = inf ;
|
|
1522 int tag = -1 ;
|
|
1523
|
|
1524 alltranscripts[i].abundance = 0 ;
|
|
1525 alltranscripts[i].constraintsSupport = new double[tcCnt] ;
|
|
1526
|
|
1527 std::vector<int> subexonIdx ;
|
|
1528 alltranscripts[i].seVector.GetOnesIndices( subexonIdx ) ;
|
|
1529 int seIdxCnt = subexonIdx.size() ;
|
|
1530 transcriptLength[i] = 0 ;
|
|
1531
|
|
1532 firstRegion.a = subexonIdx[0] ;
|
|
1533 for ( j = 1 ; j < seIdxCnt ; ++j )
|
|
1534 {
|
|
1535 if ( subexons[ subexonIdx[j] ].start > subexons[ subexonIdx[j - 1] ].end + 1 )
|
|
1536 break ;
|
|
1537 }
|
|
1538 firstRegion.b = subexonIdx[j - 1] ;
|
|
1539 lastRegion.b = subexonIdx[ seIdxCnt - 1 ] ;
|
|
1540 for ( j = seIdxCnt - 2 ; j >= 0 ; --j )
|
|
1541 {
|
|
1542 if ( subexons[ subexonIdx[j] ].end < subexons[ subexonIdx[j + 1] ].start - 1 )
|
|
1543 break ;
|
|
1544 }
|
|
1545 lastRegion.a = subexonIdx[j + 1] ;
|
|
1546
|
|
1547 for ( j = 0 ; j < seIdxCnt ; ++j )
|
|
1548 transcriptLength[i] += subexons[ subexonIdx[j] ].end - subexons[ subexonIdx[j] ].start + 1 ;
|
|
1549
|
|
1550 //for ( j = firstRegion.b ; j < lastRegion.a ; ++j )
|
|
1551 for ( j = 0 ; j < seIdxCnt - 1 ; ++j )
|
|
1552 {
|
|
1553 chain[j].a = subexonIdx[j] ;
|
|
1554 chain[j].b = subexonIdx[j + 1] ;
|
|
1555 covered[j] = false ;
|
|
1556 }
|
|
1557 memset( usedConstraints, false, sizeof( bool ) * constraints.constraints.size() ) ;
|
|
1558 int compatibleCnt = 0 ;
|
|
1559 for ( j = 0 ; j < tcCnt ; ++j )
|
|
1560 {
|
|
1561 alltranscripts[i].constraintsSupport[j] = 0 ;
|
|
1562 if ( btable[i].Test(j) && tc[j].abundance > 0 )
|
|
1563 {
|
|
1564 ++compatibleCnt ;
|
|
1565 double adjustAbundance = tc[j].abundance ;
|
|
1566 if ( seIdxCnt > 1 )
|
|
1567 {
|
|
1568 if ( tc[j].i == tc[j].j
|
|
1569 && ( constraints.constraints[ tc[j].i ].first +
|
|
1570 constraints.constraints[ tc[j].i ].last == 2 * alltranscripts[i].first
|
|
1571 || constraints.constraints[ tc[j].i ].first +
|
|
1572 constraints.constraints[ tc[j].i ].last == 2 * alltranscripts[i].last ) )
|
|
1573 {
|
|
1574 adjustAbundance = inf ;
|
|
1575 }
|
|
1576 else if ( tc[j].i != tc[j].j
|
|
1577 && ( constraints.constraints[ tc[j].i ].first +
|
|
1578 constraints.constraints[ tc[j].i ].last == 2 * alltranscripts[i].first
|
|
1579 || constraints.constraints[ tc[j].i ].first +
|
|
1580 constraints.constraints[ tc[j].i ].last == 2 * alltranscripts[i].last ) )
|
|
1581 {
|
|
1582 adjustAbundance = constraints.constraints[ tc[j].j ].normAbund ;
|
|
1583 }
|
|
1584 else if ( tc[j].i != tc[j].j
|
|
1585 && ( constraints.constraints[ tc[j].j ].first +
|
|
1586 constraints.constraints[ tc[j].j ].last == 2 * alltranscripts[i].first
|
|
1587 || constraints.constraints[ tc[j].j ].first +
|
|
1588 constraints.constraints[ tc[j].j ].last == 2 * alltranscripts[i].last ) )
|
|
1589 {
|
|
1590 adjustAbundance = constraints.constraints[ tc[j].i ].normAbund ;
|
|
1591 }
|
|
1592 }
|
|
1593 if ( adjustAbundance < value )
|
|
1594 /*!( seIdxCnt > 1
|
|
1595 && ( ( ( constraints.constraints[ tc[j].i ].first >= firstRegion.a && constraints.constraints[ tc[j].i ].last <= firstRegion.b )
|
|
1596 && ( constraints.constraints[ tc[j].j ].first >= firstRegion.a && constraints.constraints[ tc[j].j ].last <= firstRegion.b ) )
|
|
1597 || ( ( constraints.constraints[ tc[j].i ].first >= lastRegion.a && constraints.constraints[ tc[j].i ].last <= lastRegion.b )
|
|
1598 && ( constraints.constraints[ tc[j].j ].first >= lastRegion.a && constraints.constraints[ tc[j].j ].last <= lastRegion.b ) ) ) )
|
|
1599 )*/
|
|
1600 {
|
|
1601 // Not use the constraints totally within the 3'/5'-end in the transcript
|
|
1602 value = adjustAbundance ;
|
|
1603 tag = j ;
|
|
1604 }
|
|
1605 avgTranscriptAbundance[i] += tc[j].abundance ;
|
|
1606
|
|
1607 if ( !usedConstraints[ tc[j].i ] )
|
|
1608 {
|
|
1609 struct _constraint &c = constraints.constraints[ tc[j].i ] ;
|
|
1610 for ( k = 0 ; k < seIdxCnt - 1 ; ++k )
|
|
1611 {
|
|
1612 // Note that since the constraint is already compatible with the txpt,
|
|
1613 // chain[k].a/b must be also adjacent in this constraint.
|
|
1614 if ( c.vector.Test( chain[k].a ) && c.vector.Test( chain[k].b ) )
|
|
1615 covered[k] = true ;
|
|
1616 }
|
|
1617 usedConstraints[ tc[j].i ] = true ;
|
|
1618 }
|
|
1619
|
|
1620 if ( !usedConstraints[ tc[j].j ] )
|
|
1621 {
|
|
1622 struct _constraint &c = constraints.constraints[ tc[j].j ] ;
|
|
1623 for ( k = 0 ; k < seIdxCnt - 1 ; ++k )
|
|
1624 {
|
|
1625 if ( c.vector.Test( chain[k].a ) && c.vector.Test( chain[k].b ) )
|
|
1626 covered[k] = true ;
|
|
1627 }
|
|
1628 usedConstraints[ tc[j].j ] = true ;
|
|
1629 }
|
|
1630 }
|
|
1631 }
|
|
1632
|
|
1633 // Apply a penalty if subexons that should appear together did not show up together
|
|
1634 int size = togetherChain.size() ;
|
|
1635 if ( size > 0 )
|
|
1636 {
|
|
1637 BitTable bufferTable( seCnt ) ;
|
|
1638 for ( j = 0 ; j < size ; ++j )
|
|
1639 {
|
|
1640 bufferTable.Assign( togetherChain[j] ) ;
|
|
1641 bufferTable.And( alltranscripts[i].seVector ) ;
|
|
1642 //if ( !bufferTable.IsAllZero() && !bufferTable.IsEqual( togetherChain[j] ) )
|
|
1643 // value /= 2 ;
|
|
1644
|
|
1645 if ( !bufferTable.IsAllZero() )
|
|
1646 {
|
|
1647 if ( bufferTable.IsEqual( togetherChain[j] ) )
|
|
1648 //printf( "nice together!\n" ) ;
|
|
1649 ;
|
|
1650 else
|
|
1651 value /= 2 ;
|
|
1652 //printf( "bad together!\n" ) ;
|
|
1653 }
|
|
1654 }
|
|
1655 bufferTable.Release() ;
|
|
1656 }
|
|
1657
|
|
1658
|
|
1659 // Every two-subexon chain should be covered by some reads if a transcript is expressed highly enough
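// If any two-subexon chain of the transcript is not covered by a compatible
// constraint, the abundance estimate is zeroed; coveredPortion records the
// fraction of chains that are covered.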
|
|
1660 int cnt = 0 ;
|
|
1661 for ( j = 0 ; j < seIdxCnt - 1 ; ++j )
|
|
1662 if ( covered[j] == false ) // && j >= firstRegion.b && j <= lastRegion.a - 1 )
|
|
1663 {
|
|
1664 value = 0 ;
|
|
1665 }
|
|
1666 else
|
|
1667 ++cnt ;
|
|
1668 if ( seIdxCnt > 1 )
|
|
1669 coveredPortion[i] = (double)cnt / (double)( seIdxCnt - 1 ) ;
|
|
1670 else
|
|
1671 coveredPortion[i] = 1 ;
|
|
1672 if ( coveredPortion[i] == 0 )
|
|
1673 coveredPortion[i] = (double)0.5 / ( seIdxCnt ) ;
|
|
1674
|
|
1675 // For a short subexon (readLength - subexon_length - 1 > 30), we further require a constraint covering three consecutive subexons
|
|
1676 /*memset( usedConstraints, false, sizeof( bool ) * constraints.constraints.size() ) ;
|
|
1677 for ( j = 1 ; j < seIdxCnt - 1 ; ++j )
|
|
1678 {
|
|
1679 int k = subexonIdx[j] ;
|
|
1680 if ( alignments.readLen - ( subexons[k].end - subexons[k].start + 1 ) - 1 <= 30 )
|
|
1681 continue ;
|
|
1682 // We need at least one of the side subexons are adjacent to the center one.
|
|
1683 if ( subexons[ subexonIdx[j - 1] ].end + 1 < subexons[k].start && subexons[k].end + 1 < subexons[ subexonIdx[j + 1] ].start )
|
|
1684 continue ;
|
|
1685
|
|
1686 int l = 0 ;
|
|
1687 for ( l = 0 ; l < tcCnt ; ++l )
|
|
1688 {
|
|
1689 if ( btable[i].Test(l) && tc[l].abundance > 0 )
|
|
1690 {
|
|
1691 if ( !usedConstraints[ tc[l].i ] )
|
|
1692 {
|
|
1693 struct _constraint &c = constraints.constraints[ tc[l].i ] ;
|
|
1694 if ( c.vector.Test( subexonIdx[j - 1] ) && c.vector.Test( subexonIdx[j] ) &&
|
|
1695 c.vector.Test( subexonIdx[j + 1] ) )
|
|
1696 break ;
|
|
1697 usedConstraints[ tc[l].i ] = true ;
|
|
1698 }
|
|
1699
|
|
1700 if ( !usedConstraints[ tc[l].j ] )
|
|
1701 {
|
|
1702 struct _constraint &c = constraints.constraints[ tc[l].j ] ;
|
|
1703 if ( c.vector.Test( subexonIdx[j - 1] ) && c.vector.Test( subexonIdx[j] ) &&
|
|
1704 c.vector.Test( subexonIdx[j + 1] ) )
|
|
1705 break ;
|
|
1706 usedConstraints[ tc[l].j ] = true ;
|
|
1707 }
|
|
1708 }
|
|
1709 }
|
|
1710 // It is not covered
|
|
1711 if ( l >= tcCnt )
|
|
1712 {
|
|
1713 int residual = alignments.readLen - ( subexons[k].end - subexons[k].start + 1 ) - 1 ;
|
|
1714 //printf( "residual: %d %d %lf\n", k, residual, value ) ;
|
|
1715 if ( value * residual > 2 )
|
|
1716 {
|
|
1717 value = 1 / (double)residual ;
|
|
1718 }
|
|
1719 }
|
|
1720 }*/
|
|
1721
|
|
1722 if ( tag == -1 )
|
|
1723 value = 0 ;
|
|
1724 if ( value > maxAbundance )
|
|
1725 maxAbundance = value ;
|
|
1726 transcriptAbundance[i] = value ;
|
|
1727 if ( tag != -1 )
|
|
1728 avgTranscriptAbundance[i] /= compatibleCnt ;
|
|
1729
|
|
1730 //printf( "abundance %d: %lf %lf ", i, value, avgTranscriptAbundance[i] ) ;
|
|
1731 //alltranscripts[i].seVector.Print() ;
|
|
1732 }
|
|
1733 if ( maxAbundance == 0 )
|
|
1734 {
|
|
1735 for ( i = 0 ; i < atcnt ; ++i )
|
|
1736 {
|
|
1737 transcriptAbundance[i] = coveredPortion[i] ;
|
|
1738 }
|
|
1739 maxAbundance = 1 ;
|
|
1740 }
|
|
1741 //printf( "%s: %lf\n", __func__, maxAbundance ) ;
|
|
1742 int size = togetherChain.size() ;
|
|
1743 for ( j = 0 ; j < size ; ++j )
|
|
1744 togetherChain[j].Release() ;
|
|
1745 delete[] usedConstraints ;
|
|
1746 delete[] covered ;
|
|
1747 delete[] chain ;
|
|
1748 }
|
|
1749 else
|
|
1750 {
|
|
1751 for ( i = 0 ; i < atcnt ; ++i )
|
|
1752 {
|
|
1753 transcriptAbundance[i] = alltranscripts[i].abundance ;
|
|
1754 if ( transcriptAbundance[i] > maxAbundance )
|
|
1755 maxAbundance = transcriptAbundance[i] ;
|
|
1756 coveredPortion[i] = 1 ;
|
|
1757 }
|
|
1758 if ( maxAbundance == 0 )
|
|
1759 maxAbundance = 1 ;
|
|
1760 }
|
|
1761
|
|
1762 // Obtain the prefix/suffix information of the transcripts.
|
|
1763 int *nextSuffix, *nextPrefix ;
|
|
1764 struct _pair32 *txptRank ;
|
|
1765 nextSuffix = new int[atcnt] ;
|
|
1766 nextPrefix = new int[atcnt] ;
|
|
1767 txptRank = new struct _pair32[atcnt] ;
|
|
1768 memset( nextSuffix, -1, sizeof( int ) * atcnt ) ;
|
|
1769 memset( nextPrefix, -1, sizeof( int ) * atcnt ) ;
|
|
1770 /*for ( i = 0 ; i < atcnt ; ++i )
|
|
1771 {
|
|
1772 std::vector<int> subexonIdx ;
|
|
1773 txptRank[i].a = i ;
|
|
1774 alltranscripts[i].seVector.GetOnesIndices( subexonIdx ) ;
|
|
1775 txptRank[i].b = subexonIdx.size() ;
|
|
1776 }
|
|
1777 qsort( txptRank, atcnt, sizeof( struct _pair32 ), CompPairsByB) ;
|
|
1778 BitTable bufferTable( seCnt ) ;
|
|
1779 for ( i = atcnt - 1 ; i >= 0 ; --i )
|
|
1780 {
|
|
1781 int a = txptRank[i].a ;
|
|
1782 for ( j = i - 1 ; j >= 0 ; --j )
|
|
1783 {
|
|
1784 if ( txptRank[i].b == txptRank[j].b )
|
|
1785 continue ;
|
|
1786
|
|
1787 int b = txptRank[j].a ;
|
|
1788
|
|
1789 if ( alltranscripts[b].last != alltranscripts[a].last )
|
|
1790 continue ;
|
|
1791
|
|
1792 bufferTable.Assign( alltranscripts[a].seVector ) ;
|
|
1793 bufferTable.MaskRegionOutside( alltranscripts[b].first, alltranscripts[b].last ) ;
|
|
1794 if ( bufferTable.IsEqual( alltranscripts[b].seVector ) )
|
|
1795 {
|
|
1796 nextSuffix[a] = b ;
|
|
1797 break ;
|
|
1798 }
|
|
1799 }
|
|
1800 }
|
|
1801 for ( i = atcnt - 1 ; i >= 0 ; --i )
|
|
1802 {
|
|
1803 int a = txptRank[i].a ;
|
|
1804 for ( j = i - 1 ; j >= 0 ; --j )
|
|
1805 {
|
|
1806 if ( txptRank[i].b == txptRank[j].b )
|
|
1807 continue ;
|
|
1808
|
|
1809 int b = txptRank[j].a ;
|
|
1810
|
|
1811 if ( alltranscripts[b].first != alltranscripts[a].first )
|
|
1812 continue ;
|
|
1813
|
|
1814 bufferTable.Assign( alltranscripts[a].seVector ) ;
|
|
1815 bufferTable.MaskRegionOutside( alltranscripts[b].first, alltranscripts[b].last ) ;
|
|
1816 if ( bufferTable.IsEqual( alltranscripts[b].seVector ) )
|
|
1817 {
|
|
1818 nextPrefix[a] = b ;
|
|
1819 break ;
|
|
1820 }
|
|
1821 }
|
|
1822 }
|
|
1823
|
|
1824 bufferTable.Release() ;*/
|
|
1825 delete[] txptRank ;
|
|
1826
|
|
1827 // Quantitative set cover
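// Greedy quantitative set cover: in every round pick the transcript with the
// best score, deplete the smallest-abundance constraint it covers, and credit
// the released read support to that transcript.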
|
|
1828 int iterCnt = -1 ;
|
|
1829 double *coverCnt = new double[atcnt] ;
|
|
1830 for ( i = 0 ; i < atcnt ; ++i )
|
|
1831 coverCnt[i] = -1 ;
|
|
1832 int *list = new int[atcnt] ;
|
|
1833 int listCnt ;
|
|
1834
|
|
1835 while ( 1 )
|
|
1836 {
|
|
1837 double max = -1 ;
|
|
1838 int maxtag = -1 ;
|
|
1839 double maxcnt = -1 ;
|
|
1840 ++iterCnt ;
|
|
1841
|
|
1842 // Find the optimal candidate.
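// The score combines how much undepleted read support the transcript can still
// explain (cnt, scaled by coveredPortion), its bottleneck abundance relative to
// maxAbundance, and a weight that favors transcripts already selected in other
// samples when that information is available.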
|
|
1843 for ( i = 0 ; i < atcnt ; ++i )
|
|
1844 {
|
|
1845 double value = inf ;
|
|
1846 double cnt = 0 ;
|
|
1847
|
|
1848 if ( coverCnt[i] == -1 )
|
|
1849 {
|
|
1850 for ( j = 0 ; j < tcCnt ; ++j )
|
|
1851 {
|
|
1852 if ( tc[j].abundance > 0 && btable[i].Test( j ) )
|
|
1853 {
|
|
1854 cnt += tc[j].effectiveCount ;
|
|
1855 }
|
|
1856 }
|
|
1857 /*else
|
|
1858 {
|
|
1859 std::vector<int> tcIdx ;
|
|
1860 btable[i].GetOnesIndices( tcIdx ) ;
|
|
1861 int size = tcIdx.size() ;
|
|
1862 for ( j = 0 ; j < size ; ++j )
|
|
1863 {
|
|
1864 if ( tc[ tcIdx[j] ].abundance > 0 )
|
|
1865 {
|
|
1866 cnt += tc[ tcIdx[j] ].effectiveCount ;
|
|
1867 }
|
|
1868 }
|
|
1869 }*/
|
|
1870 coverCnt[i] = cnt ;
|
|
1871 }
|
|
1872 else
|
|
1873 {
|
|
1874 cnt = coverCnt[i] ;
|
|
1875 }
|
|
1876
|
|
1877 value = transcriptAbundance[i] ;
|
|
1878 if ( cnt < 1 ) // This transcript does not satisfy any undepleted constraints.
|
|
1879 continue ;
|
|
1880 cnt *= coveredPortion[i] ;
|
|
1881
|
|
1882 double weight = 1 ; //* seCnt / transcriptSeCnt[i] ;
|
|
1883 //if ( maxAbundance >= 1 && value / maxAbundance >= 0.2 )
|
|
1884 // seCntAdjust = sqrt( (double)( transcriptSeCnt[i] ) / seCnt ) ;//< 0.5 ? 0.5 : (double)( transcriptSeCnt[i] ) / seCnt ;
|
|
1885
|
|
1886 if ( alltranscripts[i].FPKM > 0 && sampleCnt > 1 )
|
|
1887 weight = ( 1 + alltranscripts[i].FPKM / sampleCnt ) ;
|
|
1888
|
|
1889 double score = ComputeScore( cnt, weight, value, maxAbundance, alltranscripts[i].correlationScore ) ;
|
|
1890 if ( cnt > maxcnt )
|
|
1891 maxcnt = cnt ;
|
|
1892 score += adjustScore[i] ;
|
|
1893 if ( score > max )
|
|
1894 {
|
|
1895 max = score ;
|
|
1896 maxtag = i ;
|
|
1897 }
|
|
1898 else if ( score == max )
|
|
1899 {
|
|
1900 if ( avgTranscriptAbundance[maxtag] < avgTranscriptAbundance[i] )
|
|
1901 {
|
|
1902 max = score ;
|
|
1903 maxtag = i ;
|
|
1904 }
|
|
1905 }
|
|
1906 //printf( "score: %d %lf -> %lf\n", i, cnt, score ) ;
|
|
1907 }
|
|
1908
|
|
1909 if ( maxcnt == 0 || maxtag == -1 )
|
|
1910 break ;
|
|
1911
|
|
1912 // Find the constraint that should be depleted.
|
|
1913 double update = inf ;
|
|
1914 int updateTag = 0 ;
|
|
1915 for ( j = 0 ; j < tcCnt ; ++j )
|
|
1916 {
|
|
1917 if ( btable[ maxtag ].Test( j ) && tc[j].abundance > 0 &&
|
|
1918 tc[j].abundance <= update )
|
|
1919 {
|
|
1920 update = tc[j].abundance ;
|
|
1921 updateTag = j ;
|
|
1922 }
|
|
1923 }
|
|
1924
|
|
1925 // Search the suffix and prefix transcripts to see whether they fit better.
|
|
1926 int p = nextSuffix[ maxtag] ;
|
|
1927 while ( p != -1 )
|
|
1928 {
|
|
1929 if ( transcriptAbundance[p] >= 10.0 * transcriptAbundance[maxtag]
|
|
1930 && btable[p].Test( updateTag ) )
|
|
1931 {
|
|
1932 //printf( "%d\n", p ) ;
|
|
1933 maxtag = p ;
|
|
1934 break ;
|
|
1935 }
|
|
1936 p = nextSuffix[p] ;
|
|
1937 }
|
|
1938 p = nextPrefix[maxtag] ;
|
|
1939 while ( p != -1 )
|
|
1940 {
|
|
1941 if ( transcriptAbundance[p] >= 10.0 * transcriptAbundance[maxtag]
|
|
1942 && btable[p].Test( updateTag ) )
|
|
1943 {
|
|
1944 maxtag = p ;
|
|
1945 break ;
|
|
1946 }
|
|
1947 p = nextPrefix[p] ;
|
|
1948 }
|
|
1949
|
|
1950
|
|
1951 // Update the abundance.
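// Deplete every compatible, still-positive constraint by "update" and credit
// the released support (support * update / normAbund * effectiveCount) to the
// chosen transcript; for constraints that are already depleted, part of the
// support previously assigned to other transcripts is shifted over in
// proportion to the transcripts' abundance estimates.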
|
|
1952 int supportCnt = 0 ;
|
|
1953 for ( j = 0 ; j < tcCnt ; ++j )
|
|
1954 {
|
|
1955 if ( btable[maxtag].Test( j ) )
|
|
1956 {
|
|
1957 if ( tc[j].abundance > 0 )
|
|
1958 {
|
|
1959 tc[j].abundance -= 1 * update ;
|
|
1960 double factor = tc[j].effectiveCount ;
|
|
1961 double tmp = ( tc[j].support * update / tc[j].normAbund * factor ) ;
|
|
1962 alltranscripts[maxtag].constraintsSupport[j] += tmp ;
|
|
1963 alltranscripts[maxtag].abundance += tmp ;
|
|
1964
|
|
1965 if ( tc[j].abundance <= 0 )
|
|
1966 {
|
|
1967 int l ;
|
|
1968 for ( l = 0 ; l < atcnt ; ++l )
|
|
1969 {
|
|
1970 if ( btable[l].Test(j) )
|
|
1971 coverCnt[l] -= tc[j].effectiveCount ;
|
|
1972 }
|
|
1973 }
|
|
1974 ++supportCnt ;
|
|
1975 }
|
|
1976 else if ( alltranscripts[maxtag].constraintsSupport[j] == 0 )
|
|
1977 {
|
|
1978 double sum = 0 ;
|
|
1979 double takeOut = 0 ;
|
|
1980 double factor = tc[j].effectiveCount ;
|
|
1981 listCnt = 0 ;
|
|
1982 for ( i = 0 ; i < atcnt ; ++i )
|
|
1983 {
|
|
1984 if ( i == maxtag )
|
|
1985 continue ;
|
|
1986
|
|
1987 if ( alltranscripts[i].abundance > 0 && btable[i].Test(j) )
|
|
1988 {
|
|
1989 sum += alltranscripts[i].constraintsSupport[j] ;
|
|
1990
|
|
1991 double tmp = ( alltranscripts[i].constraintsSupport[j] + alltranscripts[maxtag].constraintsSupport[j] ) *
|
|
1992 transcriptAbundance[maxtag] / ( transcriptAbundance[maxtag] + transcriptAbundance[i] )
|
|
1993 - alltranscripts[maxtag].constraintsSupport[j] ;
|
|
1994 if ( tmp > 0 )
|
|
1995 {
|
|
1996 list[ listCnt ] = i ;
|
|
1997 ++listCnt ;
|
|
1998 takeOut += tmp ; //alltranscripts[i].constraintsSupport[j] * transcriptAbundance[maxtag] / ( transcriptAbundance[maxtag] + transcriptAbundance[i] ) ;
|
|
1999 }
|
|
2000 }
|
|
2001 }
|
|
2002
|
|
2003 double ratio = 1 ;
|
|
2004 double takeOutFactor = 0.5 ;
|
|
2005 if ( update < tc[j].normAbund )
|
|
2006 {
|
|
2007 if ( takeOut > ( tc[j].support * update / tc[j].normAbund * factor ) * takeOutFactor )
|
|
2008 ratio = ( tc[j].support * update / tc[j].normAbund * factor ) * takeOutFactor / takeOut ;
|
|
2009 }
|
|
2010 else
|
|
2011 {
|
|
2012 if ( takeOut > ( tc[j].support * factor ) * takeOutFactor )
|
|
2013 ratio = ( tc[j].support * factor ) * takeOutFactor / takeOut ;
|
|
2014 }
|
|
2015
|
|
2016 if ( 1 ) //update < tc[j].normAbund )
|
|
2017 {
|
|
2018 for ( i = 0 ; i < listCnt ; ++i )
|
|
2019 {
|
|
2020 //double tmp = ( tc[j].support * update / tc[j].normAbund * factor ) *
|
|
2021 // ( alltranscripts[ list[i] ].constraintsSupport[j] / sum ) ;
|
|
2022 //if ( alltranscripts[ list[i] ].constraintsSupport[j] < tmp )
|
|
2023 // printf( "WARNING! %lf %lf, %lf\n", alltranscripts[ list[i] ].constraintsSupport[j], sum, tmp ) ;
|
|
2024
|
|
2025 //double tmp = alltranscripts[ list[i] ].constraintsSupport[j] * transcriptAbundance[maxtag] / ( transcriptAbundance[maxtag] + transcriptAbundance[ list[i] ] ) * ratio ;
|
|
2026 double tmp = ( ( alltranscripts[ list[i] ].constraintsSupport[j] + alltranscripts[maxtag].constraintsSupport[j] ) *
|
|
2027 transcriptAbundance[maxtag] / ( transcriptAbundance[maxtag] + transcriptAbundance[ list[i] ] )
|
|
2028 - alltranscripts[maxtag].constraintsSupport[j] ) * ratio ;
|
|
2029
|
|
2030
|
|
2031 alltranscripts[ list[i] ].constraintsSupport[j] -= tmp ;
|
|
2032 alltranscripts[ list[i] ].abundance -= tmp ;
|
|
2033 }
|
|
2034 //double tmp = ( tc[j].support * update / tc[j].normAbund * factor ) ;
|
|
2035 //printf( "%lf %lf. %lf %lf\n", takeOut, ratio, update, tc[j].normAbund ) ;
|
|
2036 double tmp = takeOut * ratio ;
|
|
2037 alltranscripts[maxtag].constraintsSupport[j] += tmp ;
|
|
2038 alltranscripts[maxtag].abundance += tmp ;
|
|
2039 }
|
|
2040 /*else
|
|
2041 {
|
|
2042 double tmp = ( tc[j].support / (double)( listCnt + 1 ) ) * factor ;
|
|
2043 for ( i = 0 ; i < listCnt ; ++i )
|
|
2044 {
|
|
2045 alltranscripts[ list[i] ].abundance -= alltranscripts[ list[i] ].constraintsSupport[j] ;
|
|
2046
|
|
2047 alltranscripts[ list[i] ].constraintsSupport[j] = tmp ;
|
|
2048 alltranscripts[ list[i] ].abundance += tmp ;
|
|
2049 }
|
|
2050 alltranscripts[maxtag].constraintsSupport[j] += tmp ;
|
|
2051 alltranscripts[maxtag].abundance += tmp ;
|
|
2052 }*/
|
|
2053
|
|
2054 }
|
|
2055 }
|
|
2056
|
|
2057 if ( tc[j].abundance < 0 )
|
|
2058 {
|
|
2059 tc[j].abundance = 0 ;
|
|
2060
|
|
2061 }
|
|
2062 }
|
|
2063 tc[ updateTag ].abundance = 0 ;
|
|
2064 if ( supportCnt == 0 )
|
|
2065 break ;
|
|
2066 //adjustScore[maxtag] += 1 / (double)tcCnt ;
|
|
2067 //printf( "maxtag=%d %lf %d\n", maxtag, update, updateTag ) ;
|
|
2068 }
|
|
2069
|
|
2070 for ( i = 0 ; i < atcnt ; ++i )
|
|
2071 {
|
|
2072 if ( alltranscripts[i].abundance > 0 )
|
|
2073 {
|
|
2074 struct _transcript nt = alltranscripts[i] ;
|
|
2075 nt.seVector.Nullify() ;
|
|
2076 nt.seVector.Duplicate( alltranscripts[i].seVector ) ;
|
|
2077 nt.constraintsSupport = NULL ;
|
|
2078 if ( transcriptAbundance[i] == 0 )
|
|
2079 nt.correlationScore = -1 ;
|
|
2080 else
|
|
2081 nt.correlationScore = 0 ;
|
|
2082 nt.id = i ;
|
|
2083 transcripts.push_back( nt ) ;
|
|
2084 }
|
|
2085 }
|
|
2086
|
|
2087 // Release the memory of btable.
|
|
2088 for ( i = 0 ; i < atcnt ; ++i )
|
|
2089 {
|
|
2090 delete[] alltranscripts[i].constraintsSupport ;
|
|
2091 btable[i].Release() ;
|
|
2092 }
|
|
2093 delete[] btable ;
|
|
2094
|
|
2095 delete[] list ;
|
|
2096 delete[] transcriptSeCnt ;
|
|
2097 delete[] transcriptLength ;
|
|
2098 delete[] transcriptAbundance ;
|
|
2099 delete[] avgTranscriptAbundance ;
|
|
2100 delete[] coveredPortion ;
|
|
2101 delete[] adjustScore ;
|
|
2102 delete[] coverCnt ;
|
|
2103
|
|
2104 delete[] nextPrefix ;
|
|
2105 delete[] nextSuffix ;
|
|
2106
|
|
2107 // Redistribute weight if there are some constraints that are unbalanced.
|
|
2108 /*tcnt = transcripts.size() ;
|
|
2109 for ( i = 0 ; i < tcnt ; ++i )
|
|
2110 {
|
|
2111 int maxRatio = -1 ;
|
|
2112 for ( j = 0 ; j < tcCnt ; ++j )
|
|
2113 if ( transcripts[i].constraintsSupport[j] > 0 )
|
|
2114 {
|
|
2115 double factor = tc[j].effectiveCount ;
|
|
2116 if ( transcripts[])
|
|
2117 }
|
|
2118 }*/
|
|
2119 }
|
|
2120
|
|
2121 void TranscriptDecider::AbundanceEstimation( struct _subexon *subexons, int seCnt, Constraints &constraints, std::vector<struct _transcript> &transcripts )
|
|
2122 {
|
|
2123 int tcnt = transcripts.size() ;
|
|
2124 int size ;
|
|
2125 int i, j ;
|
|
2126 if ( tcnt <= 0 )
|
|
2127 return ;
|
|
2128
|
|
2129 std::vector<struct _matePairConstraint> &tc = constraints.matePairs ;
|
|
2130 int tcCnt = tc.size() ; // transcript constraints
|
|
2131
|
|
2132 BitTable *btable = new BitTable[ tcnt ] ;
|
|
2133 int *transcriptLength = new int[tcnt] ;
|
|
2134 int *compatibleList = new int[tcnt] ;
|
|
2135 double *rho = new double[tcnt] ; // the abundance.
|
|
2136 int iterCnt = 0 ;
|
|
2137
|
|
2138 for ( i = 0 ; i < tcnt ; ++i )
|
|
2139 transcripts[i].constraintsSupport = new double[ tcCnt ] ;
|
|
2140
|
|
2141 for ( i = 0 ; i < tcnt ; ++i )
|
|
2142 {
|
|
2143 btable[i].Init( tcCnt ) ;
|
|
2144 double min = -1 ;
|
|
2145 for ( j = 0 ; j < tcCnt ; ++j )
|
|
2146 {
|
|
2147 int a = tc[j].i ;
|
|
2148 int b = tc[j].j ;
|
|
2149
|
|
2150 if ( IsConstraintInTranscript( transcripts[i], constraints.constraints[a] ) == 1
|
|
2151 && IsConstraintInTranscript( transcripts[i], constraints.constraints[b] ) == 1 )
|
|
2152 {
|
|
2153 //printf( "set btble[ %d ].Set( %d ): %d %d\n", i, j, a, b ) ;
|
|
2154 btable[i].Set( j ) ;
|
|
2155
|
|
2156 if ( min == -1 || tc[j].normAbund < min )
|
|
2157 min = tc[j].normAbund ;
|
|
2158 }
|
|
2159 }
|
|
2160
|
|
2161 std::vector<int> subexonIdx ;
|
|
2162 transcripts[i].seVector.GetOnesIndices( subexonIdx ) ;
|
|
2163 int subexonIdxCnt = subexonIdx.size() ;
|
|
2164 int len = 0 ;
|
|
2165 for ( j = 0 ; j < subexonIdxCnt ; ++j )
|
|
2166 len += subexons[ subexonIdx[j] ].end - subexons[ subexonIdx[j] ].start + 1 ;
|
|
2167 transcriptLength[i] = len - alignments.fragLen + 2 * alignments.fragStdev ;
|
|
2168 if ( transcriptLength[i] < 1 )
|
|
2169 transcriptLength[i] = 1 ;
|
|
2170 rho[i] = transcripts[i].abundance / transcriptLength[i] ; // use the rough estimation generated before.
|
|
2171 if ( transcripts[i].correlationScore == -1 && rho[i] > 0.1 / (double)alignments.readLen )
|
|
2172 rho[i] = 0.1 / (double)alignments.readLen ;
|
|
2173 }
|
|
2174
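// EM-style refinement: repeatedly split each mate-pair constraint's support
// among the transcripts that contain it, proportionally to their current
// abundances (rho), then recompute rho from the assigned support, until the
// total change falls below 1e-3 or 1000 iterations have run.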
|
|
2175 while ( 1 )
|
|
2176 {
|
|
2177 for ( i = 0 ; i < tcnt ; ++i )
|
|
2178 for ( j = 0 ; j < tcCnt ; ++j )
|
|
2179 {
|
|
2180 transcripts[i].constraintsSupport[j] = 0 ;
|
|
2181 }
|
|
2182 for ( j = 0 ; j < tcCnt ; ++j )
|
|
2183 {
|
|
2184 int clCnt = 0 ;
|
|
2185 double sum = 0 ;
|
|
2186 for ( i = 0 ; i < tcnt ; ++i )
|
|
2187 {
|
|
2188 if ( btable[i].Test(j) )
|
|
2189 {
|
|
2190 compatibleList[ clCnt ] = i ;
|
|
2191 ++clCnt ;
|
|
2192 sum += rho[i] ;
|
|
2193 }
|
|
2194 }
|
|
2195
|
|
2196 for ( i = 0 ; i < clCnt ; ++i )
|
|
2197 {
|
|
2198 double factor = tc[j].effectiveCount ;
|
|
2199 transcripts[ compatibleList[i] ].constraintsSupport[j] = ( rho[ compatibleList[i] ] / sum ) * tc[j].support * factor ;
|
|
2200 }
|
|
2201 }
|
|
2202
|
|
2203 double diff = 0 ;
|
|
2204 for ( i = 0 ; i < tcnt ; ++i )
|
|
2205 {
|
|
2206 double newAbund = 0 ;
|
|
2207 for ( j = 0 ; j < tcCnt ; ++j )
|
|
2208 newAbund += transcripts[i].constraintsSupport[j] ;
|
|
2209 double old = rho[i] ;
|
|
2210 rho[i] = newAbund / transcriptLength[i] ;
|
|
2211 //printf( "rho[%d]=%lf\n", i, rho[i] ) ;
|
|
2212 if ( transcripts[i].correlationScore == -1 && rho[i] > 0.1 / (double)alignments.readLen )
|
|
2213 rho[i] = 0.1 / (double)alignments.readLen ;
|
|
2214
|
|
2215 double tmp = ( old - rho[i] ) ;
|
|
2216 diff += tmp < 0 ? -tmp : tmp ;
|
|
2217 }
|
|
2218 //printf( "%lf\n", diff ) ;
|
|
2219 if ( diff < 1e-3)
|
|
2220 break ;
|
|
2221
|
|
2222 ++iterCnt ;
|
|
2223 if ( iterCnt >= 1000 )
|
|
2224 break ;
|
|
2225 }
|
|
2226
|
|
2227 for ( i = 0 ; i < tcnt ; ++i )
|
|
2228 {
|
|
2229 //printf( "%lf=>", transcripts[i].abundance ) ;
|
|
2230 transcripts[i].abundance = 0 ;
|
|
2231 for ( j = 0 ; j < tcCnt ; ++j )
|
|
2232 {
|
|
2233 transcripts[i].abundance += transcripts[i].constraintsSupport[j] ;
|
|
2234 }
|
|
2235 //printf( "%lf. (%lf)\n", transcripts[i].abundance, transcripts[i].correlationScore ) ;
|
|
2236 //transcripts[i].seVector.Print() ;
|
|
2237 }
|
|
2238
|
|
2239 for ( i = 0 ; i < tcnt ; ++i )
|
|
2240 delete[] transcripts[i].constraintsSupport ;
|
|
2241
|
|
2242 // Release the memory of btable.
|
|
2243 for ( i = 0 ; i < tcnt ; ++i )
|
|
2244 {
|
|
2245 btable[i].Release() ;
|
|
2246 }
|
|
2247 delete[] compatibleList ;
|
|
2248 delete[] btable ;
|
|
2249 delete[] transcriptLength ;
|
|
2250 delete[] rho ;
|
|
2251 }
|
|
2252
|
|
2253 int TranscriptDecider::RefineTranscripts( struct _subexon *subexons, int seCnt, bool aggressive,
|
|
2254 std::map<int, int> *subexonChainSupport, int *txptSampleSupport, std::vector<struct _transcript> &transcripts, Constraints &constraints )
|
|
2255 {
|
|
2256 int i, j, k ;
|
|
2257 int tcnt = transcripts.size() ;
|
|
2258 if ( tcnt == 0 )
|
|
2259 return 0 ;
|
|
2260 int tcCnt = constraints.matePairs.size() ;
|
|
2261
|
|
2262 std::vector<struct _matePairConstraint> &tc = constraints.matePairs ;
|
|
2263 std::vector<struct _constraint> &scc = constraints.constraints ; // single-end constraints
|
|
2264
|
|
2265 // Remove transcripts whose FPKM is too small.
|
|
2266 //printf( "%d %d\n", usedGeneId, baseGeneId ) ;
|
|
2267 double *geneMaxFPKM = new double[usedGeneId - baseGeneId ] ;
|
|
2268 int *geneMaxFPKMTag = new int[usedGeneId - baseGeneId ] ;
|
|
2269 double *nonOverlapMaxFPKM = new double[ usedGeneId - baseGeneId ] ; // the max FPKM among all the transcripts not overlapping with maxFPKMTag transcripts.
|
|
2270 memset( geneMaxFPKM, 0, sizeof( double ) * ( usedGeneId - baseGeneId ) ) ;
|
|
2271 memset( geneMaxFPKMTag, 0, sizeof( int ) * ( usedGeneId - baseGeneId ) ) ;
|
|
2272 memset( nonOverlapMaxFPKM, 0, sizeof( double ) * ( usedGeneId - baseGeneId ) ) ;
|
|
2273
|
|
2274 double *geneMaxCov = new double[ usedGeneId - baseGeneId ] ;
|
|
2275 memset( geneMaxCov, 0, sizeof( double ) * ( usedGeneId - baseGeneId ) ) ;
|
|
2276 int *txptGid = new int[tcnt] ;
|
|
2277
|
|
2278 /*for ( i = 0 ; i < tcnt ; ++i )
|
|
2279 {
|
|
2280 printf( "%d: %lf ", i, transcripts[i].FPKM ) ;
|
|
2281 transcripts[i].seVector.Print() ;
|
|
2282 }*/
|
|
2283
|
|
2284 /*==================================================================
|
|
2285 Remove transcripts that have too low relative FPKM. (-f)
|
|
2286 ====================================================================*/
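// First pass: record each gene's maximum FPKM, the transcript attaining it,
// and the gene's maximum per-base coverage; these are the reference levels for
// the relative-FPKM (-f) and read-depth (-d) filters below.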
|
|
2287 for ( i = 0 ; i < tcnt ; ++i )
|
|
2288 {
|
|
2289 int gid = GetTranscriptGeneId( transcripts[i], subexons ) ;
|
|
2290 int len = GetTranscriptLengthFromAbundanceAndFPKM( transcripts[i].abundance, transcripts[i].FPKM ) ;
|
|
2291 //printf( "gid=%d\n", gid ) ;
|
|
2292 //printf( "%lf %lf %d\n", transcripts[i].abundance, transcripts[i].FPKM, len ) ;
|
|
2293 if ( transcripts[i].FPKM > geneMaxFPKM[gid - baseGeneId ] )
|
|
2294 {
|
|
2295 geneMaxFPKM[ gid - baseGeneId ] = transcripts[i].FPKM ;
|
|
2296 geneMaxFPKMTag[ gid - baseGeneId ] = i ;
|
|
2297 }
|
|
2298 if ( transcripts[i].abundance * alignments.readLen / len > geneMaxCov[gid - baseGeneId ] )
|
|
2299 geneMaxCov[gid - baseGeneId] = ( transcripts[i].abundance * alignments.readLen ) / len ;
|
|
2300 txptGid[i] = gid ;
|
|
2301 }
|
|
2302
|
|
2303 for ( i = 0 ; i < tcnt ; ++i )
|
|
2304 {
|
|
2305 int tag = txptGid[i] - baseGeneId ;
|
|
2306 if ( ( transcripts[i].last < transcripts[ geneMaxFPKMTag[ tag ] ].first
|
|
2307 || transcripts[i].first > transcripts[ geneMaxFPKMTag[tag] ].last ) && transcripts[i].FPKM > nonOverlapMaxFPKM[tag] )
|
|
2308 nonOverlapMaxFPKM[tag] = transcripts[i].FPKM ;
|
|
2309 }
|
|
2310 BitTable bufferTable ;
|
|
2311 bufferTable.Duplicate( transcripts[0].seVector ) ;
|
|
2312
|
|
2313 if ( !aggressive )
|
|
2314 {
|
|
2315 // Rescue the transcripts covering unique constraints.
|
|
2316 int cnt = 0 ;
|
|
2317 int tag = 0 ;
|
|
2318 int *uniqCount = new int[tcnt] ;
|
|
2319 memset( uniqCount, 0, sizeof( int ) * tcnt ) ;
|
|
2320 for ( j = 0 ; j < tcCnt ; ++j )
|
|
2321 {
|
|
2322 cnt = 0 ;
|
|
2323 if ( tc[j].uniqSupport <= 5 )
|
|
2324 continue ;
|
|
2325 for ( i = 0 ; i < tcnt ; ++i )
|
|
2326 {
|
|
2327 if ( IsConstraintInTranscript( transcripts[i], scc[ tc[j].i ] ) &&
|
|
2328 IsConstraintInTranscript( transcripts[i], scc[ tc[j].j] ) )
|
|
2329 {
|
|
2330 tag = i ;
|
|
2331 ++cnt ;
|
|
2332 }
|
|
2333 if ( cnt >= 2 )
|
|
2334 break ;
|
|
2335 }
|
|
2336 if ( cnt == 1 )
|
|
2337 {
|
|
2338 ++uniqCount[tag] ;
|
|
2339 }
|
|
2340 }
|
|
2341 for ( i = 0 ; i < tcnt ; ++i )
|
|
2342 {
|
|
2343 if ( uniqCount[i] >= 2 )
|
|
2344 {
|
|
2345 transcripts[i].abundance *= 4 ;
|
|
2346 transcripts[i].FPKM *= 4 ;
|
|
2347 }
|
|
2348 }
|
|
2349
|
|
2350 delete[] uniqCount ;
|
|
2351 }
|
|
2352
|
|
2353 int sccCnt = scc.size() ;
|
|
2354 double filterFactor = 1.0 ;
|
|
2355
|
|
2356 for ( i = 0 ; i < tcnt ; ++i )
|
|
2357 {
|
|
2358 //printf( "%d: %lf %lf\n", txptGid[i], transcripts[i].abundance, geneMaxFPKM[ txptGid[i] - baseGeneId ] ) ;
|
|
2359
|
|
2360 if ( transcripts[i].FPKM < filterFactor * FPKMFraction * geneMaxFPKM[ txptGid[i] - baseGeneId ] )
|
|
2361 {
|
|
2362 /*int cnt = 0 ;
|
|
2363 int coverCnt = 0 ;
|
|
2364 for ( j = 0 ; j < tcCnt ; ++j )
|
|
2365 {
|
|
2366 if ( transcripts[i].constraintsSupport[j] > 0 )
|
|
2367 ++coverCnt ;
|
|
2368 double factor = tc[j].effectiveCount ;
|
|
2369 if ( transcripts[i].constraintsSupport[j] >= factor * tc[j].support - 1e-3
|
|
2370 && tc[j].support >= 10
|
|
2371 && tc[j].uniqSupport >= 0.95 * tc[j].support )
|
|
2372 {
|
|
2373 ++cnt ;
|
|
2374 }
|
|
2375 }
|
|
2376 //cnt = 0 ;
|
|
2377 if ( cnt >= 2 )
|
|
2378 {
|
|
2379 ;
|
|
2380 }
|
|
2381 else*/
|
|
2382 transcripts[i].abundance = -transcripts[i].abundance ;
|
|
2383 }
|
|
2384 //if ( transcripts[i].FPKM >= 0.8 * geneMaxFPKM[ txptGid[i] - baseGeneId ] && geneMaxCov[ txptGid[i] - baseGeneId ] >= txptMinReadDepth )
|
|
2385 // continue ;
|
|
2386 }
|
|
2387
|
|
2388 if ( nonOverlapMaxFPKM != 0 )
|
|
2389 {
|
|
2390 // Go through two iterations to rescue; the first iteration is just for marking.
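// A filtered transcript is rescued when it overlaps no kept transcript of the
// same gene and its FPKM reaches the fraction threshold relative to the gene's
// best non-overlapping transcript.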
|
|
2391 std::vector<int> rescueList ;
|
|
2392 for ( i = 0 ; i < tcnt ; ++i )
|
|
2393 {
|
|
2394 if ( transcripts[i].abundance >= 0 )
|
|
2395 continue ;
|
|
2396
|
|
2397 for ( j = 0 ; j < tcnt ; ++j )
|
|
2398 {
|
|
2399 if ( transcripts[j].abundance < 0 || txptGid[i] != txptGid[j] )
|
|
2400 continue ;
|
|
2401 if ( transcripts[i].first <= transcripts[j].last && transcripts[i].last >= transcripts[j].first )
|
|
2402 /*bufferTable.Assign( transcripts[i].seVector ) ;
|
|
2403 bufferTable.And( transcripts[j].seVector ) ;
|
|
2404
|
|
2405 if ( !bufferTable.IsAllZero() )*/
|
|
2406 break ;
|
|
2407 }
|
|
2408 if ( j >= tcnt && transcripts[i].FPKM >= FPKMFraction * nonOverlapMaxFPKM[ txptGid[i] - baseGeneId ] )
|
|
2409 {
|
|
2410 //transcripts[i].abundance = -transcripts[i].abundance ;
|
|
2411 rescueList.push_back( i ) ;
|
|
2412 }
|
|
2413 }
|
|
2414
|
|
2415 int size = rescueList.size() ;
|
|
2416 for ( i = 0 ; i < size ; ++i )
|
|
2417 transcripts[ rescueList[i] ].abundance *= -1 ;
|
|
2418 }
|
|
2419
|
|
2420 /*==================================================================
|
|
2421 Remove transcripts that have too little read coverage (-d)
|
|
2422 ====================================================================*/
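// Drop a transcript whose per-base coverage is below txptMinReadDepth, unless
// it is the most-covered transcript of its gene; in lowly covered genes
// (max coverage <= 5) it is only dropped when the compatible single-end
// constraints already cover all of its subexons.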
|
|
2423 for ( i = 0 ; i < tcnt ; ++i )
|
|
2424 {
|
|
2425 if ( transcripts[i].abundance >= 0 )
|
|
2426 {
|
|
2427 int len = GetTranscriptLengthFromAbundanceAndFPKM( transcripts[i].abundance, transcripts[i].FPKM ) ;
|
|
2428 double cov = ( transcripts[i].abundance * alignments.readLen ) / len ;
|
|
2429 //printf( "%d: %d %d %lf %lf\n", i, len, transcripts[i].seVector.Count(), cov, geneMaxCov[ txptGid[i] - baseGeneId ] ) ;
|
|
2430
|
|
2431 if ( ( tcnt > 1 || len <= 1000 || transcripts[i].seVector.Count() <= 3 ) && cov < txptMinReadDepth )
|
|
2432 {
|
|
2433 //if ( usedGeneId == baseGeneId + 1 && /*transcripts[i].seVector.Count() > 3
|
|
2434 // && len > 1000 &&*/ geneMaxCov[ txptGid[i] - baseGeneId ] == cov )
|
|
2435 if ( geneMaxCov[ txptGid[i] - baseGeneId ] == cov )
|
|
2436 continue ;
|
|
2437
|
|
2438 // Test whether it has some very abundant constraints.
|
|
2439 /*int cnt = 0 ;
|
|
2440 for ( j = 0 ; j < tcCnt ; ++j )
|
|
2441 {
|
|
2442 if ( transcripts[i].constraintsSupport[j] >= tc[j].support / 2.0
|
|
2443 && tc[j].support >= 10
|
|
2444 && tc[j].uniqSupport >= 0.95 * tc[j].support
|
|
2445 && tc[j].normAbund >= 1 )
|
|
2446 {
|
|
2447 ++cnt ;
|
|
2448 }
|
|
2449 }
|
|
2450
|
|
2451 if ( cnt >= 1 )
|
|
2452 {
|
|
2453 continue ;
|
|
2454 }*/
|
|
2455
|
|
2456 // Test whether this transcript is fully covered. If so, we can filter it.
|
|
2457
|
|
2458 if ( geneMaxCov[ txptGid[i] - baseGeneId ] <= 5 )
|
|
2459 {
|
|
2460 bufferTable.Reset() ;
|
|
2461 for ( j = 0 ; j < sccCnt ; ++j )
|
|
2462 {
|
|
2463 if ( !IsConstraintInTranscript( transcripts[i], scc[j] ) )
|
|
2464 continue ;
|
|
2465 bufferTable.Or( scc[j].vector ) ;
|
|
2466 }
|
|
2467 if ( bufferTable.IsEqual( transcripts[i].seVector ) )
|
|
2468 transcripts[i].abundance = -transcripts[i].abundance ;
|
|
2469 }
|
|
2470 else
|
|
2471 transcripts[i].abundance = -transcripts[i].abundance ;
|
|
2472
|
|
2473 /*else
|
|
2474 {
|
|
2475 transcripts[i].seVector.Print() ;
|
|
2476 bufferTable.Print() ;
|
|
2477 OutputTranscript( stderr, subexons, transcripts[i] ) ;
|
|
2478 }*/
|
|
2479 }
|
|
2480 }
|
|
2481 }
|
|
2482
|
|
2483 /*==================================================================
|
|
2484 Remove transcripts that are too short
|
|
2485 ====================================================================*/
|
|
2486 for ( i = 0 ; i < tcnt ; ++i )
|
|
2487 {
|
|
2488 if ( transcripts[i].abundance <= 0 )
|
|
2489 continue ;
|
|
2490
|
|
2491 int len = GetTranscriptLengthFromAbundanceAndFPKM( transcripts[i].abundance, transcripts[i].FPKM ) ;
|
|
2492 if ( len < 200 )
|
|
2493 {
|
|
2494 transcripts[i].abundance = -transcripts[i].abundance ;
|
|
2495 }
|
|
2496 }
|
|
2497
|
|
2498 // Rescue transcripts that showed up in many samples.
|
|
2499 /*for ( i = 0 ; i < tcnt ; ++i )
|
|
2500 {
|
|
2501 if ( transcripts[i].abundance > 0 )
|
|
2502 continue ;
|
|
2503 if ( txptSampleSupport[ transcripts[i].id ] >= 3 &&
|
|
2504 txptSampleSupport[transcripts[i].id ] >= (int)( sampleCnt / 2 ) )
|
|
2505 transcripts[i].abundance = -transcripts[i].abundance ;
|
|
2506 }*/
|
|
2507
|
|
2508 // Rescue some transcripts covering subexon chains that showed up in many samples but are missing after filtration.
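// A chain supported by enough samples that appears only in filtered
// transcripts marks those transcripts as recovery candidates; the candidates
// are then re-admitted in order of their cross-sample support.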
|
|
2509 struct _constraint tmpC ;
|
|
2510 tmpC.vector.Init( seCnt ) ;
|
|
2511
|
|
2512 std::vector< struct _pair32 > missingChain ;
|
|
2513 std::vector<int> recoverCandidate ;
|
|
2514 bool *used = new bool[tcnt] ;
|
|
2515 memset( used, false, sizeof( bool ) * tcnt ) ;
|
|
2516
|
|
2517 // Obtain the list of transcripts that should be recovered.
|
|
2518 for ( i = 0 ; i < seCnt && sampleCnt > 1 ; ++i )
|
|
2519 {
|
|
2520
|
|
2521 double maxFPKM = -1 ;
|
|
2522 for ( std::map<int, int>::iterator it = subexonChainSupport[i].begin() ;
|
|
2523 it != subexonChainSupport[i].end() ; ++it )
|
|
2524 {
|
|
2525 if ( sampleCnt >= 0 && ( it->second < 3 || it->second < (int)( 0.5 * sampleCnt ) ) && it->second <= sampleCnt / 2 )
|
|
2526 continue ;
|
|
2527
|
|
2528 bool recover = true ;
|
|
2529 tmpC.vector.Reset() ;
|
|
2530 tmpC.vector.Set( i ) ;
|
|
2531 tmpC.vector.Set( it->first ) ;
|
|
2532 tmpC.first = i ;
|
|
2533 tmpC.last = it->first ;
|
|
2534
|
|
2535
|
|
2536 for ( j = 0 ; j < tcnt ; ++j )
|
|
2537 {
|
|
2538 if ( transcripts[j].abundance < 0 )
|
|
2539 continue ;
|
|
2540
|
|
2541 if ( IsConstraintInTranscript( transcripts[j], tmpC ) )
|
|
2542 {
|
|
2543 recover = false ;
|
|
2544 break ;
|
|
2545 }
|
|
2546
|
|
2547 if ( recover )
|
|
2548 {
|
|
2549 for ( j = 0 ; j < tcnt ; ++j )
|
|
2550 {
|
|
2551 if ( transcripts[j].abundance > 0 )
|
|
2552 continue ;
|
|
2553 //printf( "%d %lf\n", IsConstraintInTranscript( transcripts[j], tmpC ), transcripts[j].FPKM ) ;
|
|
2554 if ( IsConstraintInTranscript( transcripts[j], tmpC ) )
|
|
2555 {
|
|
2556 /*if ( maxTag == -1 )
|
|
2557 maxTag = j ;
|
|
2558 else
|
|
2559 {
|
|
2560 if ( txptSampleSupport[ transcripts[j].id ] > txptSampleSupport[ transcripts[maxTag ].id ] )
|
|
2561 maxTag = j ;
|
|
2562 else if ( txptSampleSupport[ transcripts[j].id ] == txptSampleSupport[ transcripts[maxTag ].id ])
|
|
2563 {
|
|
2564 if ( transcripts[j].FPKM > transcripts[maxTag].FPKM )
|
|
2565 maxTag = j ;
|
|
2566 }
|
|
2567 }*/
|
|
2568
|
|
2569 struct _pair32 np ;
|
|
2570 np.a = i ; np.b = it->first ;
|
|
2571 missingChain.push_back( np ) ;
|
|
2572
|
|
2573 if ( !used[j] )
|
|
2574 {
|
|
2575 recoverCandidate.push_back( j ) ;
|
|
2576 used[j] = true ;
|
|
2577 }
|
|
2578 }
|
|
2579 }
|
|
2580
|
|
2581 /*if ( maxTag != -1 && txptSampleSupport[ transcripts[maxTag].id ] > 1 )
|
|
2582 {
|
|
2583 //printf( "recover %d %d\n", maxTag, txptSampleSupport[ transcripts[maxTag].id ] ) ;
|
|
2584 transcripts[maxTag].abundance *= -1 ;
|
|
2585 }*/
|
|
2586 }
|
|
2587 }
|
|
2588
|
|
2589 }
|
|
2590 }
|
|
2591
|
|
2592 int size = recoverCandidate.size() ;
|
|
2593 memset( used, false, sizeof( bool ) * tcnt ) ;
|
|
2594 // Recover the candidates in the order of reliability
|
|
2595 int *geneRecoverCnt = new int[ usedGeneId - baseGeneId ] ;
|
|
2596 memset( geneRecoverCnt, 0, sizeof( int ) * ( usedGeneId - baseGeneId ) ) ;
|
|
2597 int round = 1 ;
|
|
2598 if ( aggressive && size > 1 )
|
|
2599 round = 1 ;
|
|
2600
|
|
2601 for ( i = 0 ; i < size ; ++i )
|
|
2602 {
|
|
2603 int maxTag = -1 ;
|
|
2604 int maxCover = -1 ;
|
|
2605 for ( j = 0 ; j < size ; ++j )
|
|
2606 {
|
|
2607 if ( !used[ recoverCandidate[j] ] )
|
|
2608 {
|
|
2609 /*int cover = 0 ;
|
|
2610
|
|
2611 k = missingChain.size() ;
|
|
2612 int l ;
|
|
2613 for ( l = 0 ; l < k ; ++l )
|
|
2614 {
|
|
2615 if ( missingChain[l].a == -1 )
|
|
2616 continue ;
|
|
2617
|
|
2618 tmpC.vector.Reset() ;
|
|
2619 tmpC.vector.Set( missingChain[l].a ) ;
|
|
2620 tmpC.vector.Set( missingChain[l].b ) ;
|
|
2621 tmpC.first = missingChain[l].a ;
|
|
2622 tmpC.last = missingChain[l].b ;
|
|
2623
|
|
2624 if ( IsConstraintInTranscript( transcripts[ recoverCandidate[j] ], tmpC ) )
|
|
2625 {
|
|
2626 ++cover ;
|
|
2627 }
|
|
2628 }*/
|
|
2629
|
|
2630 if ( maxTag == -1 )
|
|
2631 {
|
|
2632 maxTag = recoverCandidate[j] ;
|
|
2633 //maxCover = cover ;
|
|
2634 continue ;
|
|
2635 }
|
|
2636
|
|
2637 /*if ( cover > maxCover )
|
|
2638 {
|
|
2639 maxTag = recoverCandidate[j] ;
|
|
2640 maxCover = cover ;
|
|
2641 }
|
|
2642 else if ( cover == maxCover )
|
|
2643 {*/
|
|
2644 if ( txptSampleSupport[ transcripts[ recoverCandidate[j] ].id ] >
|
|
2645 txptSampleSupport[ transcripts[ maxTag ].id ] )
|
|
2648 maxTag = recoverCandidate[j] ;
|
|
2649 else if ( txptSampleSupport[ transcripts[ recoverCandidate[j] ].id ] ==
|
|
2650 txptSampleSupport[ transcripts[ maxTag ].id ] )
|
|
2651 {
|
|
2652 if ( transcripts[ recoverCandidate[j] ].FPKM > transcripts[ maxTag ].FPKM )
|
|
2653 maxTag = recoverCandidate[j] ;
|
|
2654 }
|
|
2655
|
|
2656 /*else if ( transcripts[ recoverCandidate[j] ].FPKM > transcripts[ maxTag ].FPKM )
|
|
2657 maxTag = recoverCandidate[j] ;
|
|
2658 else if ( transcripts[ recoverCandidate[j] ].FPKM == transcripts[ maxTag ].FPKM )
|
|
2659 {
|
|
2660 if ( txptSampleSupport[ transcripts[ recoverCandidate[j] ].id ] >
|
|
2661 txptSampleSupport[ transcripts[ maxTag ].id ] )
|
|
2662 maxTag = recoverCandidate[j] ;
|
|
2663 }*/
|
|
2664 //}
|
|
2665 }
|
|
2666 }
|
|
2667
|
|
2668 if ( maxTag == -1 || txptSampleSupport[ transcripts[ maxTag ].id ] <= 2
|
|
2669 || txptSampleSupport[ transcripts[maxTag].id ] < 0.5 * sampleCnt )
|
|
2670 break ;
|
|
2671
|
|
2672 used[maxTag] = true ;
|
|
2673 if ( geneRecoverCnt[ txptGid[maxTag] - baseGeneId ] >= round )
|
|
2674 continue ;
|
|
2675 ++geneRecoverCnt[ txptGid[maxTag] - baseGeneId ] ;
|
|
2676
|
|
2677 k = missingChain.size() ;
|
|
2678 int cnt = 0 ;
|
|
2679 for ( j = 0 ; j < k ; ++j )
|
|
2680 {
|
|
2681 if ( missingChain[j].a == -1 )
|
|
2682 continue ;
|
|
2683
|
|
2684 tmpC.vector.Reset() ;
|
|
2685 tmpC.vector.Set( missingChain[j].a ) ;
|
|
2686 tmpC.vector.Set( missingChain[j].b ) ;
|
|
2687 tmpC.first = missingChain[j].a ;
|
|
2688 tmpC.last = missingChain[j].b ;
|
|
2689
|
|
2690 if ( IsConstraintInTranscript( transcripts[maxTag], tmpC ) )
|
|
2691 {
|
|
2692 missingChain[j].a = -1 ;
|
|
2693 ++cnt ;
|
|
2694 }
|
|
2695 }
|
|
2696
|
|
2697 int len = GetTranscriptLengthFromAbundanceAndFPKM( transcripts[maxTag].abundance, transcripts[maxTag].FPKM ) ;
|
|
2698 double cov = ( transcripts[maxTag].abundance * alignments.readLen ) / len ;
|
|
2699 if ( cnt >= 1 && cov > 1.0 )
|
|
2700 {
|
|
2701 transcripts[maxTag].abundance *= -1 ;
|
|
2702 }
|
|
2703 }
|
|
2704 delete[] used ;
|
|
2705 delete[] geneRecoverCnt ;
|
|
2706 tmpC.vector.Release() ;
|
|
2707
|
|
2708
|
|
2709 tcnt = RemoveNegativeAbundTranscripts( transcripts ) ;
|
|
2710
|
|
2711
|
|
2712
|
|
2713 delete []geneMaxCov ;
|
|
2714 bufferTable.Release() ;
|
|
2715 delete []geneMaxFPKM ;
|
|
2716 delete []geneMaxFPKMTag ;
|
|
2717 delete []nonOverlapMaxFPKM ;
|
|
2718 delete []txptGid ;
|
|
2719
|
|
2720 /*==================================================================
|
|
2721 Remove transcripts that seem duplicated
|
|
2722 ====================================================================*/
|
|
2723 for ( i = 0 ; i < tcnt ; ++i )
|
|
2724 {
|
|
2725 int support = 0 ;
|
|
2726 int uniqSupport = 0 ;
|
|
2727
|
|
2728 for ( j = 0 ; j < tcCnt ; ++j )
|
|
2729 {
|
|
2730 if ( !IsConstraintInTranscript( transcripts[i], scc[ tc[j].i ] ) || !IsConstraintInTranscript( transcripts[i], scc[ tc[j].j ] ) )
|
|
2731 continue ;
|
|
2732 //support += scc[ tc[j].i ].support + scc[ tc[j].j ].support ;
|
|
2733 //uniqSupport += scc[ tc[j].i ].uniqSupport + scc[ tc[j].j ].uniqSupport ;
|
|
2734 support += tc[j].support ;
|
|
2735 uniqSupport += tc[j].uniqSupport ;
|
|
2736
|
|
2737 //printf( "constraint uniqness: %d: %d %d\n", i, tc[j].uniqSupport, tc[j].support ) ;
|
|
2738 }
|
|
2739 //printf( "%d: %d %d\n", i, uniqSupport, support ) ;
|
|
2740 if ( (double)uniqSupport < 0.03 * support )
|
|
2741 transcripts[i].abundance = -1 ;
|
|
2742 }
|
|
2743 tcnt = RemoveNegativeAbundTranscripts( transcripts ) ;
|
|
2744
|
|
2745
|
|
2746 /*==================================================================
|
|
2747 Remove shadow transcripts: abnormal 2-exon transcripts whose intron is very close to the true one, or where one of the anchor exons is shorter than 25bp.
|
|
2748 ====================================================================*/
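// A single-intron transcript is treated as a shadow and removed when both
// sides flanking its intron on the dominant strand have a short (<= 20bp)
// anchor subexon, when one of its two anchor exons is 25bp or shorter, or when
// the reads spanning the intron are mostly multi-mapped or below the minimum
// read depth.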
|
|
2749 int minusCnt = 0, plusCnt = 0 ;
|
|
2750 int mainStrand ;
|
|
2751 for ( i = 0 ; i < seCnt ; ++i )
|
|
2752 {
|
|
2753 if ( subexons[i].rightStrand == 1 )
|
|
2754 ++plusCnt ;
|
|
2755 else if ( subexons[i].rightStrand == -1 )
|
|
2756 ++minusCnt ;
|
|
2757 }
|
|
2758 if ( plusCnt > minusCnt )
|
|
2759 mainStrand = 1 ;
|
|
2760 else
|
|
2761 mainStrand = -1 ;
|
|
2762
|
|
2763 for ( i = 0 ; i < tcnt ; ++i )
|
|
2764 {
|
|
2765 std::vector<int> subexonIdx ;
|
|
2766 transcripts[i].seVector.GetOnesIndices( subexonIdx ) ;
|
|
2767 int size = subexonIdx.size() ;
|
|
2768 int intronCnt = 0 ;
|
|
2769 int anchorIdx = 0 ; // the subexon adjacent to the only intron.
|
|
2770
|
|
2771 for ( j = 0 ; j < size - 1 ; ++j )
|
|
2772 {
|
|
2773 if ( subexons[ subexonIdx[j] ].end + 1 < subexons[ subexonIdx[j + 1] ].start )
|
|
2774 {
|
|
2775 ++intronCnt ;
|
|
2776 anchorIdx = j ;
|
|
2777 }
|
|
2778 }
|
|
2779 if ( intronCnt != 1 )
|
|
2780 continue ;
|
|
2781
|
|
2782 int anchorExonLength[2] = {0, 0};
|
|
2783 int tag = 0 ;
|
|
2784 for ( j = 0 ; j < size ; ++j )
|
|
2785 {
|
|
2786 anchorExonLength[tag] += subexons[ subexonIdx[j] ].end - subexons[ subexonIdx[j] ].start + 1 ;
|
|
2787 if ( tag == 0 && subexons[ subexonIdx[j] ].end + 1 < subexons[ subexonIdx[j + 1] ].start )
|
|
2788 ++tag ;
|
|
2789 }
|
|
2790
|
|
2791 int flag = 0 ;
|
|
2792 if ( subexons[ subexonIdx[anchorIdx] ].rightStrand == mainStrand )
|
|
2793 {
|
|
2794 j = subexonIdx[ anchorIdx ] ;
|
|
2795 if ( subexons[j].end - subexons[j].start + 1 <= 20 ||
|
|
2796 ( subexons[j+ 1].start == subexons[j].end + 1 && subexons[j + 1].end - subexons[j + 1].start + 1 <= 20
|
|
2797 && subexons[j + 1].rightStrand == mainStrand ) )
|
|
2798 ++flag ;
|
|
2799 j = subexonIdx[ anchorIdx + 1 ] ;
|
|
2800 if ( subexons[j].end - subexons[j].start + 1 <= 20 ||
|
|
2801 ( subexons[j].start == subexons[j - 1].end + 1 && subexons[j - 1].end - subexons[j - 1].start + 1 <= 20
|
|
2802 && subexons[j - 1].leftStrand == mainStrand ) )
|
|
2803 ++flag ;
|
|
2804 }
|
|
2805
|
|
2806 if ( anchorExonLength[0] <= 25 || anchorExonLength[1] <= 25 )
|
|
2807 flag = 2 ;
|
|
2808
|
|
2809 // The alignments supporting the intron must be mostly unique and have enough support.
|
|
2810 int support = 0 ;
|
|
2811 int uniqSupport = 0 ;
|
|
2812 for ( j = 0 ; j < tcCnt ; ++j )
|
|
2813 {
|
|
2814 if ( !IsConstraintInTranscript( transcripts[i], scc[ tc[j].i ] ) || !IsConstraintInTranscript( transcripts[i], scc[ tc[j].j ] ) )
|
|
2815 continue ;
|
|
2816 if ( ( scc[ tc[j].i ].vector.Test( subexonIdx[ anchorIdx ] ) && scc[ tc[j].i ].vector.Test( subexonIdx[ anchorIdx + 1 ] ) )
|
|
2817 || ( scc[ tc[j].j ].vector.Test( subexonIdx[ anchorIdx ] ) && scc[ tc[j].j ].vector.Test( subexonIdx[ anchorIdx + 1 ] ) ) )
|
|
2818 {
|
|
2819 support += tc[j].support ;
|
|
2820 uniqSupport += tc[j].uniqSupport ;
|
|
2821 }
|
|
2822
|
|
2823 }
|
|
2824
|
|
2825 if ( (double)uniqSupport < 0.3 * support || support < txptMinReadDepth )
|
|
2826 {
|
|
2827 flag = 2 ;
|
|
2828 }
|
|
2829
|
|
2830 if ( flag == 2 )
|
|
2831 transcripts[i].abundance = -1 ;
|
|
2832
|
|
2833 }
|
|
2834 tcnt = RemoveNegativeAbundTranscripts( transcripts ) ;
|
|
2835
|
|
2836 return transcripts.size() ;
|
|
2837 }
|
|
2838
|
|
2839 void TranscriptDecider::ComputeTranscriptsScore( struct _subexon *subexons, int seCnt, std::map<int, int> *subexonChainSupport, std::vector<struct _transcript> &transcripts )
|
|
2840 {
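// The correlationScore of a transcript counts how many subexon chains with
// sufficient cross-sample support are contained in that transcript.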
|
|
2841 int i, j ;
|
|
2842 int tcnt = transcripts.size() ;
|
|
2843 struct _constraint tmpC ;
|
|
2844 tmpC.vector.Init( seCnt ) ;
|
|
2845
|
|
2846 for ( i = 0 ; i < tcnt ; ++i )
|
|
2847 transcripts[i].correlationScore = 0 ;
|
|
2848
|
|
2849 for ( i = 0 ; i < seCnt ; ++i )
|
|
2850 {
|
|
2851 for ( std::map<int, int>::iterator it = subexonChainSupport[i].begin() ;
|
|
2852 it != subexonChainSupport[i].end() ; ++it )
|
|
2853 {
|
|
2854 if ( sampleCnt >= 0 && ( it->second < 3 || it->second < (int)( 0.1 * sampleCnt ) ) && it->second <= sampleCnt / 2 )
|
|
2855 continue ;
|
|
2856
|
|
2857 tmpC.vector.Reset() ;
|
|
2858 tmpC.vector.Set( i ) ;
|
|
2859 tmpC.vector.Set( it->first ) ;
|
|
2860 tmpC.first = i ;
|
|
2861 tmpC.last = it->first ;
|
|
2862
|
|
2863 for ( j = 0 ; j < tcnt ; ++j )
|
|
2864 {
|
|
2865 if ( IsConstraintInTranscript( transcripts[j], tmpC ) )
|
|
2866 ++transcripts[j].correlationScore ;
|
|
2867 }
|
|
2868 }
|
|
2869 }
|
|
2870
|
|
2871 tmpC.vector.Release() ;
|
|
2872 }
|
|
2873
|
|
2874 int TranscriptDecider::Solve( struct _subexon *subexons, int seCnt, std::vector<Constraints> &constraints, SubexonCorrelation &subexonCorrelation )
|
|
2875 {
|
|
2876 int i, j, k ;
|
|
2877 int cnt = 0 ;
|
|
2878 int *f = new int[seCnt] ; // a general-purpose buffer reused for different computations below.
|
|
2879 bool useDP = false ;
|
|
2880
|
|
2881 compatibleTestVectorT.Init( seCnt ) ; // the bit table used in the compatibility test function.
|
|
2882 compatibleTestVectorC.Init( seCnt ) ;
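// Decide which subexons may start or end a transcript: hard boundaries come
// from having no predecessor/successor, soft boundaries from a low left/right
// classifier value on a stranded subexon, with duplicates within a contiguous
// region of subexons suppressed.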
|
|
2883
|
|
2884 for ( i = 0 ; i < seCnt ; ++i )
|
|
2885 {
|
|
2886 subexons[i].canBeStart = subexons[i].canBeEnd = false ;
|
|
2887
|
|
2888 if ( subexons[i].prevCnt == 0 )
|
|
2889 subexons[i].canBeStart = true ;
|
|
2890 else if ( subexons[i].leftClassifier < canBeSoftBoundaryThreshold && subexons[i].leftClassifier != -1
|
|
2891 && subexons[i].leftStrand != 0 ) // The case of overhang.
|
|
2892 {
|
|
2893 // We then look into whether a left-side end already showed up before this subexon in this region of subexons.
|
|
2894 bool flag = true ;
|
|
2895 for ( j = i - 1 ; j >= 0 ; --j )
|
|
2896 {
|
|
2897 if ( subexons[j].end + 1 != subexons[j + 1].start )
|
|
2898 break ;
|
|
2899 if ( subexons[i].canBeStart == true )
|
|
2900 {
|
|
2901 flag = false ;
|
|
2902 break ;
|
|
2903 }
|
|
2904 }
|
|
2905 subexons[i].canBeStart = flag ;
|
|
2906 }
|
|
2907
|
|
2908 if ( subexons[i].nextCnt == 0 )
|
|
2909 subexons[i].canBeEnd = true ;
|
|
2910 else if ( subexons[i].rightClassifier < canBeSoftBoundaryThreshold && subexons[i].rightClassifier != -1
|
|
2911 && subexons[i].rightStrand != 0 )
|
|
2912 {
|
|
2913 subexons[i].canBeEnd = true ;
|
|
2914 }
|
|
2915 // Remove other soft ends that already showed up in this region of subexons.
|
|
2916 if ( subexons[i].canBeEnd == true )
|
|
2917 {
|
|
2918 for ( j = i - 1 ; j >= 0 ; --j )
|
|
2919 {
|
|
2920 if ( subexons[j].end + 1 != subexons[j + 1].start )
|
|
2921 break ;
|
|
2922 if ( subexons[j].canBeEnd == true )
|
|
2923 {
|
|
2924 subexons[j].canBeEnd = false ;
|
|
2925 break ;
|
|
2926 }
|
|
2927 }
|
|
2928 }
|
|
2929 //printf( "%d: %d %lf\n", subexons[i].canBeStart, subexons[i].prevCnt, subexons[i].leftClassifier ) ;
|
|
2930 }
|
|
2931
|
|
2932 // Go through the cases of mixture region to set canBeStart/End.
|
|
2933 // e.g: +[...]+_____+[....]-...]+____+[..)_____-[...]-
|
|
2934 // ^ then we need to force a start point here.
|
|
2935 // Do we need to associate a strand information with canBeStart, canBeEnd?
|
|
2936 for ( i = 0 ; i < seCnt ; )
|
|
2937 {
|
|
2938 // [i, j) is a region.
|
|
2939 for ( j = i + 1 ; j < seCnt ; ++j )
|
|
2940 if ( subexons[j].start > subexons[j - 1].end + 1 )
|
|
2941 break ;
|
|
2942 if ( subexons[i].canBeStart == false ) // then subexons[i] must have a hard left boundary.
|
|
2943 {
|
|
2944 int leftStrandCnt[2] = {0, 0} ;
|
|
2945 for ( k = i ; k < j ; ++k )
|
|
2946 {
|
|
2947 if ( !SubexonGraph::IsSameStrand( subexons[k].rightStrand, subexons[i].leftStrand ) )
|
|
2948 break ;
|
|
2949 if ( subexons[k].leftStrand != 0 )
|
|
2950 ++leftStrandCnt[ ( subexons[k].leftStrand + 1 ) / 2 ] ;
|
|
2951 }
|
|
2952 if ( k < j && leftStrandCnt[ ( subexons[k].rightStrand + 1 ) / 2 ] == 0 )
|
|
2953 subexons[i].canBeStart = true ;
|
|
2954 }
|
|
2955
|
|
2956 if ( subexons[j - 1].canBeEnd == false )
|
|
2957 {
|
|
2958 int rightStrandCnt[2] = {0, 0} ;
|
|
2959 for ( k = j - 1 ; k >= i ; --k )
|
|
2960 {
|
|
2961 if ( !SubexonGraph::IsSameStrand( subexons[k].leftStrand, subexons[j - 1].rightStrand ) )
|
|
2962 break ;
|
|
2963 if ( subexons[k].rightStrand != 0 )
|
|
2964 ++rightStrandCnt[ ( subexons[k].rightStrand + 1 ) / 2 ] ;
|
|
2965 }
|
|
2966 if ( k >= i && rightStrandCnt[ ( subexons[k].leftStrand + 1 ) / 2 ] == 0 )
|
|
2967 subexons[j - 1].canBeEnd = true ;
|
|
2968 }
|
|
2969
|
|
2970 //if ( subexons[i].start == 6870264)
|
|
2971 // printf( "hi %d %d\n",i , subexons[i].canBeStart ) ;
|
|
2972 i = j ;
|
|
2973 }
|
|
2974 /*for ( i = 0 ; i < seCnt ; ++i )
|
|
2975 {
|
|
2976 printf( "%d %d: %d %d\n", subexons[i].start, subexons[i].end, subexons[i].canBeStart, subexons[i].canBeEnd ) ;
|
|
2977 }*/
|
|
2978
|
|
2979 // Find the gene ids.
|
|
2980 baseGeneId = subexons[0].lcCnt ;
|
|
2981 usedGeneId = subexons[0].rcCnt ;
|
|
2982 defaultGeneId[0] = defaultGeneId[1] = -1 ;
|
|
2983 for ( i = 0 ; i < seCnt ; ++i )
|
|
2984 {
|
|
2985 if ( subexons[i].geneId < 0 )
|
|
2986 continue ;
|
|
2987
|
|
2988 //if ( baseGeneId == -1 || subexons[i].geneId < baseGeneId )
|
|
2989 // baseGeneId = subexons[i].geneId ;
|
|
2990 //if ( subexons[i].geneId > usedGeneId )
|
|
2991 // usedGeneId = subexons[i].geneId ;
|
|
2992
|
|
2993 if ( ( subexons[i].rightStrand == -1 || subexons[i].leftStrand == -1 ) &&
|
|
2994 ( defaultGeneId[0] == -1 || subexons[i].geneId < defaultGeneId[0] ) )
|
|
2995 defaultGeneId[0] = subexons[i].geneId ;
|
|
2996 if ( ( subexons[i].rightStrand == 1 || subexons[i].leftStrand == 1 ) &&
|
|
2997 ( defaultGeneId[1] == -1 || subexons[i].geneId < defaultGeneId[1] ) )
|
|
2998 defaultGeneId[1] = subexons[i].geneId ;
|
|
2999 }
|
|
3000 if ( defaultGeneId[0] == -1 )
|
|
3001 defaultGeneId[0] = baseGeneId ;
|
|
3002 if ( defaultGeneId[1] == -1 )
|
|
3003 defaultGeneId[1] = usedGeneId - 1 ;
|
|
3004
|
|
3005 // Go through the constraints to find the chains of subexons that should be kept.
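// For every sample, collect the intron-spanning subexon pairs bridged by
// constraints with mostly unique support; after deduplication,
// subexonChainSupport[a][b] counts how many samples contain the chain (a, b).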
|
|
3006 std::map<int, int> *subexonChainSupport = new std::map<int, int>[ seCnt ] ;
|
|
3007 for ( i = 0 ; i < sampleCnt ; ++i )
|
|
3008 {
|
|
3009 std::vector<int> subexonIdx ;
|
|
3010 std::vector<struct _pair32> chain ;
|
|
3011
|
|
3012 int tcCnt = constraints[i].constraints.size() ;
|
|
3013 int size ;
|
|
3014 for ( j = 0 ; j < tcCnt ; ++j )
|
|
3015 {
|
|
3016 struct _constraint c = constraints[i].constraints[j] ;
|
|
3017 if ( c.uniqSupport < 0.95 * c.support || c.support < 3 )
|
|
3018 continue ;
|
|
3019
|
|
3020 subexonIdx.clear() ;
|
|
3021 c.vector.GetOnesIndices( subexonIdx ) ;
|
|
3022 size = subexonIdx.size() ;
|
|
3023
|
|
3024 for ( k = 0 ; k < size - 1 ; ++k )
|
|
3025 {
|
|
3026 struct _pair32 p ;
|
|
3027
|
|
3028 p.a = subexonIdx[k] ;
|
|
3029 p.b = subexonIdx[k + 1] ;
|
|
3030 //if ( subexons[p.a].end + 1 == 113235898 && subexons[ p.b ].start + 1 == 113236121 )
|
|
3031 // printf( "bad bad %d %d %d\n", i, c.uniqSupport, c.support ) ;
|
|
3032
|
|
3033 if ( subexons[ p.a ].end + 1 < subexons[ p.b ].start )
|
|
3034 chain.push_back( p ) ;
|
|
3035 }
|
|
3036 }
|
|
3037 // Remove redundancy.
|
|
3038 sort( chain.begin(), chain.end(), CompSortPairs ) ;
|
|
3039 size = chain.size() ;
|
|
3040 k = 0 ;
|
|
3041 for ( j = 1 ; j < size ; ++j )
|
|
3042 {
|
|
3043 if ( chain[j].a == chain[k].a && chain[j].b == chain[k].b )
|
|
3044 continue ;
|
|
3045 else
|
|
3046 {
|
|
3047 ++k ;
|
|
3048 chain[k] = chain[j] ;
|
|
3049 }
|
|
3050 }
|
|
3051 chain.resize( k + 1 ) ;
|
|
3052
|
|
3053 // Add these chains to the sample support count.
|
|
3054 size = k + 1 ;
|
|
3055 for ( j = 0 ; j < size ; ++j )
|
|
3056 {
|
|
3057 if ( subexonChainSupport[ chain[j].a ].count( chain[j].b ) )
|
|
3058 {
|
|
3059 ++subexonChainSupport[ chain[j].a ][ chain[j].b ] ;
|
|
3060 }
|
|
3061 else
|
|
3062 subexonChainSupport[ chain[j].a ][ chain[j].b ] = 1 ;
|
|
3063 }
|
|
3064 }
|
|
3065
|
|
3066 /*for ( i = 0 ; i < seCnt ; ++i )
|
|
3067 {
|
|
3068 printf( "%d:", i ) ;
|
|
3069 for ( std::map<int, int>::iterator it = subexonChainSupport[i].begin() ; it != subexonChainSupport[i].end() ; ++it )
|
|
3070 printf( " (%d %d) ", it->first, it->second ) ;
|
|
3071 printf( "\n" ) ;
|
|
3072 }*/
|
|
3073
|
|
3074 //printf( "%d %d %d\n", defaultGeneId[0], baseGeneId, usedGeneId ) ;
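// Count the candidate transcripts reachable from every possible start subexon;
// switch to the DP-based selection when this count, or any per-subexon
// subcount, exceeds USE_DP, or when a sample's mate-pair constraints are
// numerous enough to make full enumeration too costly.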
|
|
3075 cnt = 0 ;
|
|
3076 memset( f, -1, sizeof( int ) * seCnt ) ;
|
|
3077 for ( i = 0 ; i < seCnt ; ++i )
|
|
3078 {
|
|
3079 if ( subexons[i].canBeStart )
|
|
3080 {
|
|
3081 cnt += SubTranscriptCount( i, subexons, f ) ;
|
|
3082 }
|
|
3083 }
|
|
3084 if ( cnt <= USE_DP )
|
|
3085 {
|
|
3086 for ( i = 0 ; i < seCnt ; ++i )
|
|
3087 if ( f[i] > USE_DP )
|
|
3088 {
|
|
3089 useDP = true ;
|
|
3090 break ;
|
|
3091 }
|
|
3092 }
|
|
3093 else
|
|
3094 useDP = true ;
|
|
3095 if ( !useDP )
|
|
3096 {
|
|
3097 for ( i = 0 ; i < sampleCnt ; ++i )
|
|
3098 {
|
|
3099 double msize = constraints[i].matePairs.size() ;
|
|
3100 double csize = constraints[i].constraints.size() ;
|
|
3101 if ( cnt > ( csize / msize ) * ( csize / msize ) * seCnt
|
|
3102 && cnt > USE_DP / ( msize * msize ) && cnt > 50 )
|
|
3103 {
|
|
3104 useDP = true ;
|
|
3105 break ;
|
|
3106 }
|
|
3107 }
|
|
3108 }
|
|
3109
|
|
3110 int atCnt = cnt ;
|
|
3111 printf( "%d: atCnt=%d seCnt=%d %d %d %d\n", subexons[0].start + 1, atCnt, seCnt, useDP, (int)constraints[0].constraints.size(), (int)constraints[0].matePairs.size() ) ;
|
|
3112 fflush( stdout ) ;
|
|
3113 std::vector<struct _transcript> alltranscripts ;
|
|
3114
|
|
3115 if ( !useDP )
|
|
3116 {
|
|
3117 int origSize = atCnt ;
|
|
3118 alltranscripts.resize( atCnt ) ;
|
|
3119 for ( i = 0 ; i < atCnt ; ++i )
|
|
3120 {
|
|
3121 alltranscripts[i].seVector.Init( seCnt ) ;
|
|
3122 alltranscripts[i].correlationScore = 1 ;
|
|
3123 }
|
|
3124
|
|
3125 atCnt = 0 ;
|
|
3126 for ( i = 0 ; i < seCnt ; ++i )
|
|
3127 {
|
|
3128 if ( subexons[i].canBeStart )
|
|
3129 EnumerateTranscript( i, 0, f, 0, subexons, subexonCorrelation, 1, alltranscripts, atCnt ) ;
|
|
3130 }
|
|
3131
|
|
3132 for ( i = atCnt ; i < origSize ; ++i )
|
|
3133 alltranscripts[i].seVector.Release() ;
|
|
3134
|
|
3135 alltranscripts.resize( atCnt ) ;
|
|
3136 //printf( "transcript cnt: %d\n", atCnt ) ;
|
|
3137 //printf( "%d %d\n", alltranscripts[0].seVector.Test( 1 ), constraints[0].matePairs.size() ) ;
|
|
3138 }
|
|
3139 else // Use dynamic programming to pick a set of candidate transcripts.
|
|
3140 {
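// For complex regions, candidate transcripts are picked per sample with a
// dynamic program (PickTranscriptsByDP), going from the most to the least
// complex sample and capping the iteration count, and truncating or
// downsampling the constraints, for the later samples to bound the running time.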
|
|
3141 std::vector<struct _transcript> sampleTranscripts ;
|
|
3142
|
|
3143 // Pre-allocate the memory.
|
|
3144 struct _dpAttribute attr ;
|
|
3145 attr.f1 = new struct _dp[seCnt] ;
|
|
3146 if ( seCnt <= 10000 )
|
|
3147 {
|
|
3148 attr.f2 = new struct _dp*[seCnt] ;
|
|
3149 for ( i = 0 ; i < seCnt ; ++i )
|
|
3150 attr.f2[i] = new struct _dp[seCnt] ;
|
|
3151 }
|
|
3152 else
|
|
3153 attr.f2 = NULL ;
|
|
3154
|
|
3155 hashMax = HASH_MAX ;
|
|
3156 if (seCnt > 1500)
|
|
3157 hashMax = 20000003 ;
|
|
3158 else if (seCnt > 1000)
|
|
3159 hashMax = 10000019 ;
|
|
3160 else if (seCnt > 500)
|
|
3161 hashMax = 1000003 ;
|
|
3162
|
|
3163 attr.hash = dpHash ;
|
|
3164 if ( hashMax != HASH_MAX )
|
|
3165 attr.hash = new struct _dp[hashMax] ;
|
|
3166
|
|
3167 for ( i = 0 ; i < seCnt ; ++i )
|
|
3168 {
|
|
3169 attr.f1[i].seVector.Nullify() ;
|
|
3170 attr.f1[i].seVector.Init( seCnt ) ;
|
|
3171 for ( j = i ; j < seCnt ; ++j )
|
|
3172 {
|
|
3173 attr.f2[i][j].seVector.Nullify() ;
|
|
3174 attr.f2[i][j].seVector.Init( seCnt ) ;
|
|
3175 }
|
|
3176 }
|
|
3177 for ( i = 0 ; i < hashMax ; ++i )
|
|
3178 {
|
|
3179 attr.hash[i].seVector.Nullify() ;
|
|
3180 attr.hash[i].seVector.Init( seCnt ) ;
|
|
3181 }
|
|
3182
|
|
3183 // select candidate transcripts from each sample.
|
|
3184 struct _pair32 *sampleComplexity = new struct _pair32[ sampleCnt ] ;
|
|
3185 for ( i = 0 ; i < sampleCnt ; ++i )
|
|
3186 {
|
|
3187 sampleComplexity[i].a = i ;
|
|
3188 sampleComplexity[i].b = constraints[i].constraints.size() ;
|
|
3189 }
|
|
3190 qsort( sampleComplexity, sampleCnt, sizeof( sampleComplexity[0] ), CompPairsByB ) ;
|
|
3191 int downsampleCnt = -1 ;
|
|
3192
|
|
3193 for ( i = sampleCnt - 1 ; i >= 0 ; --i )
|
|
3194 {
|
|
3195 sampleTranscripts.clear() ;
|
|
3196 int iterBound = constraints[ sampleComplexity[i].a ].constraints.size() ;
|
|
3197 if ( i < sampleCnt - 1 )
|
|
3198 iterBound = 100 ;
|
|
3199
|
|
3200 if ( i < sampleCnt - 10 && alltranscripts.size() > 1000 )
|
|
3201 iterBound = 10 ;
|
|
3202 //printf( "%d %d: %d %d %d %d\n", subexons[0].start + 1, sampleComplexity[i].a, constraints[ sampleComplexity[i].a ].constraints.size(), constraints[ sampleComplexity[i].a ].matePairs.size(),
|
|
3203 // alltranscripts.size(), iterBound ) ; fflush( stdout ) ;
|
|
			if ( maxDpConstraintSize > 0 )
			{
				Constraints truncatedConstraints ;
				truncatedConstraints.TruncateConstraintsCoverFrom( constraints[ sampleComplexity[i].a ], seCnt, maxDpConstraintSize ) ;
				PickTranscriptsByDP( subexons, seCnt, iterBound, truncatedConstraints,
						subexonCorrelation, attr, sampleTranscripts ) ;
			}
			else if ( ( constraints[ sampleComplexity[i].a ].constraints.size() > 1000
						&& constraints[ sampleComplexity[i].a ].constraints.size() * 10 < constraints[ sampleComplexity[i].a ].matePairs.size() )
					|| ( downsampleCnt > 0 && (int)constraints[ sampleComplexity[i].a ].constraints.size() >= downsampleCnt )
					|| seCnt >= 1500 )
			{
				Constraints downsampledConstraints ;
				int stride = (int)constraints[ sampleComplexity[i].a ].matePairs.size() / (int)constraints[ sampleComplexity[i].a ].constraints.size() ;
				if ( downsampleCnt > 0 )
					stride = (int)constraints[ sampleComplexity[i].a ].constraints.size() / downsampleCnt ;
				if ( stride < 1 )
					stride = 1 ;
				downsampledConstraints.DownsampleConstraintsFrom( constraints[ sampleComplexity[i].a ], stride ) ;
				if ( downsampleCnt <= 0 )
					downsampleCnt = downsampledConstraints.constraints.size() ;
				if ( iterBound <= 10 )
					continue ;
				PickTranscriptsByDP( subexons, seCnt, iterBound, downsampledConstraints, subexonCorrelation, attr, sampleTranscripts ) ;
			}
			else
			{
				PickTranscriptsByDP( subexons, seCnt, iterBound, constraints[ sampleComplexity[i].a ], subexonCorrelation, attr, sampleTranscripts ) ;
			}
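			// Fold this sample's picks into the shared candidate pool, collapse duplicates,
			// and let AugmentTranscripts extend the pool (the 1000 below looks like a cap on
			// how many augmented candidates are generated) before moving to the next sample.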
			int size = sampleTranscripts.size() ;
			for ( j = 0 ; j < size ; ++j )
				alltranscripts.push_back( sampleTranscripts[j] ) ;

			// We can further pick a smaller subset of transcripts here if the number is still too big.
			CoalesceSameTranscripts( alltranscripts ) ;

			AugmentTranscripts( subexons, alltranscripts, 1000, false ) ;
		}

		// release the memory.
		delete[] sampleComplexity ;
		for ( i = 0 ; i < seCnt ; ++i )
		{
			attr.f1[i].seVector.Release() ;
			for ( j = i ; j < seCnt && attr.f2 ; ++j )
				attr.f2[i][j].seVector.Release() ;
		}
		for ( i = 0 ; i < hashMax ; ++i )
			attr.hash[i].seVector.Release() ;

		delete[] attr.f1 ;
		for ( i = 0 ; i < seCnt && attr.f2 ; ++i )
			delete[] attr.f2[i] ;
		delete[] attr.f2 ;
		if ( hashMax != HASH_MAX )
			delete[] attr.hash ;

	}

	transcriptId = new int[usedGeneId - baseGeneId] ;
	std::vector<struct _transcript> *predTranscripts = new std::vector<struct _transcript>[sampleCnt] ;

	atCnt = alltranscripts.size() ;
	for ( i = 0 ; i < atCnt ; ++i )
		alltranscripts[i].FPKM = 0 ;

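	// First pass: pick a transcript set for every sample independently from the shared
	// candidate pool; these per-sample picks are then used to count cross-sample support.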
	for ( i = 0 ; i < sampleCnt ; ++i )
	{
		int size = alltranscripts.size() ;
		for ( j = 0 ; j < size ; ++j )
			alltranscripts[j].abundance = -1 ;
		//printf( "pick: %d: %d %d\n", i, constraints[i].matePairs.size(), alltranscripts.size() ) ;
		PickTranscripts( subexons, alltranscripts, constraints[i], subexonCorrelation, predTranscripts[i] ) ;

		/*double tmp = FPKMFraction ;
		FPKMFraction = 0 ;
		size = predTranscripts.size() ;
		for ( j = 0 ; j < size ; ++j )
		{
			ConvertTranscriptAbundanceToFPKM( subexons, predTranscripts[j] ) ;
		}
		RefineTranscripts( subexons, seCnt, predTranscripts, constraints[i] ) ;
		FPKMFraction = tmp ;*/

	}

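	// Count, for every candidate, how many samples selected it; the count is mirrored in
	// the FPKM field, which is reused here as a cross-sample support counter.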
	atCnt = alltranscripts.size() ;
	int *txptSampleSupport = new int[atCnt] ;
	memset( txptSampleSupport, 0, sizeof( int ) * atCnt ) ;
	for ( i = 0 ; i < sampleCnt ; ++i )
	{
		int size = predTranscripts[i].size() ;
		for ( j = 0 ; j < size ; ++j )
		{
			++txptSampleSupport[ predTranscripts[i][j].id ] ;
			++alltranscripts[ predTranscripts[i][j].id ].FPKM ;
		}
	}

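	// Second pass: discard the first-pass picks and re-run PickTranscripts for every sample,
	// presumably so the selection can now favor candidates supported by multiple samples.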
	for ( i = 0 ; i < sampleCnt ; ++i )
	{
		int size = alltranscripts.size() ;
		for ( j = 0 ; j < size ; ++j )
			alltranscripts[j].abundance = -1 ;
		//printf( "pick: %d: %d %d\n", i, constraints[i].matePairs.size(), alltranscripts.size() ) ;

		size = predTranscripts[i].size() ;
		for ( j = 0 ; j < size ; ++j )
		{
			predTranscripts[i][j].seVector.Release() ;
		}
		predTranscripts[i].clear() ;
		PickTranscripts( subexons, alltranscripts, constraints[i], subexonCorrelation, predTranscripts[i] ) ;
	}

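	// Remember the pre-filtration transcript ids and abundances of every sample, so that
	// transcripts dropped by the filters can be rescued later if enough samples keep them.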
	std::vector<int> *rawPredTranscriptIds = new std::vector<int>[sampleCnt] ;
	std::vector<double> *rawPredTranscriptAbundance = new std::vector<double>[sampleCnt] ;
	for ( i = 0 ; i < sampleCnt ; ++i )
	{
		int size = predTranscripts[i].size() ;

		for ( j = 0 ; j < size ; ++j )
		{
			rawPredTranscriptIds[i].push_back( predTranscripts[i][j].id ) ;
			rawPredTranscriptAbundance[i].push_back( predTranscripts[i][j].abundance ) ;
		}
	}

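	// Filtration runs in two rounds per sample: convert abundances to FPKM, refine, then
	// re-estimate abundances on the surviving set and refine once more (the false/true flag
	// passed to RefineTranscripts appears to mark the final round).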
	// Do the filtration.
	for ( i = 0 ; i < sampleCnt ; ++i )
	{
		int size = predTranscripts[i].size() ;
		for ( j = 0 ; j < size ; ++j )
		{
			ConvertTranscriptAbundanceToFPKM( subexons, predTranscripts[i][j] ) ;
		}
		size = RefineTranscripts( subexons, seCnt, false, subexonChainSupport, txptSampleSupport, predTranscripts[i], constraints[i] ) ;

		// Recompute the abundance.
		AbundanceEstimation( subexons, seCnt, constraints[i], predTranscripts[i] ) ;
		for ( j = 0 ; j < size ; ++j )
			ConvertTranscriptAbundanceToFPKM( subexons, predTranscripts[i][j] ) ;
		size = RefineTranscripts( subexons, seCnt, true, subexonChainSupport, txptSampleSupport, predTranscripts[i], constraints[i] ) ;

		//ComputeTranscriptsScore( subexons, seCnt, subexonChainSupport, predTranscripts[i] ) ;
	}

	// Rescue some filtered transcripts
	memset( txptSampleSupport, 0, sizeof( int ) * atCnt ) ;
	for ( i = 0 ; i < sampleCnt ; ++i )
	{
		int size = predTranscripts[i].size() ;
		for ( j = 0 ; j < size ; ++j )
		{
			++txptSampleSupport[ predTranscripts[i][j].id ] ;
		}
	}

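	// A transcript that was filtered out of one sample is added back when it survived
	// filtration in at least 3 samples and in at least a quarter of all samples; its
	// pre-filtration abundance is restored and the abundances are re-estimated.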
	bool *predicted = new bool[atCnt] ;
	for ( i = 0 ; i < sampleCnt ; ++i )
	{
		memset( predicted, false, sizeof( bool ) * atCnt ) ;
		if ( predTranscripts[i].size() != rawPredTranscriptIds[i].size() )
		{
			int psize = predTranscripts[i].size() ;
			int rsize = rawPredTranscriptIds[i].size() ;
			int tcCnt = constraints[i].matePairs.size() ;

			for ( j = 0 ; j < psize ; ++j )
				predicted[ predTranscripts[i][j].id ] = true ;

			for ( j = 0 ; j < rsize ; ++j )
			{
				int id = rawPredTranscriptIds[i][j] ;
				if ( predicted[ id ] == false &&
					( txptSampleSupport[ id ] >= 3 && txptSampleSupport[ id ] >= 0.25 * sampleCnt ) )
				{
					struct _transcript nt = alltranscripts[id] ;
					nt.seVector.Nullify() ;
					nt.seVector.Duplicate( alltranscripts[id].seVector ) ;
					nt.constraintsSupport = NULL ;
					nt.correlationScore = -1 ;
					nt.abundance = rawPredTranscriptAbundance[i][j] ;
					nt.id = id ;
					predTranscripts[i].push_back( nt ) ;
				}
			}
			if ( psize != predTranscripts[i].size() )
				AbundanceEstimation( subexons, seCnt, constraints[i], predTranscripts[i] ) ;
		}

		int size = predTranscripts[i].size() ;

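		// Disabled code path ( if ( 0 ) ): it re-estimated abundances and dropped transcripts
		// whose per-base read depth ( abundance * readLen / length ) fell below 2.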
		if ( 0 ) //size == 1 )
		{
			//AugmentTranscripts( subexons, predTranscripts[i], false ) ;

			int l = predTranscripts[i].size() ;
			int tcCnt = constraints[i].matePairs.size() ;
			for ( j = 0 ; j < l ; ++j )
			{
				predTranscripts[i][j].abundance = 1.0 / alignments.readLen ;
			}
			AbundanceEstimation( subexons, seCnt, constraints[i], predTranscripts[i] ) ;

			std::vector<int> subexonIdx ;
			for ( j = 0 ; j < l ; ++j )
			{
				subexonIdx.clear() ;
				predTranscripts[i][j].seVector.GetOnesIndices( subexonIdx ) ;
				int subexonIdxCnt = subexonIdx.size() ;
				int len = 0 ;
				for ( k = 0 ; k < subexonIdxCnt ; ++k )
					len += subexons[ subexonIdx[k] ].end - subexons[ subexonIdx[k] ].start + 1 ;

				if ( predTranscripts[i][j].abundance * alignments.readLen / len < 2.0 )
					predTranscripts[i][j].abundance = -1 ;
				else
					ConvertTranscriptAbundanceToFPKM( subexons, predTranscripts[i][j] ) ;
			}
			RemoveNegativeAbundTranscripts( predTranscripts[i] ) ;
		}

		// Output
		size = predTranscripts[i].size() ;
		InitTranscriptId() ;
		for ( j = 0 ; j < size ; ++j )
		{
			OutputTranscript( i, subexons, predTranscripts[i][j] ) ;
		}
		for ( j = 0 ; j < size ; ++j )
		{
			predTranscripts[i][j].seVector.Release() ;
		}
	}

	delete[] predicted ;
	delete[] transcriptId ;
	delete[] predTranscripts ;
	delete[] rawPredTranscriptIds ;
	delete[] rawPredTranscriptAbundance ;
	delete[] txptSampleSupport ;

	atCnt = alltranscripts.size() ;
	for ( i = 0 ; i < atCnt ; ++i )
		alltranscripts[i].seVector.Release() ;
	compatibleTestVectorT.Release() ;
	compatibleTestVectorC.Release() ;
	delete[] f ;
	delete[] subexonChainSupport ;
	return 0 ;
}

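// Thread entry point: builds a TranscriptDecider for one locus, solves it, releases the
// per-locus subexon arrays, and hands this worker's id back to the free-thread queue.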
void *TranscriptDeciderSolve_Wrapper( void *a )
{
	int i ;

	struct _transcriptDeciderThreadArg &arg = *( (struct _transcriptDeciderThreadArg *)a ) ;
	TranscriptDecider transcriptDecider( arg.FPKMFraction, arg.classifierThreshold, arg.txptMinReadDepth, arg.sampleCnt, *( arg.alignments ) ) ;
	transcriptDecider.SetNumThreads( arg.numThreads + 1 ) ;
	transcriptDecider.SetMultiThreadOutputHandler( arg.outputHandler ) ;
	transcriptDecider.SetMaxDpConstraintSize( arg.maxDpConstraintSize ) ;
	transcriptDecider.Solve( arg.subexons, arg.seCnt, arg.constraints, arg.subexonCorrelation ) ;

	int start = arg.subexons[0].start ;
	int end = arg.subexons[ arg.seCnt - 1 ].end ;
	int chrId = arg.subexons[0].chrId ;
	// Release memory
	for ( i = 0 ; i < arg.seCnt ; ++i )
	{
		delete[] arg.subexons[i].prev ;
		delete[] arg.subexons[i].next ;
	}
	delete[] arg.subexons ;

	// Put the work id back to the free threads queue.
	pthread_mutex_lock( arg.ftLock ) ;
	arg.freeThreads[ *( arg.ftCnt ) ] = arg.tid ;
	++*( arg.ftCnt ) ;
	if ( *( arg.ftCnt ) == 1 )
		pthread_cond_signal( arg.fullWorkCond ) ;
	pthread_mutex_unlock( arg.ftLock ) ;
	printf( "Thread %d: %s %d %d finished.\n", arg.tid, arg.alignments->GetChromName(chrId), start + 1, end + 1 ) ;
	fflush( stdout ) ;

	pthread_exit( NULL ) ;
}