comparison PsiCLASS-1.0.2/TranscriptDecider.hpp @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:903fc43d6227
1 #ifndef _MOURISL_CLASSES_TRANSCRIPTDECIDER_HEADER
2 #define _MOURISL_CLASSES_TRANSCRIPTDECIDER_HEADER
3
4 #include <pthread.h>
5 #include <map>
6 #include <time.h>
7 #include <stdarg.h>
8
9 #include "alignments.hpp"
10 #include "SubexonGraph.hpp"
11 #include "SubexonCorrelation.hpp"
12 #include "BitTable.hpp"
13 #include "Constraints.hpp"
14
15 #define HASH_MAX 100003 // default HASH_MAX
16 #define USE_DP 200000
17
18 struct _transcript
19 {
20 BitTable seVector ;
21 double abundance ;
22 double correlationScore ;
23 double FPKM ;
24 double *constraintsSupport ; // record the assign ment of constraints.
25
26 int first, last ; // indicate the index of the first and last subexons.
27 bool partial ; // wehther this is a partial transcript.
28 int id ; // the id for various usage: i.e transcript index in the alltranscripts.
29 } ;
30
31 struct _outputTranscript
32 {
33 int chrId ;
34 int geneId, transcriptId ;
35 struct _pair32 *exons ;
36 int ecnt ;
37 char strand ;
38 int sampleId ;
39
40 double FPKM ;
41 double TPM ;
42 double cov ;
43 } ;
44
45 struct _dp
46 {
47 BitTable seVector ;
48 int first, last ;
49 // The "cnt" is for the hash structure.
50 // the first cnt set bits represent the subexons that are the key of the hash
51 // the remaining set bits are the optimal subtranscript follow the key.
52 int cnt ;
53 double cover ;
54
55 double minAbundance ;
56 int timeStamp ;
57 int strand ;
58 } ;
59
60
61 struct _dpAttribute
62 {
63 struct _dp *f1, **f2 ;
64 struct _dp *hash ;
65
66 struct _transcript bufferTxpt ;
67
68 bool forAbundance ;
69
70 struct _subexon *subexons ;
71 int seCnt ;
72
73 std::map<uint64_t, int> uncoveredPair ;
74
75 double minAbundance ;
76 int timeStamp ;
77 } ;
78
79 class MultiThreadOutputTranscript ;
80
81 struct _transcriptDeciderThreadArg
82 {
83 int tid ;
84 struct _subexon *subexons ;
85 int seCnt ;
86 int sampleCnt ;
87 int numThreads ;
88
89 int maxDpConstraintSize ;
90 double FPKMFraction, classifierThreshold, txptMinReadDepth ;
91 Alignments *alignments ;
92 std::vector<Constraints> constraints ;
93 SubexonCorrelation subexonCorrelation ;
94 MultiThreadOutputTranscript *outputHandler ;
95
96 int *freeThreads ; // the stack for free threads
97 int *ftCnt ;
98 pthread_mutex_t *ftLock ;
99 pthread_cond_t *fullWorkCond ;
100 } ;
101
102 class MultiThreadOutputTranscript
103 {
104 private:
105 std::vector<struct _outputTranscript> outputQueue ;
106 pthread_t *threads ;
107 pthread_mutex_t outputLock ;
108 int sampleCnt ;
109 int numThreads ;
110 std::vector<FILE *> outputFPs ;
111 Alignments &alignments ;
112
113 public:
114 static int CompTranscripts( const struct _outputTranscript &a, const struct _outputTranscript &b )
115 {
116 int i ;
117 if ( a.geneId != b.geneId )
118 return a.geneId - b.geneId ;
119 if ( a.ecnt != b.ecnt )
120 return a.ecnt - b.ecnt ;
121
122 for ( i = 0 ; i < a.ecnt ; ++i )
123 {
124 if ( a.exons[i].a != b.exons[i].a )
125 return a.exons[i].a - b.exons[i].a ;
126
127 if ( a.exons[i].b != b.exons[i].b )
128 return a.exons[i].b - b.exons[i].b ;
129 }
130 return 0 ;
131 }
132
133 static bool CompSortTranscripts( const struct _outputTranscript &a, const struct _outputTranscript &b )
134 {
135 int tmp = CompTranscripts( a, b ) ;
136 if ( tmp < 0 )
137 return true ;
138 else if ( tmp > 0 )
139 return false ;
140 else
141 return a.sampleId < b.sampleId ;
142 }
143
144 MultiThreadOutputTranscript( int cnt, Alignments &a ): alignments( a )
145 {
146 sampleCnt = cnt ;
147 pthread_mutex_init( &outputLock, NULL ) ;
148 }
149 ~MultiThreadOutputTranscript()
150 {
151 pthread_mutex_destroy( &outputLock ) ;
152 int i ;
153 for ( i = 0 ; i < sampleCnt ; ++i )
154 fclose( outputFPs[i] ) ;
155 }
156
157 void SetThreadsPointer( pthread_t *t, int n )
158 {
159 threads = t ;
160 numThreads = n ;
161 }
162
163 void SetOutputFPs( char *outputPrefix )
164 {
165 int i ;
166 char buffer[1024] ;
167 for ( i = 0 ; i < sampleCnt ; ++i )
168 {
169 if ( outputPrefix[0] )
170 sprintf( buffer, "%s_sample_%d.gtf", outputPrefix, i ) ;
171 else
172 sprintf( buffer, "sample_%d.gtf", i ) ;
173 FILE *fp = fopen( buffer, "w" ) ;
174 outputFPs.push_back( fp ) ;
175 }
176 }
177
178 void Add( struct _outputTranscript &t )
179 {
180 pthread_mutex_lock( &outputLock ) ;
181 outputQueue.push_back( t ) ;
182 pthread_mutex_unlock( &outputLock ) ;
183 }
184
185 void Add_SingleThread( struct _outputTranscript &t )
186 {
187 outputQueue.push_back( t ) ;
188 }
189
190 void ComputeFPKMTPM( std::vector<Alignments> &alignmentFiles )
191 {
192 int i ;
193 int qsize = outputQueue.size() ;
194 double *totalFPK = new double[ sampleCnt ] ;
195 memset( totalFPK, 0, sizeof( double ) * sampleCnt ) ;
196 for ( i = 0 ; i < qsize ; ++i )
197 {
198 totalFPK[ outputQueue[i].sampleId ] += outputQueue[i].FPKM ;
199 }
200
201 for ( i = 0 ; i < qsize ; ++i )
202 {
203 outputQueue[i].TPM = outputQueue[i].FPKM / ( totalFPK[ outputQueue[i].sampleId ] / 1000000.0 ) ;
204 outputQueue[i].FPKM /= ( alignmentFiles[ outputQueue[i].sampleId ].totalReadCnt / 1000000.0 ) ;
205 }
206
207 delete[] totalFPK ;
208 }
209
210 void OutputCommandInfo( int argc, char *argv[] )
211 {
212 int i ;
213 int j ;
214 for ( i = 0 ; i < sampleCnt ; ++i )
215 {
216 fprintf( outputFPs[i], "#PsiCLASS_v1.0.1\n#" ) ;
217 for ( j = 0 ; j < argc - 1 ; ++j )
218 {
219 fprintf( outputFPs[i], "%s ", argv[j] ) ;
220 }
221 fprintf( outputFPs[i], "%s\n", argv[j] ) ;
222 }
223 }
224
225 void OutputCommentToSampleGTF( int sampleId, char *s )
226 {
227 fprintf( outputFPs[ sampleId ], "#%s\n", s ) ;
228 }
229
230 void Flush()
231 {
232 std::sort( outputQueue.begin(), outputQueue.end(), CompSortTranscripts ) ;
233 int i, j ;
234 int qsize = outputQueue.size() ;
235 char prefix[10] = "" ;
236
237 // Recompute the transcript id
238 int gid = -1 ;
239 int tid = 0 ;
240 for ( i = 0 ; i < qsize ; )
241 {
242 for ( j = i + 1 ; j < qsize ; ++j )
243 {
244 if ( CompTranscripts( outputQueue[i], outputQueue[j] ) )
245 break ;
246 }
247 int l ;
248 if ( outputQueue[i].geneId != gid )
249 {
250 gid = outputQueue[i].geneId ;
251 tid = 0 ;
252 }
253 else
254 ++tid ;
255
256 for ( l = i ; l < j ; ++l )
257 outputQueue[l].transcriptId = tid ;
258
259 i = j ;
260 }
261
262 // output
263 for ( i = 0 ; i < qsize ; ++i )
264 {
265 struct _outputTranscript &t = outputQueue[i] ;
266 char *chrom = alignments.GetChromName( t.chrId ) ;
267
268 fprintf( outputFPs[t.sampleId], "%s\tPsiCLASS\ttranscript\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; transcript_id \"%s%s.%d.%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"%.6lf\";\n",
269 chrom, t.exons[0].a, t.exons[t.ecnt - 1].b, t.strand,
270 prefix, chrom, t.geneId,
271 prefix, chrom, t.geneId, t.transcriptId, t.FPKM, t.TPM, t.cov ) ;
272 for ( j = 0 ; j < t.ecnt ; ++j )
273 {
274 fprintf( outputFPs[ t.sampleId ], "%s\tPsiCLASS\texon\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; "
275 "transcript_id \"%s%s.%d.%d\"; exon_number \"%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"\%.6lf\";\n",
276 chrom, t.exons[j].a, t.exons[j].b, t.strand,
277 prefix, chrom, t.geneId,
278 prefix, chrom, t.geneId, t.transcriptId,
279 j + 1, t.FPKM, t.TPM, t.cov ) ;
280 }
281 delete []t.exons ;
282 }
283 }
284 } ;
285
286 class TranscriptDecider
287 {
288 private:
289 int sampleCnt ;
290 int numThreads ;
291 double FPKMFraction ;
292 double txptMinReadDepth ;
293 int hashMax ;
294 int maxDpConstraintSize ;
295
296 Constraints *constraints ;
297 //struct _subexon *subexons ;
298 //int seCnt ;
299
300 int usedGeneId ;
301 int baseGeneId, defaultGeneId[2] ;
302
303 int *transcriptId ; // the next transcript id for each gene id (we shift the gene id to 0 in this array.)
304 Alignments &alignments ; // for obtain the chromosome names.
305
306 std::vector<FILE *> outputFPs ;
307
308 BitTable compatibleTestVectorT, compatibleTestVectorC ;
309 double canBeSoftBoundaryThreshold ;
310
311 MultiThreadOutputTranscript *outputHandler ;
312
313 // Test whether subexon tag is a start subexon in a mixture region that corresponds to the start of a gene on another strand.
314 bool IsStartOfMixtureStrandRegion( int tag, struct _subexon *subexons, int seCnt ) ;
315
316 // The functions to pick transcripts through dynamic programming
317 struct _dp *dpHash ;
318 void SearchSubTranscript( int tag, int strand, int parents[], int pcnt, struct _dp &pdp, int visit[], int vcnt, int extends[], int extendCnt, std::vector<struct _constraint> &tc, int tcStartInd, struct _dpAttribute &attr ) ;
319 struct _dp SolveSubTranscript( int visit[], int vcnt, int strand, std::vector<struct _constraint> &tc, int tcStartInd, struct _dpAttribute &attr ) ;
320 void PickTranscriptsByDP( struct _subexon *subexons, int seCnt, int iterBound, Constraints &constraints, SubexonCorrelation &correlation, struct _dpAttribute &attr, std::vector<struct _transcript> &allTranscripts ) ;
321
322 void SetDpContent( struct _dp &a, struct _dp &b, const struct _dpAttribute &attr )
323 {
324 a.seVector.Assign( b.seVector ) ;
325 a.first = b.first ;
326 a.last = b.last ;
327 a.cnt = b.cnt ;
328 a.cover = b.cover ;
329
330 a.strand = b.strand ;
331 a.minAbundance = attr.minAbundance ;
332 a.timeStamp = attr.timeStamp ;
333 }
334
335 void ResetDpContent( struct _dp &d )
336 {
337 d.seVector.Reset() ;
338 d.first = -1 ;
339 d.last = -1 ;
340 d.cnt = -1 ;
341 d.cover = -1 ;
342 d.minAbundance = -1 ;
343 d.timeStamp = -1 ;
344 }
345
346 void AugmentTranscripts( struct _subexon *subexons, std::vector<struct _transcript> &alltranscripts, int limit, bool extend ) ;
347 // Test whether a constraints is compatible with the transcript.
348 // Return 0 - uncompatible or does not overlap at all. 1 - fully compatible. 2 - Head of the constraints compatible with the tail of the transcript
349 int IsConstraintInTranscript( struct _transcript transcript, struct _constraint &c ) ;
350 int IsConstraintInTranscriptDebug( struct _transcript transcript, struct _constraint &c ) ;
351
352 // Count how many transcripts are possible starting from subexons[tag].
353 int SubTranscriptCount( int tag, struct _subexon *subexons, int f[] ) ;
354
355 // The methods when there is no need for DP
356 void EnumerateTranscript( int tag, int strand, int visit[], int vcnt, struct _subexon *subexons, SubexonCorrelation &correlation, double correlationScore, std::vector<struct _transcript> &alltranscripts, int &atcnt ) ;
357 // For the simpler case, we can pick sample by sample.
358 void PickTranscripts( struct _subexon *subexons, std::vector<struct _transcript> &alltranscripts, Constraints &constraints, SubexonCorrelation &seCorrelation, std::vector<struct _transcript> &transcripts ) ;
359
360 static bool CompSortTranscripts( const struct _transcript &a, const struct _transcript &b )
361 {
362 if ( a.first < b.first )
363 return true ;
364 else if ( a.first > b.first )
365 return false ;
366
367 int diffPos = a.seVector.GetFirstDifference( b.seVector ) ;
368 if ( diffPos == -1 )
369 return false ;
370 if ( a.seVector.Test( diffPos ) )
371 return true ;
372 else
373 return false ;
374 }
375
376 static bool CompSortPairs( const struct _pair32 &x, const struct _pair32 &y )
377 {
378 if ( x.a != y.a )
379 return x.a < y.a ;
380 else
381 return x.b < y.b ;
382 }
383
384 static bool CompSortPairsByB( const struct _pair32 &x, const struct _pair32 &y )
385 {
386 return x.b < y.b ;
387 }
388
389 static int CompPairsByB( const void *p1, const void *p2 )
390 {
391 return ((struct _pair32 *)p1)->b - ((struct _pair32 *)p2)->b ;
392 }
393
394 double ComputeScore( double cnt, double weight, double a, double A, double correlation )
395 {
396 if ( a > A * 0.1 )
397 return ( cnt * weight ) * ( 1 + pow( a / A, 0.25 ) ) + correlation ;
398 else
399 return ( cnt * weight ) * ( 1 + a / A ) + correlation ;
400 //return ( cnt ) * ( exp( 1 + a / A ) ) + correlation ;
401 }
402
403 int GetFather( int f, int *father ) ;
404
405 void ConvertTranscriptAbundanceToFPKM( struct _subexon *subexons, struct _transcript &t, int readCnt = 1000000 )
406 {
407 int txptLen = 0 ;
408 int i, size ;
409
410 std::vector<int> subexonInd ;
411 t.seVector.GetOnesIndices( subexonInd ) ;
412 size = subexonInd.size() ;
413 for ( i = 0 ; i < size ; ++i )
414 txptLen += ( subexons[ subexonInd[i] ].end - subexons[ subexonInd[i] ].start + 1 ) ;
415 double factor = 1 ;
416 if ( alignments.matePaired )
417 factor = 0.5 ;
418 t.FPKM = t.abundance * factor / ( ( readCnt / 1000000.0 ) * ( txptLen / 1000.0 ) ) ;
419 }
420
421 int GetTranscriptLengthFromAbundanceAndFPKM( double abundance, double FPKM, int readCnt = 1000000 )
422 {
423 double factor = 1 ;
424 if ( alignments.matePaired )
425 factor = 0.5 ;
426 return int( abundance * factor / ( FPKM / 1000.0 ) / ( readCnt / 1000000.0 ) + 0.5 ) ;
427 }
428
429 void CoalesceSameTranscripts( std::vector<struct _transcript> &t ) ;
430
431
432 // Initialize the structure to store transcript id
433 void InitTranscriptId() ;
434
435 int GetTranscriptGeneId( std::vector<int> &subexonInd, struct _subexon *subexons ) ;
436 int GetTranscriptGeneId( struct _transcript &t, struct _subexon *subexons ) ;
437 int RemoveNegativeAbundTranscripts( std::vector<struct _transcript> &transcripts )
438 {
439 int i, j ;
440 int tcnt = transcripts.size() ;
441 j = 0 ;
442 for ( i = 0 ; i < tcnt ; ++i )
443 {
444 if ( transcripts[i].abundance < 0 )
445 {
446 transcripts[i].seVector.Release() ; // Don't forget release the memory.
447 continue ;
448 }
449 transcripts[j] = transcripts[i] ;
450 ++j ;
451 }
452 transcripts.resize( j ) ;
453 return j ;
454 }
455
456 void AbundanceEstimation( struct _subexon *subexons, int seCnt, Constraints &constraints, std::vector<struct _transcript> &transcripts ) ;
457
458 int RefineTranscripts( struct _subexon *subexons, int seCnt, bool aggressive, std::map<int, int> *subexonChainSupport, int *txptSampleSupport, std::vector<struct _transcript> &transcripts, Constraints &constraints ) ;
459
460 void ComputeTranscriptsScore( struct _subexon *subexons, int seCnt, std::map<int, int> *subexonChainSupport, std::vector<struct _transcript> &transcripts ) ;
461
462 void OutputTranscript( int sampleId, struct _subexon *subexons, struct _transcript &transcript ) ;
463
464 void PrintLog( const char *fmt, ... )
465 {
466 char buffer[10021] ;
467 va_list args ;
468 va_start( args, fmt ) ;
469 vsprintf( buffer, fmt, args ) ;
470
471 time_t mytime = time(NULL) ;
472 struct tm *localT = localtime( &mytime ) ;
473 char stime[500] ;
474 strftime( stime, sizeof( stime ), "%c", localT ) ;
475 fprintf( stderr, "[%s] %s\n", stime, buffer ) ;
476 }
477
478
479 public:
480 TranscriptDecider( double f, double c, double d, int sampleCnt, Alignments &a ): alignments( a )
481 {
482 FPKMFraction = f ;
483 canBeSoftBoundaryThreshold = c ;
484 txptMinReadDepth = d ;
485 usedGeneId = 0 ;
486 defaultGeneId[0] = -1 ;
487 defaultGeneId[1] = -1 ;
488 maxDpConstraintSize = -1 ;
489 numThreads = 1 ;
490 this->sampleCnt = sampleCnt ;
491 dpHash = new struct _dp[ HASH_MAX ] ; // pre-allocated buffer to hold dp information.
492 }
493 ~TranscriptDecider()
494 {
495 int i ;
496 if ( numThreads == 1 )
497 {
498 int size = outputFPs.size() ;
499 for ( i = 0 ; i < size ; ++i )
500 {
501 fclose( outputFPs[i] ) ;
502 }
503 }
504 delete[] dpHash ;
505 }
506
507
508 // @return: the number of assembled transcript
509 int Solve( struct _subexon *subexons, int seCnt, std::vector<Constraints> &constraints, SubexonCorrelation &subexonCorrelation ) ;
510
511 void SetOutputFPs( char *outputPrefix )
512 {
513 int i ;
514 char buffer[1024] ;
515 for ( i = 0 ; i < sampleCnt ; ++i )
516 {
517 if ( outputPrefix[0] )
518 sprintf( buffer, "%s_sample_%d.gtf", outputPrefix, i ) ;
519 else
520 sprintf( buffer, "sample_%d.gtf", i ) ;
521 FILE *fp = fopen( buffer, "w" ) ;
522 outputFPs.push_back( fp ) ;
523 }
524 }
525
526 void SetMultiThreadOutputHandler( MultiThreadOutputTranscript *h )
527 {
528 outputHandler = h ;
529 }
530
531 void SetNumThreads( int t )
532 {
533 numThreads = t ;
534 }
535
536 void SetMaxDpConstraintSize(int size)
537 {
538 maxDpConstraintSize = size ;
539 }
540 } ;
541
542 void *TranscriptDeciderSolve_Wrapper( void *arg ) ;
543
544 #endif