comparison PsiCLASS-1.0.2/classes.cpp @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:903fc43d6227
1 #include <stdio.h>
2 #include <getopt.h>
3 #include <vector>
4 #include <pthread.h>
5
6 #include "alignments.hpp"
7 #include "SubexonGraph.hpp"
8 #include "SubexonCorrelation.hpp"
9 #include "Constraints.hpp"
10 #include "TranscriptDecider.hpp"
11
12 char usage[] = "./classes [OPTIONS]:\n"
13 "Required:\n"
14 "\t-s STRING: path to the subexon file.\n"
15 "\t-b STRING: path to the BAM file.\n"
16 "\t\tor\n"
17 "\t--lb STRING: path to the file of the list of BAM files.\n"
18 "Optional:\n"
19 "\t-p INT: number of threads. (default: 1)\n"
20 "\t-o STRING: the prefix of the output file. (default: not used)\n"
21 "\t-c FLOAT: only use the subexons with classifier score <= than the given number. (default: 0.05)\n"
22 "\t-f FLOAT: filter the transcript from the gene if its abundance is lower than the given number percent of the most abundant one. (default: 0.05)\n"
23 "\t-d FLOAT: filter the transcript whose average read depth is less than the given number. (default: 2.5)\n"
24 "\t--ls STRING: path to the file of the list of single-sample subexon files. (default: not used)\n"
25 "\t--hasMateIdSuffix: the read id has suffix such as .1, .2 for a mate pair. (default: false)\n"
26 "\t--maxDpConstraintSize: the maximum number of subexons a constraint can cover in dynamic programming. (default: 7; -1 for inf)\n"
27 "\t--primaryParalog: use primary alignment to retain paralog genes instead of unique alignments. (default: not used)\n"
28 ;
29
30 static const char *short_options = "s:b:f:o:d:p:c:h" ;
31 static struct option long_options[] =
32 {
33 { "ls", required_argument, 0, 10000 },
34 { "lb", required_argument, 0, 10001 },
35 { "hasMateIdSuffix", no_argument, 0, 10002 },
36 { "primaryParalog", no_argument, 0, 10003 },
37 { "maxDpConstraintSize", required_argument, 0, 10004 },
38 { (char *)0, 0, 0, 0}
39 } ;
40
41 struct _getAlignmentsInfoThreadArg
42 {
43 std::vector<Alignments> *pAlignmentFiles ;
44 int numThreads ;
45 int tid ;
46 } ;
47
48 struct _getConstraintsThreadArg
49 {
50 std::vector<Constraints> *pMultiSampleConstraints ;
51 int numThreads ;
52 int tid ;
53
54 struct _subexon *subexons ;
55 int seCnt ;
56 int start, end ;
57 } ;
58
59 void *GetAlignmentsInfo_Thread( void *pArg )
60 {
61 int i ;
62 std::vector <Alignments> &alignmentFiles = *( ( (struct _getAlignmentsInfoThreadArg *)pArg )->pAlignmentFiles ) ;
63 int tid = ( (struct _getAlignmentsInfoThreadArg *)pArg )->tid ;
64 int numThreads = ( (struct _getAlignmentsInfoThreadArg *)pArg )->numThreads ;
65 int size = alignmentFiles.size() ;
66 for ( i = 0 ; i < size ; ++i )
67 {
68 if ( i % numThreads == tid )
69 {
70 alignmentFiles[i].GetGeneralInfo( true ) ;
71 alignmentFiles[i].Rewind() ;
72 }
73 }
74
75 pthread_exit( NULL ) ;
76 }
77
78
79 void *GetConstraints_Thread( void *pArg )
80 {
81 int i ;
82 struct _getConstraintsThreadArg &arg = *( (struct _getConstraintsThreadArg *)pArg ) ;
83 std::vector<Constraints> &multiSampleConstraints = *( arg.pMultiSampleConstraints ) ;
84 int tid = arg.tid ;
85 int numThreads = arg.numThreads ;
86 int size = multiSampleConstraints.size() ;
87 for ( i = 0 ; i < size ; ++i )
88 {
89 if ( i % numThreads == tid )
90 multiSampleConstraints[i].BuildConstraints( arg.subexons, arg.seCnt, arg.start, arg.end ) ;
91 }
92 pthread_exit( NULL ) ;
93 }
94
95 int main( int argc, char *argv[] )
96 {
97 int i, j ;
98 int size ;
99
100 if ( argc <= 1 )
101 {
102 printf( "%s", usage ) ;
103 return 0 ;
104 }
105
106 int c, option_index ; // For getopt
107 option_index = 0 ;
108 FILE *fpSubexon = NULL ;
109 double FPKMFraction = 0.05 ;
110 double classifierThreshold ;
111 double txptMinReadDepth = 2.5 ;
112 char outputPrefix[1024] = "" ;
113 int numThreads = 1 ;
114 bool hasMateReadIdSuffix = false ;
115 bool usePrimaryAsUnique = false ;
116 int maxDpConstraintSize = 7 ;
117
118 std::vector<Alignments> alignmentFiles ;
119 SubexonCorrelation subexonCorrelation ;
120
121 classifierThreshold = 0.05 ;
122 while ( 1 )
123 {
124 c = getopt_long( argc, argv, short_options, long_options, &option_index ) ;
125 if ( c == -1 )
126 break ;
127
128 if ( c == 's' )
129 {
130 fpSubexon = fopen( optarg, "r" ) ;
131 }
132 else if ( c == 'b' )
133 {
134 Alignments a ;
135 a.Open( optarg ) ;
136 //a.SetAllowClip( false ) ;
137 alignmentFiles.push_back( a ) ;
138 }
139 else if ( c == 'c' )
140 {
141 classifierThreshold = atof( optarg ) ;
142 }
143 else if ( c == 'f' )
144 {
145 FPKMFraction = atof( optarg ) ;
146 }
147 else if ( c == 'o' )
148 {
149 strcpy( outputPrefix, optarg ) ;
150 }
151 else if ( c == 'd' )
152 {
153 txptMinReadDepth = atof( optarg ) ;
154 }
155 else if ( c == 'p' )
156 {
157 numThreads = atoi( optarg ) ;
158 }
159 else if ( c == 10000 ) // the list of subexon files.
160 {
161 subexonCorrelation.Initialize( optarg ) ;
162 }
163 else if ( c == 10001 ) // the list of bam files.
164 {
165 FILE *fp = fopen( optarg, "r" ) ;
166 char buffer[1024] ;
167 while ( fgets( buffer, sizeof( buffer ), fp ) != NULL )
168 {
169 int len = strlen( buffer ) ;
170 if ( buffer[len - 1] == '\n' )
171 {
172 buffer[len - 1] = '\0' ;
173 --len ;
174
175 }
176 Alignments a ;
177 a.Open( buffer ) ;
178 alignmentFiles.push_back( a ) ;
179 }
180 fclose( fp ) ;
181 }
182 else if ( c == 10002 ) // the mate pair read id has suffix.
183 {
184 hasMateReadIdSuffix = true ;
185 }
186 else if ( c == 10003 ) // do not check unique alignment
187 {
188 usePrimaryAsUnique = true ;
189 }
190 else if ( c == 10004 ) // maxDpConstraintSize
191 {
192 maxDpConstraintSize = atoi(optarg) ;
193 }
194 else
195 {
196 printf( "%s", usage ) ;
197 exit( 1 ) ;
198 }
199 }
200 if ( fpSubexon == NULL )
201 {
202 printf( "Cannot find combined subexon file.\n" ) ;
203 exit( 1 ) ;
204 }
205 if ( alignmentFiles.size() < 1 )
206 {
207 printf( "Must use -b option to specify BAM files.\n" ) ;
208 exit( 1 ) ;
209 }
210
211
212 if ( alignmentFiles.size() < 50 )
213 {
214 size = alignmentFiles.size() ;
215 for ( i = 0 ; i < size ; ++i )
216 {
217 alignmentFiles[i].GetGeneralInfo( true ) ;
218 alignmentFiles[i].Rewind() ;
219 }
220 }
221 else
222 {
223 struct _getAlignmentsInfoThreadArg *args = new struct _getAlignmentsInfoThreadArg[ numThreads ] ;
224 pthread_attr_t pthreadAttr ;
225 pthread_t *threads ;
226
227 pthread_attr_init( &pthreadAttr ) ;
228 pthread_attr_setdetachstate( &pthreadAttr, PTHREAD_CREATE_JOINABLE ) ;
229
230 threads = new pthread_t[ numThreads ] ;
231
232 for ( i = 0 ; i < numThreads ; ++i )
233 {
234 args[i].pAlignmentFiles = &alignmentFiles ;
235 args[i].tid = i ;
236 args[i].numThreads = numThreads ;
237
238 pthread_create( &threads[i], &pthreadAttr, GetAlignmentsInfo_Thread, &args[i] ) ;
239 }
240
241 for ( i = 0 ; i < numThreads ; ++i )
242 {
243 pthread_join( threads[i], NULL ) ;
244 }
245
246 pthread_attr_destroy( &pthreadAttr ) ;
247 delete[] args ;
248 delete[] threads ;
249 }
250
251 // Build the subexon graph
252 SubexonGraph subexonGraph( classifierThreshold, alignmentFiles[0], fpSubexon ) ;
253 subexonGraph.ComputeGeneIntervals() ;
254
255 // Solve gene by gene
256 int sampleCnt = alignmentFiles.size() ;
257 std::vector<Constraints> multiSampleConstraints ;
258 for ( i = 0 ; i < sampleCnt ; ++i )
259 {
260 Constraints constraints( &alignmentFiles[i] ) ;
261 constraints.SetHasMateReadIdSuffix( hasMateReadIdSuffix ) ;
262 constraints.SetUsePrimaryAsUnique( usePrimaryAsUnique ) ;
263 multiSampleConstraints.push_back( constraints ) ;
264 }
265 MultiThreadOutputTranscript outputHandler( sampleCnt, alignmentFiles[0] ) ;
266 outputHandler.SetOutputFPs( outputPrefix ) ;
267
268 if ( numThreads <= 1 )
269 {
270 TranscriptDecider transcriptDecider( FPKMFraction, classifierThreshold, txptMinReadDepth, sampleCnt, alignmentFiles[0] ) ;
271
272 transcriptDecider.SetMultiThreadOutputHandler( &outputHandler ) ;
273 transcriptDecider.SetNumThreads( numThreads ) ;
274 transcriptDecider.SetMaxDpConstraintSize( maxDpConstraintSize ) ;
275
276 int giCnt = subexonGraph.geneIntervals.size() ;
277 for ( i = 0 ; i < giCnt ; ++i )
278 {
279 struct _geneInterval gi = subexonGraph.geneIntervals[i] ;
280 struct _subexon *intervalSubexons = new struct _subexon[ gi.endIdx - gi.startIdx + 1 ] ;
281 subexonGraph.ExtractSubexons( gi.startIdx, gi.endIdx, intervalSubexons ) ;
282 printf( "%d: %d %s %d %d\n", i, gi.endIdx - gi.startIdx + 1,
283 alignmentFiles[0].GetChromName( intervalSubexons[0].chrId ),
284 gi.start + 1, gi.end + 1 ) ;
285 fflush( stdout ) ;
286
287 subexonCorrelation.ComputeCorrelation( intervalSubexons, gi.endIdx - gi.startIdx + 1, alignmentFiles[0] ) ;
288 for ( j = 0 ; j < sampleCnt ; ++j )
289 multiSampleConstraints[j].BuildConstraints( intervalSubexons, gi.endIdx - gi.startIdx + 1, gi.start, gi.end ) ;
290
291 transcriptDecider.Solve( intervalSubexons, gi.endIdx - gi.startIdx + 1, multiSampleConstraints, subexonCorrelation ) ;
292
293 for ( j = 0 ; j < gi.endIdx - gi.startIdx + 1 ; ++j )
294 {
295 delete[] intervalSubexons[j].prev ;
296 delete[] intervalSubexons[j].next ;
297 }
298 delete[] intervalSubexons ;
299 }
300 }
301 else // multi-thread case.
302 {
303 --numThreads ; // one thread is used for read in the data.
304
305 // Allocate memory
306 struct _transcriptDeciderThreadArg *pArgs = new struct _transcriptDeciderThreadArg[ numThreads ] ;
307 pthread_mutex_t ftLock ;
308 int *freeThreads ;
309 int ftCnt ;
310 pthread_cond_t fullWorkCond ;
311 pthread_attr_t pthreadAttr ;
312 pthread_t *threads ;
313 pthread_t *getConstraintsThreads ;
314 bool *initThreads ;
315
316 pthread_mutex_init( &ftLock, NULL ) ;
317 pthread_cond_init( &fullWorkCond, NULL ) ;
318 pthread_attr_init( &pthreadAttr ) ;
319 pthread_attr_setdetachstate( &pthreadAttr, PTHREAD_CREATE_JOINABLE ) ;
320
321 threads = new pthread_t[ numThreads ] ;
322 getConstraintsThreads = new pthread_t[ numThreads ] ;
323 initThreads = new bool[numThreads] ;
324 freeThreads = new int[ numThreads ] ;
325 ftCnt = numThreads ;
326 for ( i = 0 ; i < numThreads ; ++i )
327 {
328 pArgs[i].tid = i ;
329 pArgs[i].sampleCnt = sampleCnt ;
330 pArgs[i].numThreads = numThreads ;
331 pArgs[i].maxDpConstraintSize = maxDpConstraintSize ;
332 pArgs[i].FPKMFraction = FPKMFraction ;
333 pArgs[i].classifierThreshold = classifierThreshold ;
334 pArgs[i].txptMinReadDepth = txptMinReadDepth ;
335 pArgs[i].alignments = &alignmentFiles[0] ;
336 //pArgs[i].constraints = new std::vector<Constraints> ;
337 pArgs[i].outputHandler = &outputHandler ;
338
339 freeThreads[i] = i ;
340 pArgs[i].freeThreads = freeThreads ;
341 pArgs[i].ftCnt = &ftCnt ;
342 pArgs[i].ftLock = &ftLock ;
343 pArgs[i].fullWorkCond = &fullWorkCond ;
344
345 for ( j = 0 ; j < sampleCnt ; ++j )
346 {
347 Constraints constraints( &alignmentFiles[j] ) ;
348 pArgs[i].constraints.push_back( constraints ) ;
349 }
350
351 initThreads[i] = false ;
352 }
353
354
355 // Read in and distribute the work
356 int giCnt = subexonGraph.geneIntervals.size() ;
357 for ( i = 0 ; i < giCnt ; ++i )
358 {
359 struct _geneInterval gi = subexonGraph.geneIntervals[i] ;
360 struct _subexon *intervalSubexons = new struct _subexon[ gi.endIdx - gi.startIdx + 1 ] ;
361 subexonGraph.ExtractSubexons( gi.startIdx, gi.endIdx, intervalSubexons ) ;
362 subexonCorrelation.ComputeCorrelation( intervalSubexons, gi.endIdx - gi.startIdx + 1, alignmentFiles[0] ) ;
363 printf( "%d: %d %s %d %d. Free threads: %d/%d\n", i, gi.endIdx - gi.startIdx + 1,
364 alignmentFiles[0].GetChromName( intervalSubexons[0].chrId ),
365 gi.start + 1, gi.end + 1, ftCnt, numThreads + 1 ) ;
366 fflush( stdout ) ;
367
368 int gctCnt = ftCnt ;
369 if ( gctCnt > 1 && sampleCnt > 1 )
370 {
371 gctCnt = ( gctCnt < sampleCnt ? gctCnt : sampleCnt ) ;
372 struct _getConstraintsThreadArg *args = new struct _getConstraintsThreadArg[ gctCnt ] ;
373 for ( j = 0 ; j < gctCnt ; ++j )
374 {
375 args[j].pMultiSampleConstraints = &multiSampleConstraints ;
376 args[j].numThreads = gctCnt ;
377 args[j].tid = j ;
378 args[j].subexons = intervalSubexons ;
379 args[j].seCnt = gi.endIdx - gi.startIdx + 1 ;
380 args[j].start = gi.start ; args[j].end = gi.end ;
381 pthread_create( &getConstraintsThreads[j], &pthreadAttr, GetConstraints_Thread, &args[j] ) ;
382 }
383
384
385
386 for ( j = 0 ; j < gctCnt ; ++j )
387 pthread_join( getConstraintsThreads[j], NULL ) ;
388
389 delete[] args ;
390 }
391 else
392 {
393 for ( j = 0 ; j < sampleCnt ; ++j )
394 multiSampleConstraints[j].BuildConstraints( intervalSubexons, gi.endIdx - gi.startIdx + 1, gi.start, gi.end ) ;
395 }
396
397 // Search for the free queue.
398 int tag = -1 ; // get the working thread.
399 pthread_mutex_lock( &ftLock ) ;
400 if ( ftCnt == 0 )
401 {
402 pthread_cond_wait( &fullWorkCond, &ftLock ) ;
403 }
404 tag = freeThreads[ ftCnt - 1 ] ;
405 --ftCnt ;
406 pthread_mutex_unlock( &ftLock ) ;
407
408 if ( initThreads[tag] )
409 pthread_join( threads[tag], NULL ) ; // Make sure the chosen thread exits.
410
411 // Assign the subexons, the constraints and correlation content.
412 pArgs[tag].subexons = new struct _subexon[gi.endIdx - gi.startIdx + 1] ;
413 pArgs[tag].seCnt = gi.endIdx - gi.startIdx + 1 ;
414 for ( j = 0 ; j < pArgs[tag].seCnt ; ++j )
415 {
416 pArgs[tag].subexons[j] = intervalSubexons[j] ;
417 int cnt = intervalSubexons[j].prevCnt ;
418 pArgs[tag].subexons[j].prev = new int[cnt] ;
419 memcpy( pArgs[tag].subexons[j].prev, intervalSubexons[j].prev, sizeof( int ) * cnt ) ;
420 cnt = intervalSubexons[j].nextCnt ;
421 pArgs[tag].subexons[j].next = new int[cnt] ;
422 memcpy( pArgs[tag].subexons[j].next, intervalSubexons[j].next, sizeof( int ) * cnt ) ;
423 }
424
425 for ( j = 0 ; j < sampleCnt ; ++j )
426 {
427 pArgs[tag].constraints[j].Assign( multiSampleConstraints[ j ] ) ;
428 }
429 pArgs[tag].subexonCorrelation.Assign( subexonCorrelation ) ;
430 pthread_create( &threads[tag], &pthreadAttr, TranscriptDeciderSolve_Wrapper, &pArgs[tag] ) ;
431 initThreads[tag] = true ;
432 for ( j = 0 ; j < gi.endIdx - gi.startIdx + 1 ; ++j )
433 {
434 delete[] intervalSubexons[j].prev ;
435 delete[] intervalSubexons[j].next ;
436 }
437 delete[] intervalSubexons ;
438 }
439
440 for ( i = 0 ; i < numThreads ; ++i )
441 {
442 if ( initThreads[i] )
443 pthread_join( threads[i], NULL ) ;
444 }
445
446 // Release memory
447 for ( i = 0 ; i < numThreads ; ++i )
448 {
449 std::vector<Constraints>().swap( pArgs[i].constraints ) ;
450 }
451 delete []pArgs ;
452 pthread_attr_destroy( &pthreadAttr ) ;
453 pthread_mutex_destroy( &ftLock ) ;
454 pthread_cond_destroy( &fullWorkCond ) ;
455
456 delete[] threads ;
457 delete[] getConstraintsThreads ;
458 delete[] initThreads ;
459 delete[] freeThreads ;
460 } // end of else for multi-thread.
461
462 outputHandler.OutputCommandInfo( argc, argv ) ;
463 for ( i = 0 ; i < sampleCnt ; ++i )
464 {
465 char buffer[1024] ;
466 alignmentFiles[i].GetFileName( buffer ) ;
467 int separator = -1 ;
468
469 // deprive the directory.
470 for ( j = 0 ; buffer[j] ; ++j )
471 {
472 if ( buffer[j] == '/' )
473 separator = j ;
474 }
475 if ( separator != -1 )
476 {
477 ++separator ;
478 for ( j = separator ; buffer[j] ; ++j )
479 buffer[j - separator] = buffer[j] ;
480 buffer[j - separator] = '\0' ;
481 }
482 outputHandler.OutputCommentToSampleGTF( i, buffer ) ;
483 }
484 outputHandler.ComputeFPKMTPM( alignmentFiles ) ;
485 outputHandler.Flush() ;
486
487 for ( i = 0 ; i < sampleCnt ; ++i )
488 alignmentFiles[i].Close() ;
489 fclose( fpSubexon ) ;
490 return 0 ;
491 }