comparison PsiCLASS-1.0.2/psiclass @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:903fc43d6227
1 #!/usr/bin/env perl
2
3 use strict ;
4 use warnings ;
5
6 use Cwd 'cwd' ;
7 use Cwd 'abs_path' ;
8 use File::Basename;
9 use threads ;
10 use threads::shared ;
11
12 die "Usage: ./psiclass [OPTIONS]\n".
13 "Required:\n".
14 "\t-b STRING: paths to the alignment BAM files. Use comma to separate multiple BAM files\n".
15 "\t\tor\n".
16 "\t--lb STRING: path to the file listing the alignments BAM files\n".
17 "Optional:\n".
18 "\t-s STRING: path to the trusted splice file (default: not used)\n".
19 "\t-o STRING: prefix of output files (default: ./psiclass)\n".
20 "\t-p INT: number of processes/threads (default: 1)\n".
21 "\t-c FLOAT: only use the subexons with classifier score <= than the given number (default: 0.05)\n".
22 "\t--sa FLOAT: the minimum average number of supported read for retained introns (default: 0.5)\n".
23 "\t--vd FLOAT : the minimum average coverage depth of a transcript to be reported (defaults: 1.0)\n".
24 "\t--maxDpConstraintSize: the number of subexons a constraint can cover in DP. (default: 7. -1 for inf)\n".
25 "\t--bamGroup STRING: path to the file listing the group id of BAMs in the --lb file (default: not used)\n".
26 "\t--primaryParalog: use primary alignment to retain paralog genes (default: use unique alignments)\n".
27 #"\t--mateIdx INT: the read id has suffix such as .1, .2 for a mate pair. (default: auto)\n".
28 "\t--version: print version and exit\n".
29 "\t--stage INT: (default: 0)\n".
30 "\t\t0-start from beginning - building splice sites for each sample\n".
31 "\t\t1-start from building subexon files for each sample\n".
32 "\t\t2-start from combining subexon files across samples\n".
33 "\t\t3-start from assembling the transcripts for each sample\n".
34 "\t\t4-start from voting the consensus transcripts across samples\n"
35 if ( @ARGV == 0 ) ;
36
37 my $WD = dirname( abs_path( $0 ) ) ;
38
39 my $i ;
40 my $cmd ;
41 my $prefix = "psiclass_" ;
42 my $numThreads = 1 ;
43
44 sub system_call
45 {
46 print $_[0], "\n" ;
47 system( $_[0] ) == 0 or die "Terminated\n" ;
48 }
49
50 # Process the arguments
51 my @bamFiles : shared ;
52 my $spliceFile = "" ;
53 my $bamFileList = "" ;
54 my $stage = 0 ;
55 my $classesOpt = "" ;
56 my $juncOpt = "" ;
57 my $trustSpliceOpt = "" ;
58 my $voteOpt = "" ;
59 my $outdir = "." ;
60 my $mateIdx = -1 ;
61 my $spliceAvgSupport = 0.5 ;
62 my $bamGroup = "" ;
63 for ( $i = 0 ; $i < @ARGV ; ++$i )
64 {
65 if ( $ARGV[$i] eq "--lb" )
66 {
67 $bamFileList = $ARGV[$i + 1] ;
68 open FP1, $ARGV[$i + 1] ;
69 while ( <FP1> )
70 {
71 chomp ;
72 push @bamFiles, $_ ;
73 }
74 ++$i ;
75 }
76 elsif ( $ARGV[$i] eq "-b" )
77 {
78 push @bamFiles, ( split /,/, $ARGV[$i + 1] ) ;
79 ++$i ;
80 }
81 elsif ( $ARGV[$i] eq "-s" )
82 {
83 $spliceFile = $ARGV[$i + 1] ;
84 ++$i ;
85 }
86 elsif ( $ARGV[$i] eq "-o" )
87 {
88 my $o = $ARGV[$i + 1] ;
89 # Parse the -o option to get the folder and prefix
90 if ( $o =~ /\// )
91 {
92 if ( substr( $o, -1 ) ne "/" ) # file prefix is in -o
93 {
94 my @cols = split /\/+/, $o ;
95 $prefix = $cols[-1] ;
96 $outdir = join( "/", @cols[0..($#cols-1)] ) ;
97 }
98 else # -o only specifies path
99 {
100 $outdir = substr( $o, 0, length( $o ) - 1 ) ;
101 }
102 }
103 else
104 {
105 $prefix = $o ;
106 }
107
108 if ( substr( $prefix, -1 ) ne "_" )
109 {
110 $prefix .= "_" ;
111 }
112
113 ++$i ;
114 }
115 elsif ( $ARGV[$i] eq "--stage" )
116 {
117 $stage = $ARGV[$i + 1] ;
118 ++$i ;
119 }
120 elsif ( $ARGV[ $i ] eq "-p" )
121 {
122 $numThreads = $ARGV[$i + 1] ;
123 $classesOpt .= " -p $numThreads" ;
124 ++$i ;
125 }
126 elsif ( $ARGV[ $i ] eq "-c" )
127 {
128 $classesOpt .= " -c ".$ARGV[ $i + 1 ] ;
129 ++$i ;
130 }
131 elsif ( $ARGV[ $i ] eq "--mateIdx" )
132 {
133 $mateIdx = $ARGV[ $i + 1 ] ;
134 if ( $mateIdx == 1 )
135 {
136 $classesOpt .= " --hasMateIdSuffix" ;
137 $juncOpt .= " --hasMateIdSuffix" ;
138 }
139 ++$i ;
140 }
141 elsif ( $ARGV[$i] eq "--bamGroup" )
142 {
143 $bamGroup = $ARGV[$i + 1] ;
144 ++$i ;
145 }
146 elsif ( $ARGV[ $i ] eq "--sa" )
147 {
148 $trustSpliceOpt .= " -a ".$ARGV[$i + 1] ;
149 ++$i ;
150 }
151 elsif ( $ARGV[ $i ] eq "--vd" )
152 {
153 $voteOpt .= " -d ".$ARGV[$i + 1] ;
154 ++$i ;
155 }
156 elsif ( $ARGV[$i] eq "--primaryParalog" )
157 {
158 $classesOpt .= " --primaryParalog" ;
159 }
160 elsif ( $ARGV[$i] eq "--maxDpConstraintSize" )
161 {
162 $classesOpt .= " --maxDpConstraintSize ".$ARGV[$i + 1] ;
163 ++$i ;
164 }
165 elsif ( $ARGV[$i] eq "--version" )
166 {
167 die "PsiCLASS v1.0.1\n" ;
168 }
169 else
170 {
171 die "Unknown argument: ", $ARGV[$i], "\n" ;
172 }
173 }
174 if ( scalar( @bamFiles ) == 0 )
175 {
176 die "Must use option --lb to specify the list of bam files.\n" ;
177 }
178
179 if ( $mateIdx == -1 )
180 {
181 open FPsam, "$WD/samtools-0.1.19/samtools view ".$bamFiles[0]."| head -1000 |" ;
182 my $flag = 0 ;
183 while ( <FPsam> )
184 {
185 my @cols = split /\t/ ;
186 my $id = $cols[0] ;
187
188 if ( !( $id =~ /[\.|\/][1|2]$/ ) )
189 {
190 $flag = 1 ;
191 last ;
192 }
193 }
194 close FPsam ;
195
196 if ( $flag == 0 )
197 {
198 print "Found mate read id index suffix(.1 or /1). Calling \"--mateIdx 1\" option. If this is a false calling, please use \"--mateIdx 0\".\n" ;
199 $classesOpt .= " --hasMateIdSuffix" ;
200 $juncOpt .= " --hasMateIdSuffix" ;
201
202 }
203 }
204
205 mkdir $outdir if ( !-d $outdir ) ;
206 mkdir "$outdir/splice" if ( !-d "$outdir/splice" ) ;
207 mkdir "$outdir/subexon" if ( !-d "$outdir/subexon" ) ;
208
209
210 my $threadLock : shared ;
211 my @sharedFiles : shared ;
212 my @threads ;
213 for ( $i = 0 ; $i < $numThreads ; ++$i )
214 {
215 push @threads, $i ;
216 }
217
218 sub threadRunSplice
219 {
220 my $tid = threads->tid() - 1 ;
221 my $i ;
222 for ( $i = 0 ; $i < scalar( @bamFiles ) ; ++$i )
223 {
224 next if ( ( $i % $numThreads ) != $tid ) ;
225 system_call( "$WD/junc ".$bamFiles[$i]." -a $juncOpt > $outdir/splice/${prefix}bam_$i.raw_splice" ) ;
226 }
227 }
228
229 # Generate the splice file for each bam file.
230 if ( $stage <= 0 )
231 {
232 if ( $numThreads == 1 )
233 {
234 for ( $i = 0 ; $i < @bamFiles ; ++$i )
235 {
236 system_call( "$WD/junc ".$bamFiles[$i]." -a $juncOpt > $outdir/splice/${prefix}bam_$i.raw_splice" ) ;
237 #if ( $spliceFile ne "" )
238 #{
239 # system_call( "perl $WD/ManipulateIntronFile.pl $spliceFile ${prefix}bam_$i.raw_splice > ${prefix}bam_$i.splice" ) ;
240 #}
241 #else
242 #{
243 #system_call( "awk \'{if (\$6>1) print;}\' ${prefix}bam_$i.raw_splice > ${prefix}bam_$i.splice" ) ;
244 # system_call( "mv ${prefix}bam_$i.raw_splice ${prefix}bam_$i.splice" ) ;
245 #}
246 }
247 }
248 else
249 {
250 foreach ( @threads )
251 {
252 $_ = threads->create( \&threadRunSplice ) ;
253 }
254 foreach ( @threads )
255 {
256 $_->join() ;
257 }
258 }
259
260 if ( $bamGroup eq "" || $spliceFile ne "" )
261 {
262 open FPls, ">$outdir/splice/${prefix}splice.list" ;
263 for ( $i = 0 ; $i < @bamFiles ; ++$i )
264 {
265 print FPls "$outdir/splice/${prefix}bam_$i.raw_splice\n" ;
266 }
267 close FPls ;
268
269 if ( $spliceFile ne "" )
270 {
271 for ( $i = 0 ; $i < @bamFiles ; ++$i )
272 {
273 system_call( "perl $WD/FilterSplice.pl $outdir/splice/${prefix}bam_$i.raw_splice $spliceFile > $outdir/splice/${prefix}bam_$i.splice" ) ;
274 }
275 }
276 else
277 {
278 system_call( "$WD/trust-splice $outdir/splice/${prefix}splice.list ". $bamFiles[0] ." $trustSpliceOpt > $outdir/splice/${prefix}bam.trusted_splice" ) ;
279 for ( $i = 0 ; $i < @bamFiles ; ++$i )
280 {
281 system_call( "perl $WD/FilterSplice.pl $outdir/splice/${prefix}bam_$i.raw_splice $outdir/splice/${prefix}bam.trusted_splice > $outdir/splice/${prefix}bam_$i.splice" ) ;
282 }
283 }
284 }
285 else
286 {
287 my @bamToGroupId ;
288 my %groupNameToId ;
289 open FPbg, $bamGroup ;
290 my $groupUsed = 0 ;
291 while (<FPbg>)
292 {
293 chomp ;
294 if ( !defined $groupNameToId{$_} )
295 {
296 $groupNameToId{$_} = $groupUsed ;
297 ++$groupUsed ;
298 }
299 push @bamToGroupId, $groupNameToId{$_} ;
300 }
301 close FPbg ;
302
303 # Process each group one by one
304 my $group ;
305 for ( $group = 0 ; $group < $groupUsed ; ++$group )
306 {
307 open FPls, ">$outdir/splice/${prefix}splice_$group.list" ;
308 for ( $i = 0 ; $i < @bamFiles ; ++$i )
309 {
310 next if ( $bamToGroupId[$i] != $group ) ;
311 print FPls "$outdir/splice/${prefix}bam_$i.raw_splice\n" ;
312 }
313 close FPls ;
314
315 system_call( "$WD/trust-splice $outdir/splice/${prefix}splice_$group.list ". $bamFiles[0] ." $trustSpliceOpt > $outdir/splice/${prefix}bam_$group.trusted_splice" ) ;
316 for ( $i = 0 ; $i < @bamFiles ; ++$i )
317 {
318 next if ( $bamToGroupId[$i] != $group ) ;
319 system_call( "perl $WD/FilterSplice.pl $outdir/splice/${prefix}bam_$i.raw_splice $outdir/splice/${prefix}bam_$group.trusted_splice > $outdir/splice/${prefix}bam_$i.splice" ) ;
320 }
321 }
322
323 }
324 }
325
326
327 # Get subexons from each bam file
328 sub threadRunSubexonInfo
329 {
330 my $tidAdjust = 0 ;
331 $tidAdjust = $numThreads if ( $stage < 1 ) ;
332 my $tid = threads->tid() - $tidAdjust - 1 ;
333 my $i ;
334 for ( $i = 0 ; $i < scalar( @bamFiles ) ; ++$i )
335 {
336 next if ( ( $i % $numThreads ) != $tid ) ;
337 system_call( "$WD/subexon-info ".$bamFiles[$i]." $outdir/splice/${prefix}bam_$i.splice > $outdir/subexon/${prefix}subexon_$i.out" ) ;
338 }
339 }
340
341
342 if ( $stage <= 1 )
343 {
344 if ( $numThreads == 1 )
345 {
346 for ( $i = 0 ; $i < @bamFiles ; ++$i )
347 {
348 system_call( "$WD/subexon-info ".$bamFiles[$i]." $outdir/splice/${prefix}bam_$i.splice > $outdir/subexon/${prefix}subexon_$i.out" ) ;
349 }
350 }
351 else
352 {
353 #print "hi ", scalar( @threads ), " ", scalar( @bamFiles), "\n" ;
354 foreach ( @threads )
355 {
356 $_ = threads->create( \&threadRunSubexonInfo ) ;
357 }
358 foreach ( @threads )
359 {
360 $_->join() ;
361 }
362 }
363
364 open FPls, ">$outdir/subexon/${prefix}subexon.list" ;
365 for ( $i = 0 ; $i < @bamFiles ; ++$i )
366 {
367 print FPls "$outdir/subexon/${prefix}subexon_$i.out\n" ;
368 }
369 close FPls ;
370 }
371
372 # combine the subexons.
373 if ( $stage <= 2 )
374 {
375 $cmd = "$WD/combine-subexons --ls $outdir/subexon/${prefix}subexon.list > $outdir/subexon/${prefix}subexon_combined.out" ;
376 system_call( "$cmd" ) ;
377 }
378
379 # Run classes
380 if ( $stage <= 3 )
381 {
382 my $trimPrefix = substr( $prefix, 0, -1 ) ;
383 my $bamPath = "" ;
384 if ( $bamFileList ne "" )
385 {
386 $bamPath = " --lb $bamFileList " ;
387 }
388 else
389 {
390 foreach my $b (@bamFiles)
391 {
392 $bamPath .= " -b $b " ;
393 }
394 }
395 $cmd = "$WD/classes $classesOpt $bamPath -s $outdir/subexon/${prefix}subexon_combined.out -o $outdir/${trimPrefix} > $outdir/${prefix}classes.log" ;
396 system_call( "$cmd" ) ;
397 }
398
399 # Run voting
400 if ( $stage <= 4 )
401 {
402 open FPgtflist, ">$outdir/${prefix}gtf.list" ;
403 for ( $i = 0 ; $i < @bamFiles ; ++$i )
404 {
405 print FPgtflist "$outdir/${prefix}sample_${i}.gtf\n" ;
406 }
407 close FPgtflist ;
408 $cmd = "$WD/vote-transcripts --lg $outdir/${prefix}gtf.list $voteOpt > $outdir/${prefix}vote.gtf" ;
409 system_call( "$cmd" ) ;
410 }
411