Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/psiclass @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 #!/usr/bin/env perl | |
2 | |
3 use strict ; | |
4 use warnings ; | |
5 | |
6 use Cwd 'cwd' ; | |
7 use Cwd 'abs_path' ; | |
8 use File::Basename; | |
9 use threads ; | |
10 use threads::shared ; | |
11 | |
12 die "Usage: ./psiclass [OPTIONS]\n". | |
13 "Required:\n". | |
14 "\t-b STRING: paths to the alignment BAM files. Use comma to separate multiple BAM files\n". | |
15 "\t\tor\n". | |
16 "\t--lb STRING: path to the file listing the alignments BAM files\n". | |
17 "Optional:\n". | |
18 "\t-s STRING: path to the trusted splice file (default: not used)\n". | |
19 "\t-o STRING: prefix of output files (default: ./psiclass)\n". | |
20 "\t-p INT: number of processes/threads (default: 1)\n". | |
21 "\t-c FLOAT: only use the subexons with classifier score <= than the given number (default: 0.05)\n". | |
22 "\t--sa FLOAT: the minimum average number of supported read for retained introns (default: 0.5)\n". | |
23 "\t--vd FLOAT : the minimum average coverage depth of a transcript to be reported (defaults: 1.0)\n". | |
24 "\t--maxDpConstraintSize: the number of subexons a constraint can cover in DP. (default: 7. -1 for inf)\n". | |
25 "\t--bamGroup STRING: path to the file listing the group id of BAMs in the --lb file (default: not used)\n". | |
26 "\t--primaryParalog: use primary alignment to retain paralog genes (default: use unique alignments)\n". | |
27 #"\t--mateIdx INT: the read id has suffix such as .1, .2 for a mate pair. (default: auto)\n". | |
28 "\t--version: print version and exit\n". | |
29 "\t--stage INT: (default: 0)\n". | |
30 "\t\t0-start from beginning - building splice sites for each sample\n". | |
31 "\t\t1-start from building subexon files for each sample\n". | |
32 "\t\t2-start from combining subexon files across samples\n". | |
33 "\t\t3-start from assembling the transcripts for each sample\n". | |
34 "\t\t4-start from voting the consensus transcripts across samples\n" | |
35 if ( @ARGV == 0 ) ; | |
36 | |
37 my $WD = dirname( abs_path( $0 ) ) ; | |
38 | |
39 my $i ; | |
40 my $cmd ; | |
41 my $prefix = "psiclass_" ; | |
42 my $numThreads = 1 ; | |
43 | |
44 sub system_call | |
45 { | |
46 print $_[0], "\n" ; | |
47 system( $_[0] ) == 0 or die "Terminated\n" ; | |
48 } | |
49 | |
50 # Process the arguments | |
51 my @bamFiles : shared ; | |
52 my $spliceFile = "" ; | |
53 my $bamFileList = "" ; | |
54 my $stage = 0 ; | |
55 my $classesOpt = "" ; | |
56 my $juncOpt = "" ; | |
57 my $trustSpliceOpt = "" ; | |
58 my $voteOpt = "" ; | |
59 my $outdir = "." ; | |
60 my $mateIdx = -1 ; | |
61 my $spliceAvgSupport = 0.5 ; | |
62 my $bamGroup = "" ; | |
63 for ( $i = 0 ; $i < @ARGV ; ++$i ) | |
64 { | |
65 if ( $ARGV[$i] eq "--lb" ) | |
66 { | |
67 $bamFileList = $ARGV[$i + 1] ; | |
68 open FP1, $ARGV[$i + 1] ; | |
69 while ( <FP1> ) | |
70 { | |
71 chomp ; | |
72 push @bamFiles, $_ ; | |
73 } | |
74 ++$i ; | |
75 } | |
76 elsif ( $ARGV[$i] eq "-b" ) | |
77 { | |
78 push @bamFiles, ( split /,/, $ARGV[$i + 1] ) ; | |
79 ++$i ; | |
80 } | |
81 elsif ( $ARGV[$i] eq "-s" ) | |
82 { | |
83 $spliceFile = $ARGV[$i + 1] ; | |
84 ++$i ; | |
85 } | |
86 elsif ( $ARGV[$i] eq "-o" ) | |
87 { | |
88 my $o = $ARGV[$i + 1] ; | |
89 # Parse the -o option to get the folder and prefix | |
90 if ( $o =~ /\// ) | |
91 { | |
92 if ( substr( $o, -1 ) ne "/" ) # file prefix is in -o | |
93 { | |
94 my @cols = split /\/+/, $o ; | |
95 $prefix = $cols[-1] ; | |
96 $outdir = join( "/", @cols[0..($#cols-1)] ) ; | |
97 } | |
98 else # -o only specifies path | |
99 { | |
100 $outdir = substr( $o, 0, length( $o ) - 1 ) ; | |
101 } | |
102 } | |
103 else | |
104 { | |
105 $prefix = $o ; | |
106 } | |
107 | |
108 if ( substr( $prefix, -1 ) ne "_" ) | |
109 { | |
110 $prefix .= "_" ; | |
111 } | |
112 | |
113 ++$i ; | |
114 } | |
115 elsif ( $ARGV[$i] eq "--stage" ) | |
116 { | |
117 $stage = $ARGV[$i + 1] ; | |
118 ++$i ; | |
119 } | |
120 elsif ( $ARGV[ $i ] eq "-p" ) | |
121 { | |
122 $numThreads = $ARGV[$i + 1] ; | |
123 $classesOpt .= " -p $numThreads" ; | |
124 ++$i ; | |
125 } | |
126 elsif ( $ARGV[ $i ] eq "-c" ) | |
127 { | |
128 $classesOpt .= " -c ".$ARGV[ $i + 1 ] ; | |
129 ++$i ; | |
130 } | |
131 elsif ( $ARGV[ $i ] eq "--mateIdx" ) | |
132 { | |
133 $mateIdx = $ARGV[ $i + 1 ] ; | |
134 if ( $mateIdx == 1 ) | |
135 { | |
136 $classesOpt .= " --hasMateIdSuffix" ; | |
137 $juncOpt .= " --hasMateIdSuffix" ; | |
138 } | |
139 ++$i ; | |
140 } | |
141 elsif ( $ARGV[$i] eq "--bamGroup" ) | |
142 { | |
143 $bamGroup = $ARGV[$i + 1] ; | |
144 ++$i ; | |
145 } | |
146 elsif ( $ARGV[ $i ] eq "--sa" ) | |
147 { | |
148 $trustSpliceOpt .= " -a ".$ARGV[$i + 1] ; | |
149 ++$i ; | |
150 } | |
151 elsif ( $ARGV[ $i ] eq "--vd" ) | |
152 { | |
153 $voteOpt .= " -d ".$ARGV[$i + 1] ; | |
154 ++$i ; | |
155 } | |
156 elsif ( $ARGV[$i] eq "--primaryParalog" ) | |
157 { | |
158 $classesOpt .= " --primaryParalog" ; | |
159 } | |
160 elsif ( $ARGV[$i] eq "--maxDpConstraintSize" ) | |
161 { | |
162 $classesOpt .= " --maxDpConstraintSize ".$ARGV[$i + 1] ; | |
163 ++$i ; | |
164 } | |
165 elsif ( $ARGV[$i] eq "--version" ) | |
166 { | |
167 die "PsiCLASS v1.0.1\n" ; | |
168 } | |
169 else | |
170 { | |
171 die "Unknown argument: ", $ARGV[$i], "\n" ; | |
172 } | |
173 } | |
174 if ( scalar( @bamFiles ) == 0 ) | |
175 { | |
176 die "Must use option --lb to specify the list of bam files.\n" ; | |
177 } | |
178 | |
179 if ( $mateIdx == -1 ) | |
180 { | |
181 open FPsam, "$WD/samtools-0.1.19/samtools view ".$bamFiles[0]."| head -1000 |" ; | |
182 my $flag = 0 ; | |
183 while ( <FPsam> ) | |
184 { | |
185 my @cols = split /\t/ ; | |
186 my $id = $cols[0] ; | |
187 | |
188 if ( !( $id =~ /[\.|\/][1|2]$/ ) ) | |
189 { | |
190 $flag = 1 ; | |
191 last ; | |
192 } | |
193 } | |
194 close FPsam ; | |
195 | |
196 if ( $flag == 0 ) | |
197 { | |
198 print "Found mate read id index suffix(.1 or /1). Calling \"--mateIdx 1\" option. If this is a false calling, please use \"--mateIdx 0\".\n" ; | |
199 $classesOpt .= " --hasMateIdSuffix" ; | |
200 $juncOpt .= " --hasMateIdSuffix" ; | |
201 | |
202 } | |
203 } | |
204 | |
205 mkdir $outdir if ( !-d $outdir ) ; | |
206 mkdir "$outdir/splice" if ( !-d "$outdir/splice" ) ; | |
207 mkdir "$outdir/subexon" if ( !-d "$outdir/subexon" ) ; | |
208 | |
209 | |
210 my $threadLock : shared ; | |
211 my @sharedFiles : shared ; | |
212 my @threads ; | |
213 for ( $i = 0 ; $i < $numThreads ; ++$i ) | |
214 { | |
215 push @threads, $i ; | |
216 } | |
217 | |
218 sub threadRunSplice | |
219 { | |
220 my $tid = threads->tid() - 1 ; | |
221 my $i ; | |
222 for ( $i = 0 ; $i < scalar( @bamFiles ) ; ++$i ) | |
223 { | |
224 next if ( ( $i % $numThreads ) != $tid ) ; | |
225 system_call( "$WD/junc ".$bamFiles[$i]." -a $juncOpt > $outdir/splice/${prefix}bam_$i.raw_splice" ) ; | |
226 } | |
227 } | |
228 | |
229 # Generate the splice file for each bam file. | |
230 if ( $stage <= 0 ) | |
231 { | |
232 if ( $numThreads == 1 ) | |
233 { | |
234 for ( $i = 0 ; $i < @bamFiles ; ++$i ) | |
235 { | |
236 system_call( "$WD/junc ".$bamFiles[$i]." -a $juncOpt > $outdir/splice/${prefix}bam_$i.raw_splice" ) ; | |
237 #if ( $spliceFile ne "" ) | |
238 #{ | |
239 # system_call( "perl $WD/ManipulateIntronFile.pl $spliceFile ${prefix}bam_$i.raw_splice > ${prefix}bam_$i.splice" ) ; | |
240 #} | |
241 #else | |
242 #{ | |
243 #system_call( "awk \'{if (\$6>1) print;}\' ${prefix}bam_$i.raw_splice > ${prefix}bam_$i.splice" ) ; | |
244 # system_call( "mv ${prefix}bam_$i.raw_splice ${prefix}bam_$i.splice" ) ; | |
245 #} | |
246 } | |
247 } | |
248 else | |
249 { | |
250 foreach ( @threads ) | |
251 { | |
252 $_ = threads->create( \&threadRunSplice ) ; | |
253 } | |
254 foreach ( @threads ) | |
255 { | |
256 $_->join() ; | |
257 } | |
258 } | |
259 | |
260 if ( $bamGroup eq "" || $spliceFile ne "" ) | |
261 { | |
262 open FPls, ">$outdir/splice/${prefix}splice.list" ; | |
263 for ( $i = 0 ; $i < @bamFiles ; ++$i ) | |
264 { | |
265 print FPls "$outdir/splice/${prefix}bam_$i.raw_splice\n" ; | |
266 } | |
267 close FPls ; | |
268 | |
269 if ( $spliceFile ne "" ) | |
270 { | |
271 for ( $i = 0 ; $i < @bamFiles ; ++$i ) | |
272 { | |
273 system_call( "perl $WD/FilterSplice.pl $outdir/splice/${prefix}bam_$i.raw_splice $spliceFile > $outdir/splice/${prefix}bam_$i.splice" ) ; | |
274 } | |
275 } | |
276 else | |
277 { | |
278 system_call( "$WD/trust-splice $outdir/splice/${prefix}splice.list ". $bamFiles[0] ." $trustSpliceOpt > $outdir/splice/${prefix}bam.trusted_splice" ) ; | |
279 for ( $i = 0 ; $i < @bamFiles ; ++$i ) | |
280 { | |
281 system_call( "perl $WD/FilterSplice.pl $outdir/splice/${prefix}bam_$i.raw_splice $outdir/splice/${prefix}bam.trusted_splice > $outdir/splice/${prefix}bam_$i.splice" ) ; | |
282 } | |
283 } | |
284 } | |
285 else | |
286 { | |
287 my @bamToGroupId ; | |
288 my %groupNameToId ; | |
289 open FPbg, $bamGroup ; | |
290 my $groupUsed = 0 ; | |
291 while (<FPbg>) | |
292 { | |
293 chomp ; | |
294 if ( !defined $groupNameToId{$_} ) | |
295 { | |
296 $groupNameToId{$_} = $groupUsed ; | |
297 ++$groupUsed ; | |
298 } | |
299 push @bamToGroupId, $groupNameToId{$_} ; | |
300 } | |
301 close FPbg ; | |
302 | |
303 # Process each group one by one | |
304 my $group ; | |
305 for ( $group = 0 ; $group < $groupUsed ; ++$group ) | |
306 { | |
307 open FPls, ">$outdir/splice/${prefix}splice_$group.list" ; | |
308 for ( $i = 0 ; $i < @bamFiles ; ++$i ) | |
309 { | |
310 next if ( $bamToGroupId[$i] != $group ) ; | |
311 print FPls "$outdir/splice/${prefix}bam_$i.raw_splice\n" ; | |
312 } | |
313 close FPls ; | |
314 | |
315 system_call( "$WD/trust-splice $outdir/splice/${prefix}splice_$group.list ". $bamFiles[0] ." $trustSpliceOpt > $outdir/splice/${prefix}bam_$group.trusted_splice" ) ; | |
316 for ( $i = 0 ; $i < @bamFiles ; ++$i ) | |
317 { | |
318 next if ( $bamToGroupId[$i] != $group ) ; | |
319 system_call( "perl $WD/FilterSplice.pl $outdir/splice/${prefix}bam_$i.raw_splice $outdir/splice/${prefix}bam_$group.trusted_splice > $outdir/splice/${prefix}bam_$i.splice" ) ; | |
320 } | |
321 } | |
322 | |
323 } | |
324 } | |
325 | |
326 | |
327 # Get subexons from each bam file | |
328 sub threadRunSubexonInfo | |
329 { | |
330 my $tidAdjust = 0 ; | |
331 $tidAdjust = $numThreads if ( $stage < 1 ) ; | |
332 my $tid = threads->tid() - $tidAdjust - 1 ; | |
333 my $i ; | |
334 for ( $i = 0 ; $i < scalar( @bamFiles ) ; ++$i ) | |
335 { | |
336 next if ( ( $i % $numThreads ) != $tid ) ; | |
337 system_call( "$WD/subexon-info ".$bamFiles[$i]." $outdir/splice/${prefix}bam_$i.splice > $outdir/subexon/${prefix}subexon_$i.out" ) ; | |
338 } | |
339 } | |
340 | |
341 | |
342 if ( $stage <= 1 ) | |
343 { | |
344 if ( $numThreads == 1 ) | |
345 { | |
346 for ( $i = 0 ; $i < @bamFiles ; ++$i ) | |
347 { | |
348 system_call( "$WD/subexon-info ".$bamFiles[$i]." $outdir/splice/${prefix}bam_$i.splice > $outdir/subexon/${prefix}subexon_$i.out" ) ; | |
349 } | |
350 } | |
351 else | |
352 { | |
353 #print "hi ", scalar( @threads ), " ", scalar( @bamFiles), "\n" ; | |
354 foreach ( @threads ) | |
355 { | |
356 $_ = threads->create( \&threadRunSubexonInfo ) ; | |
357 } | |
358 foreach ( @threads ) | |
359 { | |
360 $_->join() ; | |
361 } | |
362 } | |
363 | |
364 open FPls, ">$outdir/subexon/${prefix}subexon.list" ; | |
365 for ( $i = 0 ; $i < @bamFiles ; ++$i ) | |
366 { | |
367 print FPls "$outdir/subexon/${prefix}subexon_$i.out\n" ; | |
368 } | |
369 close FPls ; | |
370 } | |
371 | |
372 # combine the subexons. | |
373 if ( $stage <= 2 ) | |
374 { | |
375 $cmd = "$WD/combine-subexons --ls $outdir/subexon/${prefix}subexon.list > $outdir/subexon/${prefix}subexon_combined.out" ; | |
376 system_call( "$cmd" ) ; | |
377 } | |
378 | |
379 # Run classes | |
380 if ( $stage <= 3 ) | |
381 { | |
382 my $trimPrefix = substr( $prefix, 0, -1 ) ; | |
383 my $bamPath = "" ; | |
384 if ( $bamFileList ne "" ) | |
385 { | |
386 $bamPath = " --lb $bamFileList " ; | |
387 } | |
388 else | |
389 { | |
390 foreach my $b (@bamFiles) | |
391 { | |
392 $bamPath .= " -b $b " ; | |
393 } | |
394 } | |
395 $cmd = "$WD/classes $classesOpt $bamPath -s $outdir/subexon/${prefix}subexon_combined.out -o $outdir/${trimPrefix} > $outdir/${prefix}classes.log" ; | |
396 system_call( "$cmd" ) ; | |
397 } | |
398 | |
399 # Run voting | |
400 if ( $stage <= 4 ) | |
401 { | |
402 open FPgtflist, ">$outdir/${prefix}gtf.list" ; | |
403 for ( $i = 0 ; $i < @bamFiles ; ++$i ) | |
404 { | |
405 print FPgtflist "$outdir/${prefix}sample_${i}.gtf\n" ; | |
406 } | |
407 close FPgtflist ; | |
408 $cmd = "$WD/vote-transcripts --lg $outdir/${prefix}gtf.list $voteOpt > $outdir/${prefix}vote.gtf" ; | |
409 system_call( "$cmd" ) ; | |
410 } | |
411 |