Mercurial > repos > mini > strelka
comparison configureStrelkaWorkflow.pl @ 6:87568e5a7d4f
Testing strelka version 0.0.1
author | mini |
---|---|
date | Fri, 26 Sep 2014 13:24:13 +0200 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:07cbbd662111 | 6:87568e5a7d4f |
---|---|
1 #!/usr/bin/env perl | |
2 | |
3 =head1 LICENSE | |
4 | |
5 Strelka Workflow Software | |
6 Copyright (c) 2009-2013 Illumina, Inc. | |
7 | |
8 This software is provided under the terms and conditions of the | |
9 Illumina Open Source Software License 1. | |
10 | |
11 You should have received a copy of the Illumina Open Source | |
12 Software License 1 along with this program. If not, see | |
13 <https://github.com/downloads/sequencing/licenses/>. | |
14 | |
15 =head1 SYNOPSIS | |
16 | |
17 configureStrelkaWorkflow.pl --tumor FILE --normal FILE --ref FILE --config FILE [options] | |
18 | |
19 This script configures the strelka workflow for somatic variant | |
20 calling on matched tumor-normal BAM files. The configuration process | |
21 will produce an analysis makefile and directory structure. The | |
22 makefile can be used to run the analysis on a workstation or compute | |
23 cluster via make/qmake or other makefile compatible process. | |
24 | |
25 =head1 ARGUMENTS | |
26 | |
27 =over 4 | |
28 | |
29 =item --tumor FILE | |
30 | |
31 Path to tumor sample BAM file (required) | |
32 | |
33 =item --normal FILE | |
34 | |
35 Path to normal sample BAM file (required) | |
36 | |
37 =item --ref FILE | |
38 | |
39 Path to reference genome fasta (required) | |
40 | |
41 =item --config FILE | |
42 | |
43 Strelka configuration file. Default config files can be found in | |
44 ${STRELKA_INSTALL_ROOT}/etc/ for both ELAND and BWA alignments. (required) | |
45 | |
46 =back | |
47 | |
48 =head1 OPTIONS | |
49 | |
50 =over 4 | |
51 | |
52 =item --output-dir DIRECTORY | |
53 | |
54 Root of the analysis directory. This script will place all | |
55 configuration files in the analysis directory, and after configuration | |
56 all results and intermediate files will be written within the analysis | |
57 directory during a run. This directory must not already | |
58 exist. (default: ./strelkaAnalysis) | |
59 | |
60 =back | |
61 | |
62 =cut | |
63 | |
64 use warnings FATAL => 'all'; | |
65 use strict; | |
66 | |
67 use Carp; | |
68 $SIG{__DIE__} = \&Carp::confess; | |
69 | |
70 use Cwd qw(getcwd); | |
71 use File::Spec; | |
72 use Getopt::Long; | |
73 use Pod::Usage; | |
74 | |
75 my $baseDir; | |
76 my $libDir; | |
77 BEGIN { | |
78 my $thisDir=(File::Spec->splitpath($0))[1]; | |
79 $baseDir=$thisDir;#$baseDir=File::Spec->catdir($thisDir,File::Spec->updir()); | |
80 $libDir=File::Spec->catdir($baseDir,'lib'); | |
81 } | |
82 use lib $libDir; | |
83 use Utils; | |
84 | |
85 if(getAbsPath($baseDir)) { | |
86 errorX("Can't resolve path for strelka_workflow install directory: '$baseDir'"); | |
87 } | |
88 my $libexecDir=File::Spec->catdir($baseDir,'libexec'); | |
89 #my $optDir=File::Spec->catdir($baseDir,'opt'); | |
90 | |
91 | |
92 my $scriptName=(File::Spec->splitpath($0))[2]; | |
93 my $argCount=scalar(@ARGV); | |
94 my $cmdline = join(' ',$0,@ARGV); | |
95 | |
96 | |
97 sub usage() { pod2usage(-verbose => 1, | |
98 -exitval => 2); } | |
99 | |
100 # | |
101 # user configuration: | |
102 # | |
103 | |
104 my ($tumorBam, $normalBam, $refFile, $configFile, $outDir); | |
105 my $help; | |
106 | |
107 GetOptions( "tumor=s" => \$tumorBam, | |
108 "normal=s" => \$normalBam, | |
109 "ref=s" => \$refFile, | |
110 "config=s" => \$configFile, | |
111 "output-dir=s" => \$outDir, | |
112 "help|h" => \$help) || usage(); | |
113 | |
114 usage() if($help); | |
115 usage() unless($argCount); | |
116 | |
117 | |
118 # | |
119 # Validate input conditions: | |
120 # | |
121 | |
122 sub checkFileArg($$) { | |
123 my ($file,$label) = @_; | |
124 | |
125 errorX("Must specify $label file") unless(defined($file)); #raise an error if file not defined | |
126 checkFile($file,$label); | |
127 } | |
128 | |
129 checkFileArg($tumorBam,"tumor BAM"); | |
130 checkFileArg($normalBam,"normal BAM"); | |
131 checkFileArg($refFile,"reference fasta"); | |
132 checkFileArg($configFile,"configuration ini"); | |
133 | |
134 sub makeAbsoluteFilePaths(\$) { | |
135 my ($filePathRef) = @_; | |
136 | |
137 my ($v,$fileDir,$fileName) = File::Spec->splitpath($$filePathRef); | |
138 if(getAbsPath($fileDir)) { | |
139 errorX("Can't resolve directory path for '$fileDir' from input file argument: '$$filePathRef'"); | |
140 } | |
141 $$filePathRef = File::Spec->catfile($fileDir,$fileName); | |
142 } | |
143 | |
144 makeAbsoluteFilePaths($tumorBam); | |
145 makeAbsoluteFilePaths($normalBam); | |
146 makeAbsoluteFilePaths($refFile); | |
147 makeAbsoluteFilePaths($configFile); | |
148 | |
149 # also check for BAM index files: | |
150 sub checkBamIndex($) { | |
151 my ($file) = @_; | |
152 my $ifile = $file . ".bai"; | |
153 if(! -f $ifile) { | |
154 errorX("Can't find index for BAM file '$file'"); | |
155 } | |
156 } | |
157 | |
158 checkBamIndex($tumorBam); | |
159 checkBamIndex($normalBam); | |
160 | |
161 | |
162 sub checkFaIndex($) { | |
163 my ($file) = @_; | |
164 my $ifile = $file . ".fai"; | |
165 if(! -f $ifile) { | |
166 errorX("Can't find index for fasta file '$file'"); | |
167 } | |
168 # check that fai file isn't improperly formatted (a la the GATK bundle NCBI 37 fai files) | |
169 open(my $FH,"< $ifile") || errorX("Can't open fai file '$ifile'"); | |
170 my $lineno=1; | |
171 while(<$FH>) { | |
172 chomp; | |
173 my @F=split(); | |
174 if(scalar(@F) != 5) { | |
175 errorX("Unexpected format for line number '$lineno' of fasta index file: '$ifile'\n\tRe-running fasta indexing may fix the issue. To do so, run: \"samtools faidx $file\""); | |
176 } | |
177 $lineno++; | |
178 } | |
179 close($FH); | |
180 } | |
181 | |
182 checkFaIndex($refFile); | |
183 | |
184 | |
185 if(defined($outDir)) { | |
186 if(getAbsPath($outDir)) { | |
187 errorX("Can't resolve path for ouput directory: '$outDir'"); | |
188 } | |
189 } else { | |
190 $outDir=File::Spec->catdir(Cwd::getcwd(),'strelkaAnalysis'); | |
191 } | |
192 | |
193 if(-e $outDir) { | |
194 errorX("Output path already exists: '$outDir'"); | |
195 } | |
196 | |
197 if(getAbsPath($baseDir)) { | |
198 errorX("Can't resolve path for strelka install directory: '$baseDir'"); | |
199 } | |
200 | |
201 #my $samtoolsDir = File::Spec->catdir($optDir,'samtools'); | |
202 checkDir($libexecDir,"strelka libexec"); | |
203 #checkDir($samtoolsDir,"samtools"); | |
204 | |
205 my $callScriptName = "callSomaticVariants.pl"; | |
206 my $filterScriptName = "filterSomaticVariants.pl"; | |
207 my $finishScriptName = "consolidateResults.pl"; | |
208 my $callScript = File::Spec->catfile($libexecDir,$callScriptName); | |
209 my $filterScript = File::Spec->catfile($libexecDir,$filterScriptName); | |
210 my $finishScript = File::Spec->catfile($libexecDir,$finishScriptName); | |
211 my $countFasta = File::Spec->catfile($libexecDir,"countFastaBases"); | |
212 #my $samtoolsBin = File::Spec->catfile($samtoolsDir,"samtools"); | |
213 checkFile($callScript,"somatic variant call script"); | |
214 checkFile($filterScript,"somatic variant filter script"); | |
215 checkFile($finishScript,"result consolidation script"); | |
216 checkFile($countFasta,"fasta scanner"); | |
217 #checkFile($samtoolsBin,"samtools"); | |
218 | |
219 # | |
220 # Configure bin runs: | |
221 # | |
222 checkMakeDir($outDir); | |
223 | |
224 # | |
225 # Configure bin runs: open and validate config ini | |
226 # | |
227 my $config = parseConfigIni($configFile); | |
228 | |
229 sub checkConfigKeys($) { | |
230 my ($keyref) = @_; | |
231 for (@$keyref) { | |
232 errorX("Undefined configuration option: '$_'") unless(defined($config->{user}{$_})); | |
233 } | |
234 } | |
235 | |
236 # these are the keys we need at configuration time: | |
237 my @config_keys = qw(binSize); | |
238 | |
239 # these are additional keys we will need to run the workflow: | |
240 # (note we don't check for (maxInputDepth,minTier2Mapq) for back compatibility with older config files) | |
241 my @workflow_keys = qw( | |
242 minTier1Mapq isWriteRealignedBam ssnvPrior sindelPrior ssnvNoise sindelNoise ssnvNoiseStrandBiasFrac | |
243 ssnvQuality_LowerBound sindelQuality_LowerBound isSkipDepthFilters depthFilterMultiple | |
244 snvMaxFilteredBasecallFrac snvMaxSpanningDeletionFrac indelMaxRefRepeat | |
245 indelMaxWindowFilteredBasecallFrac indelMaxIntHpolLength); | |
246 | |
247 checkConfigKeys(\@config_keys); | |
248 checkConfigKeys(\@workflow_keys); | |
249 | |
250 | |
251 my $binSize = int($config->{user}{binSize}); | |
252 | |
253 $config->{derived}{configurationCmdline} = $cmdline; | |
254 $config->{derived}{normalBam} = $normalBam; | |
255 $config->{derived}{tumorBam} = $tumorBam; | |
256 $config->{derived}{refFile} = $refFile; | |
257 $config->{derived}{outDir} = $outDir; | |
258 | |
259 # | |
260 # Configure bin runs: check for consistent chrom info between BAMs and reference | |
261 # | |
262 sub getBamChromInfo($) { | |
263 my $file = shift; | |
264 my $cmd = "samtools view -H $file |"; | |
265 open(my $FH,$cmd) || errorX("Can't open process $cmd"); | |
266 | |
267 my %info; | |
268 my $n=0; | |
269 while(<$FH>) { | |
270 next unless(/^\@SQ/); | |
271 chomp; | |
272 my @F = split(/\t/); | |
273 scalar(@F) >= 3 || errorX("Unexpected bam header for file '$file'"); | |
274 | |
275 my %h = (); | |
276 foreach (@F) { | |
277 my @vals = split(':'); | |
278 $h{$vals[0]} = $vals[1]; | |
279 } | |
280 $F[1] = $h{'SN'}; | |
281 $F[2] = $h{'LN'}; | |
282 | |
283 my $size = int($F[2]); | |
284 ($size > 0) || errorX("Unexpected chromosome size '$size' in bam header for file '$file'"); | |
285 $info{$F[1]}{size} = $size; | |
286 $info{$F[1]}{order} = $n; | |
287 $n++; | |
288 } | |
289 close($FH) || errorX("Can't close process $cmd"); | |
290 return %info; | |
291 } | |
292 | |
293 | |
294 my %chromInfo = getBamChromInfo($normalBam); | |
295 my @chroms = sort { $chromInfo{$a}{order} <=> $chromInfo{$b}{order} } (keys(%chromInfo)); | |
296 | |
297 { | |
298 #consistency check: | |
299 my %tumorChromInfo = getBamChromInfo($tumorBam); | |
300 for my $chrom (@chroms) { | |
301 my $ln = $chromInfo{$chrom}{size}; | |
302 my $tln = $tumorChromInfo{$chrom}{size}; | |
303 my $order = $chromInfo{$chrom}{order}; | |
304 my $torder = $tumorChromInfo{$chrom}{order}; | |
305 unless(defined($tln) && ($tln==$ln) && ($torder==$order)) { | |
306 errorX("Tumor and normal BAM file headers disagree on chromosome: '$chrom'"); | |
307 } | |
308 delete $tumorChromInfo{$chrom}; | |
309 } | |
310 for my $chrom (keys(%tumorChromInfo)) { | |
311 errorX("Tumor and normal BAM file headers disagree on chromosome: '$chrom'"); | |
312 } | |
313 } | |
314 | |
315 | |
316 my %refChromInfo; | |
317 logX("Scanning reference genome"); | |
318 { | |
319 my $knownGenomeSize=0; | |
320 my $cmd="$countFasta $refFile |"; | |
321 open(my $FFH,$cmd) || errorX("Failed to open process '$cmd'"); | |
322 | |
323 while(<$FFH>) { | |
324 chomp; | |
325 my @F = split(/\t/); | |
326 scalar(@F) == 4 || errorX("Unexpected value from '$cmd'"); | |
327 $knownGenomeSize += int($F[2]); | |
328 $refChromInfo{$F[1]}{knownSize} = int($F[2]); | |
329 $refChromInfo{$F[1]}{size} = int($F[3]); | |
330 } | |
331 close($FFH) || errorX("Failed to close process '$cmd'"); | |
332 | |
333 #consistency check: | |
334 for my $chrom (@chroms) { | |
335 my $ln = $chromInfo{$chrom}{size}; | |
336 my $rln = $refChromInfo{$chrom}{size}; | |
337 unless(defined($rln) && ($rln==$ln)) { | |
338 errorX("BAM headers and reference fasta disagree on chromosome: '$chrom'"); | |
339 } | |
340 $config->{derived}{"chrom_${chrom}_size"} = $rln; | |
341 $config->{derived}{"chrom_${chrom}_knownSize"} = $refChromInfo{$chrom}{knownSize}; | |
342 } | |
343 $config->{derived}{chromOrder} = join("\t",@chroms); | |
344 | |
345 $config->{derived}{knownGenomeSize} = $knownGenomeSize; | |
346 } | |
347 logX("Scanning reference genome complete"); | |
348 | |
349 | |
350 | |
351 # | |
352 # Configure bin runs: create directory structure | |
353 # | |
354 my $resultsDir = File::Spec->catdir($outDir,'results'); | |
355 checkMakeDir($resultsDir); | |
356 if($config->{user}{isWriteRealignedBam}) { | |
357 my $bamDir = File::Spec->catdir($outDir,'realigned'); | |
358 checkMakeDir($bamDir); | |
359 } | |
360 my $chromRootDir = File::Spec->catdir($outDir,'chromosomes'); | |
361 checkMakeDir($chromRootDir); | |
362 for my $chrom (@chroms) { | |
363 my $chromDir = File::Spec->catdir($chromRootDir,$chrom); | |
364 checkMakeDir($chromDir); | |
365 | |
366 my $chromRef = $chromInfo{$chrom}; | |
367 $chromRef->{dir} = $chromDir; | |
368 $chromRef->{binList} = getBinList($chromRef->{size},$binSize); | |
369 | |
370 my $binRootDir = File::Spec->catdir($chromDir,'bins'); | |
371 checkMakeDir($binRootDir); | |
372 | |
373 for my $binId ( @{$chromRef->{binList}} ) { | |
374 my $binDir = File::Spec->catdir($binRootDir,$binId); | |
375 checkMakeDir($binDir); | |
376 } | |
377 } | |
378 | |
379 | |
380 | |
381 # | |
382 # write run config file: | |
383 # | |
384 my $runConfigFile; | |
385 { | |
386 my $cstr = <<END; | |
387 ; | |
388 ; Strelka workflow configuration file | |
389 ; | |
390 ; This is an automatically generated file, you probably don't want to edit it. If starting a new run, | |
391 ; input configuration templates (with comments) can be found in the Strelka etc/ directory. | |
392 ; | |
393 END | |
394 | |
395 $cstr .= writeConfigIni($config); | |
396 | |
397 my $configDir = File::Spec->catdir($outDir,'config'); | |
398 checkMakeDir($configDir); | |
399 $runConfigFile = File::Spec->catdir($configDir,'run.config.ini'); | |
400 open(my $FH,"> $runConfigFile") || errorX("Can't open file '$runConfigFile'"); | |
401 print $FH $cstr; | |
402 close($FH); | |
403 } | |
404 | |
405 | |
406 | |
407 # | |
408 # create makefile | |
409 # | |
410 my $makeFile = File::Spec->catfile($outDir,"Makefile"); | |
411 open(my $MAKEFH, "> $makeFile") || errorX("Can't open file: '$makeFile'"); | |
412 | |
413 my $completeFile = "task.complete"; | |
414 | |
415 print $MAKEFH <<ENDE; | |
416 # This makefile was automatically generated by $scriptName | |
417 # | |
418 # Please do not edit. | |
419 | |
420 script_dir := $libexecDir | |
421 call_script := \$(script_dir)/$callScriptName | |
422 filter_script := \$(script_dir)/$filterScriptName | |
423 finish_script := \$(script_dir)/$finishScriptName | |
424 | |
425 config_file := $runConfigFile | |
426 | |
427 analysis_dir := $outDir | |
428 results_dir := \$(analysis_dir)/results | |
429 | |
430 ENDE | |
431 | |
432 print $MAKEFH <<'ENDE'; | |
433 | |
434 complete_tag := task.complete | |
435 | |
436 finish_task := $(analysis_dir)/$(complete_tag) | |
437 | |
438 get_chrom_dir = $(analysis_dir)/chromosomes/$1 | |
439 get_chrom_task = $(call get_chrom_dir,$1)/$(complete_tag) | |
440 get_bin_task = $(call get_chrom_dir,$1)/bins/$2/$(complete_tag) | |
441 | |
442 | |
443 | |
444 all: $(finish_task) | |
445 @$(print_success) | |
446 | |
447 | |
448 define print_success | |
449 echo;\ | |
450 echo Analysis complete. Final somatic calls can be found in $(results_dir);\ | |
451 echo | |
452 endef | |
453 | |
454 | |
455 # top level results target: | |
456 # | |
457 $(finish_task): | |
458 $(finish_script) --config=$(config_file) && touch $@ | |
459 | |
460 | |
461 # chromosome targets: | |
462 # | |
463 ENDE | |
464 | |
465 for my $chrom (@chroms) { | |
466 | |
467 print $MAKEFH <<ENDE; | |
468 chrom_${chrom}_task := \$(call get_chrom_task,$chrom) | |
469 \$(finish_task): \$(chrom_${chrom}_task) | |
470 \$(chrom_${chrom}_task): | |
471 \$(filter_script) --config=\$(config_file) --chrom=$chrom && touch \$@ | |
472 | |
473 ENDE | |
474 | |
475 } | |
476 | |
477 print $MAKEFH <<ENDE; | |
478 | |
479 # chromosome bin targets: | |
480 # | |
481 ENDE | |
482 | |
483 for my $chrom (@chroms) { | |
484 for my $bin (@{$chromInfo{$chrom}{binList}}) { | |
485 | |
486 print $MAKEFH <<ENDE; | |
487 chrom_${chrom}_bin_${bin}_task := \$(call get_bin_task,$chrom,$bin) | |
488 \$(chrom_${chrom}_task): \$(chrom_${chrom}_bin_${bin}_task) | |
489 \$(chrom_${chrom}_bin_${bin}_task): | |
490 \$(call_script) --config=\$(config_file) --chrom=$chrom --bin=$bin && touch \$@ | |
491 | |
492 ENDE | |
493 | |
494 } | |
495 } | |
496 | |
497 | |
498 # If the eval function is available, this is the way we could finish | |
499 # the makefile without being so verbose but it doesn't look like qmake | |
500 # understands this function. | |
501 | |
502 =cut | |
503 | |
504 print $MAKEFH <<ENDE; | |
505 | |
506 chroms := @chroms | |
507 | |
508 ENDE | |
509 | |
510 for my $chrom (@chroms) { | |
511 print $MAKEFH "${chrom}_bins := " . join(" ",@{$chromInfo{$chrom}{binList}}) . "\n"; | |
512 } | |
513 | |
514 print $MAKEFH <<'ENDE'; | |
515 | |
516 define chrom_task_template | |
517 chrom_$1_task := $(call get_chrom_task,$1) | |
518 $(finish_task): $$(chrom_$1_task) | |
519 $$(chrom_$1_task): | |
520 $$(filter_script) --config=$$(config_file) --chrom=$1 && touch $$@ | |
521 endef | |
522 | |
523 $(foreach c,$(chroms),$(eval $(call chrom_task_template,$c))) | |
524 | |
525 | |
526 # chromosome bin targets: | |
527 # | |
528 define chrom_bin_task_template | |
529 chrom_$1_bin_$2_task := $(call get_bin_task,$1,$2) | |
530 $$(chrom_$1_task): $$(chrom_$1_bin_$2_task) | |
531 $$(chrom_$1_bin_$2_task): | |
532 $$(call_script) --config=$$(config_file) --chrom=$1 --bin=$2 && touch $$@ | |
533 endef | |
534 | |
535 $(foreach c,$(chroms), \ | |
536 $(foreach b,$($c_bins),$(eval $(call chrom_bin_task_template,$c,$b))) \ | |
537 ) | |
538 | |
539 ENDE | |
540 | |
541 =cut | |
542 | |
543 | |
544 | |
545 print <<END; | |
546 | |
547 | |
548 Successfully configured analysis and created makefile '$makeFile'. | |
549 | |
550 To run the analysis locally using make, run: | |
551 | |
552 make -C $outDir | |
553 | |
554 ...or: | |
555 | |
556 cd $outDir | |
557 make | |
558 | |
559 END | |
560 | |
561 1; | |
562 | |
563 __END__ | |
564 |