Mercurial > repos > mini > strelka
view strelka2/configureStrelkaWorkflow.pl @ 0:7a9f20ca4ad5
Uploaded
author | mini |
---|---|
date | Thu, 25 Sep 2014 11:59:08 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env perl =head1 LICENSE Strelka Workflow Software Copyright (c) 2009-2013 Illumina, Inc. This software is provided under the terms and conditions of the Illumina Open Source Software License 1. You should have received a copy of the Illumina Open Source Software License 1 along with this program. If not, see <https://github.com/downloads/sequencing/licenses/>. =head1 SYNOPSIS configureStrelkaWorkflow.pl --tumor FILE --normal FILE --ref FILE --config FILE [options] This script configures the strelka workflow for somatic variant calling on matched tumor-normal BAM files. The configuration process will produce an analysis makefile and directory structure. The makefile can be used to run the analysis on a workstation or compute cluster via make/qmake or other makefile compatible process. =head1 ARGUMENTS =over 4 =item --tumor FILE Path to tumor sample BAM file (required) =item --normal FILE Path to normal sample BAM file (required) =item --ref FILE Path to reference genome fasta (required) =item --config FILE Strelka configuration file. Default config files can be found in ${STRELKA_INSTALL_ROOT}/etc/ for both ELAND and BWA alignments. (required) =back =head1 OPTIONS =over 4 =item --output-dir DIRECTORY Root of the analysis directory. This script will place all configuration files in the analysis directory, and after configuration all results and intermediate files will be written within the analysis directory during a run. This directory must not already exist. (default: ./strelkaAnalysis) =back =cut use warnings FATAL => 'all'; use strict; use Carp; $SIG{__DIE__} = \&Carp::confess; use Cwd qw(getcwd); use File::Spec; use Getopt::Long; use Pod::Usage; my $baseDir; my $libDir; BEGIN { my $thisDir=(File::Spec->splitpath($0))[1]; $baseDir=$thisDir;#$baseDir=File::Spec->catdir($thisDir,File::Spec->updir()); $libDir=File::Spec->catdir($baseDir,'lib'); } use lib $libDir; use Utils; if(getAbsPath($baseDir)) { errorX("Can't resolve path for strelka_workflow install directory: '$baseDir'"); } my $libexecDir=File::Spec->catdir($baseDir,'libexec'); #my $optDir=File::Spec->catdir($baseDir,'opt'); my $scriptName=(File::Spec->splitpath($0))[2]; my $argCount=scalar(@ARGV); my $cmdline = join(' ',$0,@ARGV); sub usage() { pod2usage(-verbose => 1, -exitval => 2); } # # user configuration: # my ($tumorBam, $normalBam, $refFile, $configFile, $outDir); my $help; GetOptions( "tumor=s" => \$tumorBam, "normal=s" => \$normalBam, "ref=s" => \$refFile, "config=s" => \$configFile, "output-dir=s" => \$outDir, "help|h" => \$help) || usage(); usage() if($help); usage() unless($argCount); # # Validate input conditions: # sub checkFileArg($$) { my ($file,$label) = @_; errorX("Must specify $label file") unless(defined($file)); #raise an error if file not defined checkFile($file,$label); } checkFileArg($tumorBam,"tumor BAM"); checkFileArg($normalBam,"normal BAM"); checkFileArg($refFile,"reference fasta"); checkFileArg($configFile,"configuration ini"); sub makeAbsoluteFilePaths(\$) { my ($filePathRef) = @_; my ($v,$fileDir,$fileName) = File::Spec->splitpath($$filePathRef); if(getAbsPath($fileDir)) { errorX("Can't resolve directory path for '$fileDir' from input file argument: '$$filePathRef'"); } $$filePathRef = File::Spec->catfile($fileDir,$fileName); } makeAbsoluteFilePaths($tumorBam); makeAbsoluteFilePaths($normalBam); makeAbsoluteFilePaths($refFile); makeAbsoluteFilePaths($configFile); # also check for BAM index files: sub checkBamIndex($) { my ($file) = @_; my $ifile = $file . ".bai"; if(! -f $ifile) { errorX("Can't find index for BAM file '$file'"); } } checkBamIndex($tumorBam); checkBamIndex($normalBam); sub checkFaIndex($) { my ($file) = @_; my $ifile = $file . ".fai"; if(! -f $ifile) { errorX("Can't find index for fasta file '$file'"); } # check that fai file isn't improperly formatted (a la the GATK bundle NCBI 37 fai files) open(my $FH,"< $ifile") || errorX("Can't open fai file '$ifile'"); my $lineno=1; while(<$FH>) { chomp; my @F=split(); if(scalar(@F) != 5) { errorX("Unexpected format for line number '$lineno' of fasta index file: '$ifile'\n\tRe-running fasta indexing may fix the issue. To do so, run: \"samtools faidx $file\""); } $lineno++; } close($FH); } checkFaIndex($refFile); if(defined($outDir)) { if(getAbsPath($outDir)) { errorX("Can't resolve path for ouput directory: '$outDir'"); } } else { $outDir=File::Spec->catdir(Cwd::getcwd(),'strelkaAnalysis'); } if(-e $outDir) { errorX("Output path already exists: '$outDir'"); } if(getAbsPath($baseDir)) { errorX("Can't resolve path for strelka install directory: '$baseDir'"); } #my $samtoolsDir = File::Spec->catdir($optDir,'samtools'); checkDir($libexecDir,"strelka libexec"); #checkDir($samtoolsDir,"samtools"); my $callScriptName = "callSomaticVariants.pl"; my $filterScriptName = "filterSomaticVariants.pl"; my $finishScriptName = "consolidateResults.pl"; my $callScript = File::Spec->catfile($libexecDir,$callScriptName); my $filterScript = File::Spec->catfile($libexecDir,$filterScriptName); my $finishScript = File::Spec->catfile($libexecDir,$finishScriptName); my $countFasta = File::Spec->catfile($libexecDir,"countFastaBases"); #my $samtoolsBin = File::Spec->catfile($samtoolsDir,"samtools"); checkFile($callScript,"somatic variant call script"); checkFile($filterScript,"somatic variant filter script"); checkFile($finishScript,"result consolidation script"); checkFile($countFasta,"fasta scanner"); #checkFile($samtoolsBin,"samtools"); # # Configure bin runs: # checkMakeDir($outDir); # # Configure bin runs: open and validate config ini # my $config = parseConfigIni($configFile); sub checkConfigKeys($) { my ($keyref) = @_; for (@$keyref) { errorX("Undefined configuration option: '$_'") unless(defined($config->{user}{$_})); } } # these are the keys we need at configuration time: my @config_keys = qw(binSize); # these are additional keys we will need to run the workflow: # (note we don't check for (maxInputDepth,minTier2Mapq) for back compatibility with older config files) my @workflow_keys = qw( minTier1Mapq isWriteRealignedBam ssnvPrior sindelPrior ssnvNoise sindelNoise ssnvNoiseStrandBiasFrac ssnvQuality_LowerBound sindelQuality_LowerBound isSkipDepthFilters depthFilterMultiple snvMaxFilteredBasecallFrac snvMaxSpanningDeletionFrac indelMaxRefRepeat indelMaxWindowFilteredBasecallFrac indelMaxIntHpolLength); checkConfigKeys(\@config_keys); checkConfigKeys(\@workflow_keys); my $binSize = int($config->{user}{binSize}); $config->{derived}{configurationCmdline} = $cmdline; $config->{derived}{normalBam} = $normalBam; $config->{derived}{tumorBam} = $tumorBam; $config->{derived}{refFile} = $refFile; $config->{derived}{outDir} = $outDir; # # Configure bin runs: check for consistent chrom info between BAMs and reference # sub getBamChromInfo($) { my $file = shift; my $cmd = "samtools view -H $file |"; open(my $FH,$cmd) || errorX("Can't open process $cmd"); my %info; my $n=0; while(<$FH>) { next unless(/^\@SQ/); chomp; my @F = split(/\t/); scalar(@F) >= 3 || errorX("Unexpected bam header for file '$file'"); my %h = (); foreach (@F) { my @vals = split(':'); $h{$vals[0]} = $vals[1]; } $F[1] = $h{'SN'}; $F[2] = $h{'LN'}; my $size = int($F[2]); ($size > 0) || errorX("Unexpected chromosome size '$size' in bam header for file '$file'"); $info{$F[1]}{size} = $size; $info{$F[1]}{order} = $n; $n++; } close($FH) || errorX("Can't close process $cmd"); return %info; } my %chromInfo = getBamChromInfo($normalBam); my @chroms = sort { $chromInfo{$a}{order} <=> $chromInfo{$b}{order} } (keys(%chromInfo)); { #consistency check: my %tumorChromInfo = getBamChromInfo($tumorBam); for my $chrom (@chroms) { my $ln = $chromInfo{$chrom}{size}; my $tln = $tumorChromInfo{$chrom}{size}; my $order = $chromInfo{$chrom}{order}; my $torder = $tumorChromInfo{$chrom}{order}; unless(defined($tln) && ($tln==$ln) && ($torder==$order)) { errorX("Tumor and normal BAM file headers disagree on chromosome: '$chrom'"); } delete $tumorChromInfo{$chrom}; } for my $chrom (keys(%tumorChromInfo)) { errorX("Tumor and normal BAM file headers disagree on chromosome: '$chrom'"); } } my %refChromInfo; logX("Scanning reference genome"); { my $knownGenomeSize=0; my $cmd="$countFasta $refFile |"; open(my $FFH,$cmd) || errorX("Failed to open process '$cmd'"); while(<$FFH>) { chomp; my @F = split(/\t/); scalar(@F) == 4 || errorX("Unexpected value from '$cmd'"); $knownGenomeSize += int($F[2]); $refChromInfo{$F[1]}{knownSize} = int($F[2]); $refChromInfo{$F[1]}{size} = int($F[3]); } close($FFH) || errorX("Failed to close process '$cmd'"); #consistency check: for my $chrom (@chroms) { my $ln = $chromInfo{$chrom}{size}; my $rln = $refChromInfo{$chrom}{size}; unless(defined($rln) && ($rln==$ln)) { errorX("BAM headers and reference fasta disagree on chromosome: '$chrom'"); } $config->{derived}{"chrom_${chrom}_size"} = $rln; $config->{derived}{"chrom_${chrom}_knownSize"} = $refChromInfo{$chrom}{knownSize}; } $config->{derived}{chromOrder} = join("\t",@chroms); $config->{derived}{knownGenomeSize} = $knownGenomeSize; } logX("Scanning reference genome complete"); # # Configure bin runs: create directory structure # my $resultsDir = File::Spec->catdir($outDir,'results'); checkMakeDir($resultsDir); if($config->{user}{isWriteRealignedBam}) { my $bamDir = File::Spec->catdir($outDir,'realigned'); checkMakeDir($bamDir); } my $chromRootDir = File::Spec->catdir($outDir,'chromosomes'); checkMakeDir($chromRootDir); for my $chrom (@chroms) { my $chromDir = File::Spec->catdir($chromRootDir,$chrom); checkMakeDir($chromDir); my $chromRef = $chromInfo{$chrom}; $chromRef->{dir} = $chromDir; $chromRef->{binList} = getBinList($chromRef->{size},$binSize); my $binRootDir = File::Spec->catdir($chromDir,'bins'); checkMakeDir($binRootDir); for my $binId ( @{$chromRef->{binList}} ) { my $binDir = File::Spec->catdir($binRootDir,$binId); checkMakeDir($binDir); } } # # write run config file: # my $runConfigFile; { my $cstr = <<END; ; ; Strelka workflow configuration file ; ; This is an automatically generated file, you probably don't want to edit it. If starting a new run, ; input configuration templates (with comments) can be found in the Strelka etc/ directory. ; END $cstr .= writeConfigIni($config); my $configDir = File::Spec->catdir($outDir,'config'); checkMakeDir($configDir); $runConfigFile = File::Spec->catdir($configDir,'run.config.ini'); open(my $FH,"> $runConfigFile") || errorX("Can't open file '$runConfigFile'"); print $FH $cstr; close($FH); } # # create makefile # my $makeFile = File::Spec->catfile($outDir,"Makefile"); open(my $MAKEFH, "> $makeFile") || errorX("Can't open file: '$makeFile'"); my $completeFile = "task.complete"; print $MAKEFH <<ENDE; # This makefile was automatically generated by $scriptName # # Please do not edit. script_dir := $libexecDir call_script := \$(script_dir)/$callScriptName filter_script := \$(script_dir)/$filterScriptName finish_script := \$(script_dir)/$finishScriptName config_file := $runConfigFile analysis_dir := $outDir results_dir := \$(analysis_dir)/results ENDE print $MAKEFH <<'ENDE'; complete_tag := task.complete finish_task := $(analysis_dir)/$(complete_tag) get_chrom_dir = $(analysis_dir)/chromosomes/$1 get_chrom_task = $(call get_chrom_dir,$1)/$(complete_tag) get_bin_task = $(call get_chrom_dir,$1)/bins/$2/$(complete_tag) all: $(finish_task) @$(print_success) define print_success echo;\ echo Analysis complete. Final somatic calls can be found in $(results_dir);\ echo endef # top level results target: # $(finish_task): $(finish_script) --config=$(config_file) && touch $@ # chromosome targets: # ENDE for my $chrom (@chroms) { print $MAKEFH <<ENDE; chrom_${chrom}_task := \$(call get_chrom_task,$chrom) \$(finish_task): \$(chrom_${chrom}_task) \$(chrom_${chrom}_task): \$(filter_script) --config=\$(config_file) --chrom=$chrom && touch \$@ ENDE } print $MAKEFH <<ENDE; # chromosome bin targets: # ENDE for my $chrom (@chroms) { for my $bin (@{$chromInfo{$chrom}{binList}}) { print $MAKEFH <<ENDE; chrom_${chrom}_bin_${bin}_task := \$(call get_bin_task,$chrom,$bin) \$(chrom_${chrom}_task): \$(chrom_${chrom}_bin_${bin}_task) \$(chrom_${chrom}_bin_${bin}_task): \$(call_script) --config=\$(config_file) --chrom=$chrom --bin=$bin && touch \$@ ENDE } } # If the eval function is available, this is the way we could finish # the makefile without being so verbose but it doesn't look like qmake # understands this function. =cut print $MAKEFH <<ENDE; chroms := @chroms ENDE for my $chrom (@chroms) { print $MAKEFH "${chrom}_bins := " . join(" ",@{$chromInfo{$chrom}{binList}}) . "\n"; } print $MAKEFH <<'ENDE'; define chrom_task_template chrom_$1_task := $(call get_chrom_task,$1) $(finish_task): $$(chrom_$1_task) $$(chrom_$1_task): $$(filter_script) --config=$$(config_file) --chrom=$1 && touch $$@ endef $(foreach c,$(chroms),$(eval $(call chrom_task_template,$c))) # chromosome bin targets: # define chrom_bin_task_template chrom_$1_bin_$2_task := $(call get_bin_task,$1,$2) $$(chrom_$1_task): $$(chrom_$1_bin_$2_task) $$(chrom_$1_bin_$2_task): $$(call_script) --config=$$(config_file) --chrom=$1 --bin=$2 && touch $$@ endef $(foreach c,$(chroms), \ $(foreach b,$($c_bins),$(eval $(call chrom_bin_task_template,$c,$b))) \ ) ENDE =cut print <<END; Successfully configured analysis and created makefile '$makeFile'. To run the analysis locally using make, run: make -C $outDir ...or: cd $outDir make END 1; __END__