Mercurial > repos > mini > strelka
diff strelka2/libexec/filterSomaticVariants.pl @ 0:7a9f20ca4ad5
Uploaded
author | mini |
---|---|
date | Thu, 25 Sep 2014 11:59:08 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/strelka2/libexec/filterSomaticVariants.pl Thu Sep 25 11:59:08 2014 -0400 @@ -0,0 +1,435 @@ +#!/usr/bin/env perl + +=head1 LICENSE + +Strelka Workflow Software +Copyright (c) 2009-2013 Illumina, Inc. + +This software is provided under the terms and conditions of the +Illumina Open Source Software License 1. + +You should have received a copy of the Illumina Open Source +Software License 1 along with this program. If not, see +<https://github.com/downloads/sequencing/licenses/>. + +=head1 SYNOPSIS + +filterSomaticVariants.pl [options] | --help + +=head2 SUMMARY + +Aggregate and filter the variant caller results by chromosome + +=cut + +use warnings FATAL => 'all'; +use strict; + +use Carp; +$SIG{__DIE__} = \&Carp::confess; + +use File::Spec; +use Getopt::Long; +use Pod::Usage; + +my $baseDir; +my $libDir; +#my $vcftDir; +BEGIN { + my $thisDir=(File::Spec->splitpath($0))[1]; + $baseDir=File::Spec->catdir($thisDir,File::Spec->updir()); + $libDir=File::Spec->catdir($baseDir,'lib'); + #my $optDir=File::Spec->catdir($baseDir,'opt'); + #$vcftDir=File::Spec->catdir($optDir,'vcftools','lib','perl5','site_perl'); +} +use lib $libDir; +use Utils; +#use lib $vcftDir; +#use Vcf; + + +my $scriptName=(File::Spec->splitpath($0))[2]; +my $argCount=scalar(@ARGV); +my $cmdline = join(' ',$0,@ARGV); + + +my ($chrom, $configFile); +my $help; + +GetOptions( "chrom=s" => \$chrom, + "config=s" => \$configFile, + "help|h" => \$help) or pod2usage(2); + +pod2usage(2) if($help); +pod2usage(2) unless(defined($chrom)); +pod2usage(2) unless(defined($configFile)); + + + +# +# read config and validate values +# +checkFile($configFile,"configuration ini"); +my $config = parseConfigIni($configFile); + +my $chromSizeKey = "chrom_" . $chrom . "_size"; +my $chromKnownSizeKey = "chrom_" . $chrom . "_knownSize"; + +for (("outDir", $chromSizeKey, $chromKnownSizeKey)) { + errorX("Undefined configuration option: '$_'") unless(defined($config->{derived}{$_})); +} +for (qw(binSize ssnvQuality_LowerBound sindelQuality_LowerBound + isSkipDepthFilters depthFilterMultiple + snvMaxFilteredBasecallFrac snvMaxSpanningDeletionFrac + indelMaxRefRepeat indelMaxWindowFilteredBasecallFrac + indelMaxIntHpolLength)) { + errorX("Undefined configuration option: '$_'") unless(defined($config->{user}{$_})); +} + +my $outDir = $config->{derived}{outDir}; +my $chromDir = File::Spec->catdir($outDir,'chromosomes',$chrom); +checkDir($outDir,"output"); +checkDir($chromDir,"output chromosome"); + +my $userconfig = $config->{user}; + +my $binList = getBinList($config->{derived}{$chromSizeKey},$userconfig->{binSize}); +for my $binId (@$binList) { + my $dir = File::Spec->catdir($chromDir,'bins',$binId); + checkDir($dir,"input bin"); +} + +# +# parameters from user config file: +# + +# minimum passed ssnv_nt Q-score: +my $minQSSNT=$userconfig->{ssnvQuality_LowerBound}; +# minimum passed sindel_nt Q-score: +my $minQSINT=$userconfig->{sindelQuality_LowerBound}; + +# +# filtration parameters from user config file: +# + +# skip depth filters for targeted resequencing: +my $isUseDepthFilter=(! $userconfig->{isSkipDepthFilters}); +# multiple of the normal mean coverage to filter snvs and indels +my $depthFilterMultiple=$userconfig->{depthFilterMultiple}; +# max filtered basecall fraction for any sample +my $snvMaxFilteredBasecallFrac=$userconfig->{snvMaxFilteredBasecallFrac}; +# max snv spanning-deletion fraction for any sample +my $snvMaxSpanningDeletionFrac=$userconfig->{snvMaxSpanningDeletionFrac}; +# max indel reference repeat length +my $indelMaxRefRepeat=$userconfig->{indelMaxRefRepeat}; +# max indel window filter fraction +my $indelMaxWindowFilteredBasecallFrac=$userconfig->{indelMaxWindowFilteredBasecallFrac}; +# max indel interupted hompolymer length: +my $indelMaxIntHpolLength=$userconfig->{indelMaxIntHpolLength}; + + +# first we want the normal sample mean chromosome depth: +# +my $filterCoverage; +if($isUseDepthFilter) { + my $totalCoverage = 0; + for my $binId (@$binList) { + my $sFile = File::Spec->catfile($chromDir,'bins',$binId,'strelka.stats'); + checkFile($sFile,"strelka bin stats"); + open(my $sFH, '<', $sFile) + || errorX("Can't open file: '$sFile' $!"); + + my $is_found=0; + while(<$sFH>) { + next unless(/^NORMAL_NO_REF_N_COVERAGE\s/); + my $is_bad = 0; + if(not /sample_size:\s*(\d+)\s+/) { $is_bad=1; } + my $ss=$1; + # leave the regex for mean fairly loose to pick up scientific notation, etc.. + if(not /mean:\s*(\d[^\s]*|nan)\s+/) { $is_bad=1; } + my $mean=$1; + errorX("Unexpected format in file: '$sFile'") if($is_bad); + + my $coverage = ( $ss==0 ? 0 : int($ss*$mean) ); + + $totalCoverage += $coverage; + $is_found=1; + last; + } + close($sFH); + errorX("Unexpected format in file: '$sFile'") unless($is_found); + } + + my $chromKnownSize = $config->{derived}{$chromKnownSizeKey}; + my $normalMeanCoverage = ($totalCoverage/$chromKnownSize); + $filterCoverage = $normalMeanCoverage*$depthFilterMultiple; +} + + +# add filter description to vcf header unless it already exists +# return 1 if filter id already exists, client can decide if this is an error +# +sub add_vcf_filter($$$) { + my ($vcf,$id,$desc) = @_; + return 1 if(scalar(@{$vcf->get_header_line(key=>'FILTER', ID=>$id)})); + $vcf->add_header_line({key=>'FILTER', ID=>$id,Description=>$desc}); + return 0; +} + + +sub check_vcf_for_sample($$$) { + my ($vcf,$sample,$file) = @_; + my $is_found=0; + for ($vcf->get_samples()) { + $is_found=1 if($_ eq $sample); + } + errorX("Failed to find sample '$sample' in vcf file '$file'") unless($is_found); +} + + + +my $depthFiltId="DP"; + + +# Runs all post-call vcf filters: +# +sub filterSnvFileList(\@$$$) { + my ($ifiles,$depthFilterVal,$acceptFileName,$isUseDepthFilter) = @_; + + my $baseFiltId="BCNoise"; + my $spanFiltId="SpanDel"; + my $qFiltId="QSS_ref"; + + open(my $aFH,'>',$acceptFileName) + or errorX("Failed to open file: '$acceptFileName'"); + + my $is_first=1; + for my $ifile (@$ifiles) { + open(my $iFH,'<',"$ifile") + or errorX("Failed to open file: '$ifile'"); + my $vcf = Vcf->new(fh=>$iFH); + + # run some simple header validation on each vcf: + $vcf->parse_header(); + check_vcf_for_sample($vcf,'NORMAL',$ifile); + check_vcf_for_sample($vcf,'TUMOR',$ifile); + + if($is_first) { + # TODO: update vcf meta-data for chromosome-level filtered files + # + $vcf->remove_header_line(key=>"cmdline"); + $vcf->add_header_line({key=>"cmdline",value=>$cmdline}); + if($isUseDepthFilter) { + $vcf->add_header_line({key=>"maxDepth_$chrom",value=>$depthFilterVal}); + add_vcf_filter($vcf,$depthFiltId,"Greater than ${depthFilterMultiple}x chromosomal mean depth in Normal sample"); + } + add_vcf_filter($vcf,$baseFiltId,"Fraction of basecalls filtered at this site in either sample is at or above $snvMaxFilteredBasecallFrac"); + add_vcf_filter($vcf,$spanFiltId,"Fraction of reads crossing site with spanning deletions in either sample exceeeds $snvMaxSpanningDeletionFrac"); + add_vcf_filter($vcf,$qFiltId,"Normal sample is not homozygous ref or ssnv Q-score < $minQSSNT, ie calls with NT!=ref or QSS_NT < $minQSSNT"); + print $aFH $vcf->format_header(); + $is_first=0; + } + + while(my $x=$vcf->next_data_hash()) { + my $norm=$x->{gtypes}->{NORMAL}; + my $tumr=$x->{gtypes}->{TUMOR}; + + my %filters; + + # normal depth filter: + my $normalDP=$norm->{DP}; + if($isUseDepthFilter) { + $filters{$depthFiltId} = ($normalDP > $depthFilterVal); + } + + # filtered basecall fraction: + my $normal_filt=($normalDP>0 ? $norm->{FDP}/$normalDP : 0); + + my $tumorDP=$tumr->{DP}; + my $tumor_filt=($tumorDP>0 ? $tumr->{FDP}/$tumorDP : 0); + + $filters{$baseFiltId}=(($normal_filt >= $snvMaxFilteredBasecallFrac) or + ($tumor_filt >= $snvMaxFilteredBasecallFrac)); + + # spanning deletion fraction: + my $normalSDP=$norm->{SDP}; + my $normalSpanTot=($normalDP+$normalSDP); + my $normalSpanDelFrac=($normalSpanTot>0 ? ($normalSDP/$normalSpanTot) : 0); + + my $tumorSDP=$tumr->{SDP}; + my $tumorSpanTot=($tumorDP+$tumorSDP); + my $tumorSpanDelFrac=($tumorSpanTot>0 ? ($tumorSDP/$tumorSpanTot) : 0); + + $filters{$spanFiltId}=(($normalSpanDelFrac > $snvMaxSpanningDeletionFrac) or + ($tumorSpanDelFrac > $snvMaxSpanningDeletionFrac)); + + # Q-val filter: + $filters{$qFiltId}=(($x->{INFO}->{NT} ne "ref") or + ($x->{INFO}->{QSS_NT} < $minQSSNT)); + + $x->{FILTER} = $vcf->add_filter($x->{FILTER},%filters); + + print $aFH $vcf->format_line($x); + } + + $vcf->close(); + close($iFH); + } + + close($aFH); +} + + + +sub updateA2(\@$) { + my ($a2,$FH) = @_; + my $line=<$FH>; + unless(defined($line)) { errorX("Unexpected end of somatic indel window file"); } + chomp $line; + @$a2 = split("\t",$line); + unless(scalar(@$a2)) { errorX("Unexpected format in somatic indel window file"); } +} + + + +sub filterIndelFileList(\@$$$) { + my ($ifiles,$depthFilterVal,$acceptFileName,$isUseDepthFilter) = @_; + + my $repeatFiltId="Repeat"; + my $iHpolFiltId="iHpol"; + my $baseFiltId="BCNoise"; + my $qFiltId="QSI_ref"; + + open(my $aFH,'>',$acceptFileName) + or errorX("Failed to open file: '$acceptFileName'"); + + my $is_first=1; + for my $ifile (@$ifiles) { + open(my $iFH,'<',"$ifile") + or errorX("Failed to open somatic indel file: '$ifile'"); + + my $iwfile = $ifile . ".window"; + open(my $iwFH,'<',"$iwfile") + or errorX("Failed to open somatic indel window file: '$iwfile'"); + + my @a2; # hold window file data for one line in case we overstep... + + my $vcf = Vcf->new(fh=>$iFH); + + # run some simple header validation on each vcf: + $vcf->parse_header(); + check_vcf_for_sample($vcf,'NORMAL',$ifile); + check_vcf_for_sample($vcf,'TUMOR',$ifile); + + if($is_first) { + # TODO: update all vcf meta-data for chromosome-level filtered files + # + $vcf->remove_header_line(key=>"cmdline"); + $vcf->add_header_line({key=>"cmdline",value=>$cmdline}); + if($isUseDepthFilter) { + $vcf->add_header_line({key=>"maxDepth_$chrom",value=>$depthFilterVal}); + } + $vcf->add_header_line({key=>'FORMAT', ID=>'DP50',Number=>1,Type=>'Float',Description=>'Average tier1 read depth within 50 bases'}); + $vcf->add_header_line({key=>'FORMAT', ID=>'FDP50',Number=>1,Type=>'Float',Description=>'Average tier1 number of basecalls filtered from original read depth within 50 bases'}); + $vcf->add_header_line({key=>'FORMAT', ID=>'SUBDP50',Number=>1,Type=>'Float',Description=>'Average number of reads below tier1 mapping quality threshold aligned across sites within 50 bases'}); + if($isUseDepthFilter) { + add_vcf_filter($vcf,$depthFiltId,"Greater than ${depthFilterMultiple}x chromosomal mean depth in Normal sample"); + } + add_vcf_filter($vcf,$repeatFiltId,"Sequence repeat of more than ${indelMaxRefRepeat}x in the reference sequence"); + add_vcf_filter($vcf,$iHpolFiltId,"Indel overlaps an interupted homopolymer longer than ${indelMaxIntHpolLength}x in the reference sequence"); + add_vcf_filter($vcf,$baseFiltId,"Average fraction of filtered basecalls within 50 bases of the indel exceeds $indelMaxWindowFilteredBasecallFrac"); + add_vcf_filter($vcf,$qFiltId,"Normal sample is not homozygous ref or sindel Q-score < $minQSINT, ie calls with NT!=ref or QSI_NT < $minQSINT"); + print $aFH $vcf->format_header(); + $is_first=0; + } + + while(my $x=$vcf->next_data_hash()) { + $vcf->add_format_field($x,'DP50'); + $vcf->add_format_field($x,'FDP50'); + $vcf->add_format_field($x,'SUBDP50'); + + my $norm=$x->{gtypes}->{NORMAL}; + my $tumr=$x->{gtypes}->{TUMOR}; + + my $chrom=$x->{CHROM}; + my $pos=int($x->{POS}); + + # get matching line from window file: + while((scalar(@a2)<2) or + (($a2[0] le $chrom) and (int($a2[1]) < $pos))) { + updateA2(@a2,$iwFH); + } + unless(scalar(@a2) and ($a2[0] eq $chrom) and (int($a2[1]) == $pos)) + { errorX("Can't find somatic indel window position.\nIndel line: " . $vcf->format_line($x) ); } + + # add window data to vcf record: + # + $norm->{DP50} = $a2[2]+$a2[3]; + $norm->{FDP50} = $a2[3]; + $norm->{SUBDP50} = $a2[4]; + $tumr->{DP50} = $a2[5]+$a2[6]; + $tumr->{FDP50} = $a2[6]; + $tumr->{SUBDP50} = $a2[7]; + + my %filters; + + # normal depth filter: + my $normalDP=$norm->{DP}; + if($isUseDepthFilter) { + $filters{$depthFiltId}=($normalDP > $depthFilterVal); + } + + # ref repeat + my $refRep=$x->{INFO}->{RC}; + $filters{$repeatFiltId}=(defined($refRep) and + ($refRep > $indelMaxRefRepeat)); + + # ihpol + my $iHpol=$x->{INFO}->{IHP}; + $filters{$iHpolFiltId}=(defined($iHpol) and + ($iHpol > $indelMaxIntHpolLength)); + + # base filt: + my $normWinFrac=( $norm->{DP50} ? $norm->{FDP50}/$norm->{DP50} : 0 ); + my $tumrWinFrac=( $tumr->{DP50} ? $tumr->{FDP50}/$tumr->{DP50} : 0 ); + $filters{$baseFiltId}=( ($normWinFrac >= $indelMaxWindowFilteredBasecallFrac) or + ($tumrWinFrac >= $indelMaxWindowFilteredBasecallFrac) ); + + # Q-val filter: + $filters{$qFiltId}=(($x->{INFO}->{NT} ne "ref") or + ($x->{INFO}->{QSI_NT} < $minQSINT)); + + $x->{FILTER} = $vcf->add_filter($x->{FILTER},%filters); + + print $aFH $vcf->format_line($x); + } + + $vcf->close(); + close($iFH); + close($iwFH); + } + + close($aFH); +} + + + +my @ssnvFiles; +my @sindelFiles; +for my $binId (@$binList) { + my $ssnvFile = File::Spec->catfile($chromDir,'bins',$binId,'somatic.snvs.unfiltered.vcf'); + my $sindelFile = File::Spec->catfile($chromDir,'bins',$binId,'somatic.indels.unfiltered.vcf'); + checkFile($ssnvFile,"bin snv file"); + checkFile($sindelFile,"bin indel file"); + push @ssnvFiles,$ssnvFile; + push @sindelFiles,$sindelFile; +} + +my $ssnvOutFile = File::Spec->catfile($chromDir,"somatic.snvs.vcf"); +filterSnvFileList(@ssnvFiles,$filterCoverage,$ssnvOutFile,$isUseDepthFilter); + +my $sindelOutFile = File::Spec->catfile($chromDir,"somatic.indels.vcf"); +filterIndelFileList(@sindelFiles,$filterCoverage,$sindelOutFile,$isUseDepthFilter); + + +1;