Mercurial > repos > mini > strelka
view strelka2/libexec/filterSomaticVariants.pl @ 0:7a9f20ca4ad5
Uploaded
author | mini |
---|---|
date | Thu, 25 Sep 2014 11:59:08 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env perl =head1 LICENSE Strelka Workflow Software Copyright (c) 2009-2013 Illumina, Inc. This software is provided under the terms and conditions of the Illumina Open Source Software License 1. You should have received a copy of the Illumina Open Source Software License 1 along with this program. If not, see <https://github.com/downloads/sequencing/licenses/>. =head1 SYNOPSIS filterSomaticVariants.pl [options] | --help =head2 SUMMARY Aggregate and filter the variant caller results by chromosome =cut use warnings FATAL => 'all'; use strict; use Carp; $SIG{__DIE__} = \&Carp::confess; use File::Spec; use Getopt::Long; use Pod::Usage; my $baseDir; my $libDir; #my $vcftDir; BEGIN { my $thisDir=(File::Spec->splitpath($0))[1]; $baseDir=File::Spec->catdir($thisDir,File::Spec->updir()); $libDir=File::Spec->catdir($baseDir,'lib'); #my $optDir=File::Spec->catdir($baseDir,'opt'); #$vcftDir=File::Spec->catdir($optDir,'vcftools','lib','perl5','site_perl'); } use lib $libDir; use Utils; #use lib $vcftDir; #use Vcf; my $scriptName=(File::Spec->splitpath($0))[2]; my $argCount=scalar(@ARGV); my $cmdline = join(' ',$0,@ARGV); my ($chrom, $configFile); my $help; GetOptions( "chrom=s" => \$chrom, "config=s" => \$configFile, "help|h" => \$help) or pod2usage(2); pod2usage(2) if($help); pod2usage(2) unless(defined($chrom)); pod2usage(2) unless(defined($configFile)); # # read config and validate values # checkFile($configFile,"configuration ini"); my $config = parseConfigIni($configFile); my $chromSizeKey = "chrom_" . $chrom . "_size"; my $chromKnownSizeKey = "chrom_" . $chrom . "_knownSize"; for (("outDir", $chromSizeKey, $chromKnownSizeKey)) { errorX("Undefined configuration option: '$_'") unless(defined($config->{derived}{$_})); } for (qw(binSize ssnvQuality_LowerBound sindelQuality_LowerBound isSkipDepthFilters depthFilterMultiple snvMaxFilteredBasecallFrac snvMaxSpanningDeletionFrac indelMaxRefRepeat indelMaxWindowFilteredBasecallFrac indelMaxIntHpolLength)) { errorX("Undefined configuration option: '$_'") unless(defined($config->{user}{$_})); } my $outDir = $config->{derived}{outDir}; my $chromDir = File::Spec->catdir($outDir,'chromosomes',$chrom); checkDir($outDir,"output"); checkDir($chromDir,"output chromosome"); my $userconfig = $config->{user}; my $binList = getBinList($config->{derived}{$chromSizeKey},$userconfig->{binSize}); for my $binId (@$binList) { my $dir = File::Spec->catdir($chromDir,'bins',$binId); checkDir($dir,"input bin"); } # # parameters from user config file: # # minimum passed ssnv_nt Q-score: my $minQSSNT=$userconfig->{ssnvQuality_LowerBound}; # minimum passed sindel_nt Q-score: my $minQSINT=$userconfig->{sindelQuality_LowerBound}; # # filtration parameters from user config file: # # skip depth filters for targeted resequencing: my $isUseDepthFilter=(! $userconfig->{isSkipDepthFilters}); # multiple of the normal mean coverage to filter snvs and indels my $depthFilterMultiple=$userconfig->{depthFilterMultiple}; # max filtered basecall fraction for any sample my $snvMaxFilteredBasecallFrac=$userconfig->{snvMaxFilteredBasecallFrac}; # max snv spanning-deletion fraction for any sample my $snvMaxSpanningDeletionFrac=$userconfig->{snvMaxSpanningDeletionFrac}; # max indel reference repeat length my $indelMaxRefRepeat=$userconfig->{indelMaxRefRepeat}; # max indel window filter fraction my $indelMaxWindowFilteredBasecallFrac=$userconfig->{indelMaxWindowFilteredBasecallFrac}; # max indel interupted hompolymer length: my $indelMaxIntHpolLength=$userconfig->{indelMaxIntHpolLength}; # first we want the normal sample mean chromosome depth: # my $filterCoverage; if($isUseDepthFilter) { my $totalCoverage = 0; for my $binId (@$binList) { my $sFile = File::Spec->catfile($chromDir,'bins',$binId,'strelka.stats'); checkFile($sFile,"strelka bin stats"); open(my $sFH, '<', $sFile) || errorX("Can't open file: '$sFile' $!"); my $is_found=0; while(<$sFH>) { next unless(/^NORMAL_NO_REF_N_COVERAGE\s/); my $is_bad = 0; if(not /sample_size:\s*(\d+)\s+/) { $is_bad=1; } my $ss=$1; # leave the regex for mean fairly loose to pick up scientific notation, etc.. if(not /mean:\s*(\d[^\s]*|nan)\s+/) { $is_bad=1; } my $mean=$1; errorX("Unexpected format in file: '$sFile'") if($is_bad); my $coverage = ( $ss==0 ? 0 : int($ss*$mean) ); $totalCoverage += $coverage; $is_found=1; last; } close($sFH); errorX("Unexpected format in file: '$sFile'") unless($is_found); } my $chromKnownSize = $config->{derived}{$chromKnownSizeKey}; my $normalMeanCoverage = ($totalCoverage/$chromKnownSize); $filterCoverage = $normalMeanCoverage*$depthFilterMultiple; } # add filter description to vcf header unless it already exists # return 1 if filter id already exists, client can decide if this is an error # sub add_vcf_filter($$$) { my ($vcf,$id,$desc) = @_; return 1 if(scalar(@{$vcf->get_header_line(key=>'FILTER', ID=>$id)})); $vcf->add_header_line({key=>'FILTER', ID=>$id,Description=>$desc}); return 0; } sub check_vcf_for_sample($$$) { my ($vcf,$sample,$file) = @_; my $is_found=0; for ($vcf->get_samples()) { $is_found=1 if($_ eq $sample); } errorX("Failed to find sample '$sample' in vcf file '$file'") unless($is_found); } my $depthFiltId="DP"; # Runs all post-call vcf filters: # sub filterSnvFileList(\@$$$) { my ($ifiles,$depthFilterVal,$acceptFileName,$isUseDepthFilter) = @_; my $baseFiltId="BCNoise"; my $spanFiltId="SpanDel"; my $qFiltId="QSS_ref"; open(my $aFH,'>',$acceptFileName) or errorX("Failed to open file: '$acceptFileName'"); my $is_first=1; for my $ifile (@$ifiles) { open(my $iFH,'<',"$ifile") or errorX("Failed to open file: '$ifile'"); my $vcf = Vcf->new(fh=>$iFH); # run some simple header validation on each vcf: $vcf->parse_header(); check_vcf_for_sample($vcf,'NORMAL',$ifile); check_vcf_for_sample($vcf,'TUMOR',$ifile); if($is_first) { # TODO: update vcf meta-data for chromosome-level filtered files # $vcf->remove_header_line(key=>"cmdline"); $vcf->add_header_line({key=>"cmdline",value=>$cmdline}); if($isUseDepthFilter) { $vcf->add_header_line({key=>"maxDepth_$chrom",value=>$depthFilterVal}); add_vcf_filter($vcf,$depthFiltId,"Greater than ${depthFilterMultiple}x chromosomal mean depth in Normal sample"); } add_vcf_filter($vcf,$baseFiltId,"Fraction of basecalls filtered at this site in either sample is at or above $snvMaxFilteredBasecallFrac"); add_vcf_filter($vcf,$spanFiltId,"Fraction of reads crossing site with spanning deletions in either sample exceeeds $snvMaxSpanningDeletionFrac"); add_vcf_filter($vcf,$qFiltId,"Normal sample is not homozygous ref or ssnv Q-score < $minQSSNT, ie calls with NT!=ref or QSS_NT < $minQSSNT"); print $aFH $vcf->format_header(); $is_first=0; } while(my $x=$vcf->next_data_hash()) { my $norm=$x->{gtypes}->{NORMAL}; my $tumr=$x->{gtypes}->{TUMOR}; my %filters; # normal depth filter: my $normalDP=$norm->{DP}; if($isUseDepthFilter) { $filters{$depthFiltId} = ($normalDP > $depthFilterVal); } # filtered basecall fraction: my $normal_filt=($normalDP>0 ? $norm->{FDP}/$normalDP : 0); my $tumorDP=$tumr->{DP}; my $tumor_filt=($tumorDP>0 ? $tumr->{FDP}/$tumorDP : 0); $filters{$baseFiltId}=(($normal_filt >= $snvMaxFilteredBasecallFrac) or ($tumor_filt >= $snvMaxFilteredBasecallFrac)); # spanning deletion fraction: my $normalSDP=$norm->{SDP}; my $normalSpanTot=($normalDP+$normalSDP); my $normalSpanDelFrac=($normalSpanTot>0 ? ($normalSDP/$normalSpanTot) : 0); my $tumorSDP=$tumr->{SDP}; my $tumorSpanTot=($tumorDP+$tumorSDP); my $tumorSpanDelFrac=($tumorSpanTot>0 ? ($tumorSDP/$tumorSpanTot) : 0); $filters{$spanFiltId}=(($normalSpanDelFrac > $snvMaxSpanningDeletionFrac) or ($tumorSpanDelFrac > $snvMaxSpanningDeletionFrac)); # Q-val filter: $filters{$qFiltId}=(($x->{INFO}->{NT} ne "ref") or ($x->{INFO}->{QSS_NT} < $minQSSNT)); $x->{FILTER} = $vcf->add_filter($x->{FILTER},%filters); print $aFH $vcf->format_line($x); } $vcf->close(); close($iFH); } close($aFH); } sub updateA2(\@$) { my ($a2,$FH) = @_; my $line=<$FH>; unless(defined($line)) { errorX("Unexpected end of somatic indel window file"); } chomp $line; @$a2 = split("\t",$line); unless(scalar(@$a2)) { errorX("Unexpected format in somatic indel window file"); } } sub filterIndelFileList(\@$$$) { my ($ifiles,$depthFilterVal,$acceptFileName,$isUseDepthFilter) = @_; my $repeatFiltId="Repeat"; my $iHpolFiltId="iHpol"; my $baseFiltId="BCNoise"; my $qFiltId="QSI_ref"; open(my $aFH,'>',$acceptFileName) or errorX("Failed to open file: '$acceptFileName'"); my $is_first=1; for my $ifile (@$ifiles) { open(my $iFH,'<',"$ifile") or errorX("Failed to open somatic indel file: '$ifile'"); my $iwfile = $ifile . ".window"; open(my $iwFH,'<',"$iwfile") or errorX("Failed to open somatic indel window file: '$iwfile'"); my @a2; # hold window file data for one line in case we overstep... my $vcf = Vcf->new(fh=>$iFH); # run some simple header validation on each vcf: $vcf->parse_header(); check_vcf_for_sample($vcf,'NORMAL',$ifile); check_vcf_for_sample($vcf,'TUMOR',$ifile); if($is_first) { # TODO: update all vcf meta-data for chromosome-level filtered files # $vcf->remove_header_line(key=>"cmdline"); $vcf->add_header_line({key=>"cmdline",value=>$cmdline}); if($isUseDepthFilter) { $vcf->add_header_line({key=>"maxDepth_$chrom",value=>$depthFilterVal}); } $vcf->add_header_line({key=>'FORMAT', ID=>'DP50',Number=>1,Type=>'Float',Description=>'Average tier1 read depth within 50 bases'}); $vcf->add_header_line({key=>'FORMAT', ID=>'FDP50',Number=>1,Type=>'Float',Description=>'Average tier1 number of basecalls filtered from original read depth within 50 bases'}); $vcf->add_header_line({key=>'FORMAT', ID=>'SUBDP50',Number=>1,Type=>'Float',Description=>'Average number of reads below tier1 mapping quality threshold aligned across sites within 50 bases'}); if($isUseDepthFilter) { add_vcf_filter($vcf,$depthFiltId,"Greater than ${depthFilterMultiple}x chromosomal mean depth in Normal sample"); } add_vcf_filter($vcf,$repeatFiltId,"Sequence repeat of more than ${indelMaxRefRepeat}x in the reference sequence"); add_vcf_filter($vcf,$iHpolFiltId,"Indel overlaps an interupted homopolymer longer than ${indelMaxIntHpolLength}x in the reference sequence"); add_vcf_filter($vcf,$baseFiltId,"Average fraction of filtered basecalls within 50 bases of the indel exceeds $indelMaxWindowFilteredBasecallFrac"); add_vcf_filter($vcf,$qFiltId,"Normal sample is not homozygous ref or sindel Q-score < $minQSINT, ie calls with NT!=ref or QSI_NT < $minQSINT"); print $aFH $vcf->format_header(); $is_first=0; } while(my $x=$vcf->next_data_hash()) { $vcf->add_format_field($x,'DP50'); $vcf->add_format_field($x,'FDP50'); $vcf->add_format_field($x,'SUBDP50'); my $norm=$x->{gtypes}->{NORMAL}; my $tumr=$x->{gtypes}->{TUMOR}; my $chrom=$x->{CHROM}; my $pos=int($x->{POS}); # get matching line from window file: while((scalar(@a2)<2) or (($a2[0] le $chrom) and (int($a2[1]) < $pos))) { updateA2(@a2,$iwFH); } unless(scalar(@a2) and ($a2[0] eq $chrom) and (int($a2[1]) == $pos)) { errorX("Can't find somatic indel window position.\nIndel line: " . $vcf->format_line($x) ); } # add window data to vcf record: # $norm->{DP50} = $a2[2]+$a2[3]; $norm->{FDP50} = $a2[3]; $norm->{SUBDP50} = $a2[4]; $tumr->{DP50} = $a2[5]+$a2[6]; $tumr->{FDP50} = $a2[6]; $tumr->{SUBDP50} = $a2[7]; my %filters; # normal depth filter: my $normalDP=$norm->{DP}; if($isUseDepthFilter) { $filters{$depthFiltId}=($normalDP > $depthFilterVal); } # ref repeat my $refRep=$x->{INFO}->{RC}; $filters{$repeatFiltId}=(defined($refRep) and ($refRep > $indelMaxRefRepeat)); # ihpol my $iHpol=$x->{INFO}->{IHP}; $filters{$iHpolFiltId}=(defined($iHpol) and ($iHpol > $indelMaxIntHpolLength)); # base filt: my $normWinFrac=( $norm->{DP50} ? $norm->{FDP50}/$norm->{DP50} : 0 ); my $tumrWinFrac=( $tumr->{DP50} ? $tumr->{FDP50}/$tumr->{DP50} : 0 ); $filters{$baseFiltId}=( ($normWinFrac >= $indelMaxWindowFilteredBasecallFrac) or ($tumrWinFrac >= $indelMaxWindowFilteredBasecallFrac) ); # Q-val filter: $filters{$qFiltId}=(($x->{INFO}->{NT} ne "ref") or ($x->{INFO}->{QSI_NT} < $minQSINT)); $x->{FILTER} = $vcf->add_filter($x->{FILTER},%filters); print $aFH $vcf->format_line($x); } $vcf->close(); close($iFH); close($iwFH); } close($aFH); } my @ssnvFiles; my @sindelFiles; for my $binId (@$binList) { my $ssnvFile = File::Spec->catfile($chromDir,'bins',$binId,'somatic.snvs.unfiltered.vcf'); my $sindelFile = File::Spec->catfile($chromDir,'bins',$binId,'somatic.indels.unfiltered.vcf'); checkFile($ssnvFile,"bin snv file"); checkFile($sindelFile,"bin indel file"); push @ssnvFiles,$ssnvFile; push @sindelFiles,$sindelFile; } my $ssnvOutFile = File::Spec->catfile($chromDir,"somatic.snvs.vcf"); filterSnvFileList(@ssnvFiles,$filterCoverage,$ssnvOutFile,$isUseDepthFilter); my $sindelOutFile = File::Spec->catfile($chromDir,"somatic.indels.vcf"); filterIndelFileList(@sindelFiles,$filterCoverage,$sindelOutFile,$isUseDepthFilter); 1;