Mercurial > repos > mini > strelka
comparison strelka2/libexec/filterSomaticVariants.pl @ 0:7a9f20ca4ad5
Uploaded
| author | mini |
|---|---|
| date | Thu, 25 Sep 2014 11:59:08 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:7a9f20ca4ad5 |
|---|---|
| 1 #!/usr/bin/env perl | |
| 2 | |
| 3 =head1 LICENSE | |
| 4 | |
| 5 Strelka Workflow Software | |
| 6 Copyright (c) 2009-2013 Illumina, Inc. | |
| 7 | |
| 8 This software is provided under the terms and conditions of the | |
| 9 Illumina Open Source Software License 1. | |
| 10 | |
| 11 You should have received a copy of the Illumina Open Source | |
| 12 Software License 1 along with this program. If not, see | |
| 13 <https://github.com/downloads/sequencing/licenses/>. | |
| 14 | |
| 15 =head1 SYNOPSIS | |
| 16 | |
| 17 filterSomaticVariants.pl [options] | --help | |
| 18 | |
| 19 =head2 SUMMARY | |
| 20 | |
| 21 Aggregate and filter the variant caller results by chromosome | |
| 22 | |
| 23 =cut | |
| 24 | |
| 25 use warnings FATAL => 'all'; | |
| 26 use strict; | |
| 27 | |
| 28 use Carp; | |
| 29 $SIG{__DIE__} = \&Carp::confess; | |
| 30 | |
| 31 use File::Spec; | |
| 32 use Getopt::Long; | |
| 33 use Pod::Usage; | |
| 34 | |
| 35 my $baseDir; | |
| 36 my $libDir; | |
| 37 #my $vcftDir; | |
| 38 BEGIN { | |
| 39 my $thisDir=(File::Spec->splitpath($0))[1]; | |
| 40 $baseDir=File::Spec->catdir($thisDir,File::Spec->updir()); | |
| 41 $libDir=File::Spec->catdir($baseDir,'lib'); | |
| 42 #my $optDir=File::Spec->catdir($baseDir,'opt'); | |
| 43 #$vcftDir=File::Spec->catdir($optDir,'vcftools','lib','perl5','site_perl'); | |
| 44 } | |
| 45 use lib $libDir; | |
| 46 use Utils; | |
| 47 #use lib $vcftDir; | |
| 48 #use Vcf; | |
| 49 | |
| 50 | |
| 51 my $scriptName=(File::Spec->splitpath($0))[2]; | |
| 52 my $argCount=scalar(@ARGV); | |
| 53 my $cmdline = join(' ',$0,@ARGV); | |
| 54 | |
| 55 | |
| 56 my ($chrom, $configFile); | |
| 57 my $help; | |
| 58 | |
| 59 GetOptions( "chrom=s" => \$chrom, | |
| 60 "config=s" => \$configFile, | |
| 61 "help|h" => \$help) or pod2usage(2); | |
| 62 | |
| 63 pod2usage(2) if($help); | |
| 64 pod2usage(2) unless(defined($chrom)); | |
| 65 pod2usage(2) unless(defined($configFile)); | |
| 66 | |
| 67 | |
| 68 | |
| 69 # | |
| 70 # read config and validate values | |
| 71 # | |
| 72 checkFile($configFile,"configuration ini"); | |
| 73 my $config = parseConfigIni($configFile); | |
| 74 | |
| 75 my $chromSizeKey = "chrom_" . $chrom . "_size"; | |
| 76 my $chromKnownSizeKey = "chrom_" . $chrom . "_knownSize"; | |
| 77 | |
| 78 for (("outDir", $chromSizeKey, $chromKnownSizeKey)) { | |
| 79 errorX("Undefined configuration option: '$_'") unless(defined($config->{derived}{$_})); | |
| 80 } | |
| 81 for (qw(binSize ssnvQuality_LowerBound sindelQuality_LowerBound | |
| 82 isSkipDepthFilters depthFilterMultiple | |
| 83 snvMaxFilteredBasecallFrac snvMaxSpanningDeletionFrac | |
| 84 indelMaxRefRepeat indelMaxWindowFilteredBasecallFrac | |
| 85 indelMaxIntHpolLength)) { | |
| 86 errorX("Undefined configuration option: '$_'") unless(defined($config->{user}{$_})); | |
| 87 } | |
| 88 | |
| 89 my $outDir = $config->{derived}{outDir}; | |
| 90 my $chromDir = File::Spec->catdir($outDir,'chromosomes',$chrom); | |
| 91 checkDir($outDir,"output"); | |
| 92 checkDir($chromDir,"output chromosome"); | |
| 93 | |
| 94 my $userconfig = $config->{user}; | |
| 95 | |
| 96 my $binList = getBinList($config->{derived}{$chromSizeKey},$userconfig->{binSize}); | |
| 97 for my $binId (@$binList) { | |
| 98 my $dir = File::Spec->catdir($chromDir,'bins',$binId); | |
| 99 checkDir($dir,"input bin"); | |
| 100 } | |
| 101 | |
| 102 # | |
| 103 # parameters from user config file: | |
| 104 # | |
| 105 | |
| 106 # minimum passed ssnv_nt Q-score: | |
| 107 my $minQSSNT=$userconfig->{ssnvQuality_LowerBound}; | |
| 108 # minimum passed sindel_nt Q-score: | |
| 109 my $minQSINT=$userconfig->{sindelQuality_LowerBound}; | |
| 110 | |
| 111 # | |
| 112 # filtration parameters from user config file: | |
| 113 # | |
| 114 | |
| 115 # skip depth filters for targeted resequencing: | |
| 116 my $isUseDepthFilter=(! $userconfig->{isSkipDepthFilters}); | |
| 117 # multiple of the normal mean coverage to filter snvs and indels | |
| 118 my $depthFilterMultiple=$userconfig->{depthFilterMultiple}; | |
| 119 # max filtered basecall fraction for any sample | |
| 120 my $snvMaxFilteredBasecallFrac=$userconfig->{snvMaxFilteredBasecallFrac}; | |
| 121 # max snv spanning-deletion fraction for any sample | |
| 122 my $snvMaxSpanningDeletionFrac=$userconfig->{snvMaxSpanningDeletionFrac}; | |
| 123 # max indel reference repeat length | |
| 124 my $indelMaxRefRepeat=$userconfig->{indelMaxRefRepeat}; | |
| 125 # max indel window filter fraction | |
| 126 my $indelMaxWindowFilteredBasecallFrac=$userconfig->{indelMaxWindowFilteredBasecallFrac}; | |
| 127 # max indel interupted hompolymer length: | |
| 128 my $indelMaxIntHpolLength=$userconfig->{indelMaxIntHpolLength}; | |
| 129 | |
| 130 | |
| 131 # first we want the normal sample mean chromosome depth: | |
| 132 # | |
| 133 my $filterCoverage; | |
| 134 if($isUseDepthFilter) { | |
| 135 my $totalCoverage = 0; | |
| 136 for my $binId (@$binList) { | |
| 137 my $sFile = File::Spec->catfile($chromDir,'bins',$binId,'strelka.stats'); | |
| 138 checkFile($sFile,"strelka bin stats"); | |
| 139 open(my $sFH, '<', $sFile) | |
| 140 || errorX("Can't open file: '$sFile' $!"); | |
| 141 | |
| 142 my $is_found=0; | |
| 143 while(<$sFH>) { | |
| 144 next unless(/^NORMAL_NO_REF_N_COVERAGE\s/); | |
| 145 my $is_bad = 0; | |
| 146 if(not /sample_size:\s*(\d+)\s+/) { $is_bad=1; } | |
| 147 my $ss=$1; | |
| 148 # leave the regex for mean fairly loose to pick up scientific notation, etc.. | |
| 149 if(not /mean:\s*(\d[^\s]*|nan)\s+/) { $is_bad=1; } | |
| 150 my $mean=$1; | |
| 151 errorX("Unexpected format in file: '$sFile'") if($is_bad); | |
| 152 | |
| 153 my $coverage = ( $ss==0 ? 0 : int($ss*$mean) ); | |
| 154 | |
| 155 $totalCoverage += $coverage; | |
| 156 $is_found=1; | |
| 157 last; | |
| 158 } | |
| 159 close($sFH); | |
| 160 errorX("Unexpected format in file: '$sFile'") unless($is_found); | |
| 161 } | |
| 162 | |
| 163 my $chromKnownSize = $config->{derived}{$chromKnownSizeKey}; | |
| 164 my $normalMeanCoverage = ($totalCoverage/$chromKnownSize); | |
| 165 $filterCoverage = $normalMeanCoverage*$depthFilterMultiple; | |
| 166 } | |
| 167 | |
| 168 | |
| 169 # add filter description to vcf header unless it already exists | |
| 170 # return 1 if filter id already exists, client can decide if this is an error | |
| 171 # | |
| 172 sub add_vcf_filter($$$) { | |
| 173 my ($vcf,$id,$desc) = @_; | |
| 174 return 1 if(scalar(@{$vcf->get_header_line(key=>'FILTER', ID=>$id)})); | |
| 175 $vcf->add_header_line({key=>'FILTER', ID=>$id,Description=>$desc}); | |
| 176 return 0; | |
| 177 } | |
| 178 | |
| 179 | |
| 180 sub check_vcf_for_sample($$$) { | |
| 181 my ($vcf,$sample,$file) = @_; | |
| 182 my $is_found=0; | |
| 183 for ($vcf->get_samples()) { | |
| 184 $is_found=1 if($_ eq $sample); | |
| 185 } | |
| 186 errorX("Failed to find sample '$sample' in vcf file '$file'") unless($is_found); | |
| 187 } | |
| 188 | |
| 189 | |
| 190 | |
| 191 my $depthFiltId="DP"; | |
| 192 | |
| 193 | |
| 194 # Runs all post-call vcf filters: | |
| 195 # | |
| 196 sub filterSnvFileList(\@$$$) { | |
| 197 my ($ifiles,$depthFilterVal,$acceptFileName,$isUseDepthFilter) = @_; | |
| 198 | |
| 199 my $baseFiltId="BCNoise"; | |
| 200 my $spanFiltId="SpanDel"; | |
| 201 my $qFiltId="QSS_ref"; | |
| 202 | |
| 203 open(my $aFH,'>',$acceptFileName) | |
| 204 or errorX("Failed to open file: '$acceptFileName'"); | |
| 205 | |
| 206 my $is_first=1; | |
| 207 for my $ifile (@$ifiles) { | |
| 208 open(my $iFH,'<',"$ifile") | |
| 209 or errorX("Failed to open file: '$ifile'"); | |
| 210 my $vcf = Vcf->new(fh=>$iFH); | |
| 211 | |
| 212 # run some simple header validation on each vcf: | |
| 213 $vcf->parse_header(); | |
| 214 check_vcf_for_sample($vcf,'NORMAL',$ifile); | |
| 215 check_vcf_for_sample($vcf,'TUMOR',$ifile); | |
| 216 | |
| 217 if($is_first) { | |
| 218 # TODO: update vcf meta-data for chromosome-level filtered files | |
| 219 # | |
| 220 $vcf->remove_header_line(key=>"cmdline"); | |
| 221 $vcf->add_header_line({key=>"cmdline",value=>$cmdline}); | |
| 222 if($isUseDepthFilter) { | |
| 223 $vcf->add_header_line({key=>"maxDepth_$chrom",value=>$depthFilterVal}); | |
| 224 add_vcf_filter($vcf,$depthFiltId,"Greater than ${depthFilterMultiple}x chromosomal mean depth in Normal sample"); | |
| 225 } | |
| 226 add_vcf_filter($vcf,$baseFiltId,"Fraction of basecalls filtered at this site in either sample is at or above $snvMaxFilteredBasecallFrac"); | |
| 227 add_vcf_filter($vcf,$spanFiltId,"Fraction of reads crossing site with spanning deletions in either sample exceeeds $snvMaxSpanningDeletionFrac"); | |
| 228 add_vcf_filter($vcf,$qFiltId,"Normal sample is not homozygous ref or ssnv Q-score < $minQSSNT, ie calls with NT!=ref or QSS_NT < $minQSSNT"); | |
| 229 print $aFH $vcf->format_header(); | |
| 230 $is_first=0; | |
| 231 } | |
| 232 | |
| 233 while(my $x=$vcf->next_data_hash()) { | |
| 234 my $norm=$x->{gtypes}->{NORMAL}; | |
| 235 my $tumr=$x->{gtypes}->{TUMOR}; | |
| 236 | |
| 237 my %filters; | |
| 238 | |
| 239 # normal depth filter: | |
| 240 my $normalDP=$norm->{DP}; | |
| 241 if($isUseDepthFilter) { | |
| 242 $filters{$depthFiltId} = ($normalDP > $depthFilterVal); | |
| 243 } | |
| 244 | |
| 245 # filtered basecall fraction: | |
| 246 my $normal_filt=($normalDP>0 ? $norm->{FDP}/$normalDP : 0); | |
| 247 | |
| 248 my $tumorDP=$tumr->{DP}; | |
| 249 my $tumor_filt=($tumorDP>0 ? $tumr->{FDP}/$tumorDP : 0); | |
| 250 | |
| 251 $filters{$baseFiltId}=(($normal_filt >= $snvMaxFilteredBasecallFrac) or | |
| 252 ($tumor_filt >= $snvMaxFilteredBasecallFrac)); | |
| 253 | |
| 254 # spanning deletion fraction: | |
| 255 my $normalSDP=$norm->{SDP}; | |
| 256 my $normalSpanTot=($normalDP+$normalSDP); | |
| 257 my $normalSpanDelFrac=($normalSpanTot>0 ? ($normalSDP/$normalSpanTot) : 0); | |
| 258 | |
| 259 my $tumorSDP=$tumr->{SDP}; | |
| 260 my $tumorSpanTot=($tumorDP+$tumorSDP); | |
| 261 my $tumorSpanDelFrac=($tumorSpanTot>0 ? ($tumorSDP/$tumorSpanTot) : 0); | |
| 262 | |
| 263 $filters{$spanFiltId}=(($normalSpanDelFrac > $snvMaxSpanningDeletionFrac) or | |
| 264 ($tumorSpanDelFrac > $snvMaxSpanningDeletionFrac)); | |
| 265 | |
| 266 # Q-val filter: | |
| 267 $filters{$qFiltId}=(($x->{INFO}->{NT} ne "ref") or | |
| 268 ($x->{INFO}->{QSS_NT} < $minQSSNT)); | |
| 269 | |
| 270 $x->{FILTER} = $vcf->add_filter($x->{FILTER},%filters); | |
| 271 | |
| 272 print $aFH $vcf->format_line($x); | |
| 273 } | |
| 274 | |
| 275 $vcf->close(); | |
| 276 close($iFH); | |
| 277 } | |
| 278 | |
| 279 close($aFH); | |
| 280 } | |
| 281 | |
| 282 | |
| 283 | |
| 284 sub updateA2(\@$) { | |
| 285 my ($a2,$FH) = @_; | |
| 286 my $line=<$FH>; | |
| 287 unless(defined($line)) { errorX("Unexpected end of somatic indel window file"); } | |
| 288 chomp $line; | |
| 289 @$a2 = split("\t",$line); | |
| 290 unless(scalar(@$a2)) { errorX("Unexpected format in somatic indel window file"); } | |
| 291 } | |
| 292 | |
| 293 | |
| 294 | |
| 295 sub filterIndelFileList(\@$$$) { | |
| 296 my ($ifiles,$depthFilterVal,$acceptFileName,$isUseDepthFilter) = @_; | |
| 297 | |
| 298 my $repeatFiltId="Repeat"; | |
| 299 my $iHpolFiltId="iHpol"; | |
| 300 my $baseFiltId="BCNoise"; | |
| 301 my $qFiltId="QSI_ref"; | |
| 302 | |
| 303 open(my $aFH,'>',$acceptFileName) | |
| 304 or errorX("Failed to open file: '$acceptFileName'"); | |
| 305 | |
| 306 my $is_first=1; | |
| 307 for my $ifile (@$ifiles) { | |
| 308 open(my $iFH,'<',"$ifile") | |
| 309 or errorX("Failed to open somatic indel file: '$ifile'"); | |
| 310 | |
| 311 my $iwfile = $ifile . ".window"; | |
| 312 open(my $iwFH,'<',"$iwfile") | |
| 313 or errorX("Failed to open somatic indel window file: '$iwfile'"); | |
| 314 | |
| 315 my @a2; # hold window file data for one line in case we overstep... | |
| 316 | |
| 317 my $vcf = Vcf->new(fh=>$iFH); | |
| 318 | |
| 319 # run some simple header validation on each vcf: | |
| 320 $vcf->parse_header(); | |
| 321 check_vcf_for_sample($vcf,'NORMAL',$ifile); | |
| 322 check_vcf_for_sample($vcf,'TUMOR',$ifile); | |
| 323 | |
| 324 if($is_first) { | |
| 325 # TODO: update all vcf meta-data for chromosome-level filtered files | |
| 326 # | |
| 327 $vcf->remove_header_line(key=>"cmdline"); | |
| 328 $vcf->add_header_line({key=>"cmdline",value=>$cmdline}); | |
| 329 if($isUseDepthFilter) { | |
| 330 $vcf->add_header_line({key=>"maxDepth_$chrom",value=>$depthFilterVal}); | |
| 331 } | |
| 332 $vcf->add_header_line({key=>'FORMAT', ID=>'DP50',Number=>1,Type=>'Float',Description=>'Average tier1 read depth within 50 bases'}); | |
| 333 $vcf->add_header_line({key=>'FORMAT', ID=>'FDP50',Number=>1,Type=>'Float',Description=>'Average tier1 number of basecalls filtered from original read depth within 50 bases'}); | |
| 334 $vcf->add_header_line({key=>'FORMAT', ID=>'SUBDP50',Number=>1,Type=>'Float',Description=>'Average number of reads below tier1 mapping quality threshold aligned across sites within 50 bases'}); | |
| 335 if($isUseDepthFilter) { | |
| 336 add_vcf_filter($vcf,$depthFiltId,"Greater than ${depthFilterMultiple}x chromosomal mean depth in Normal sample"); | |
| 337 } | |
| 338 add_vcf_filter($vcf,$repeatFiltId,"Sequence repeat of more than ${indelMaxRefRepeat}x in the reference sequence"); | |
| 339 add_vcf_filter($vcf,$iHpolFiltId,"Indel overlaps an interupted homopolymer longer than ${indelMaxIntHpolLength}x in the reference sequence"); | |
| 340 add_vcf_filter($vcf,$baseFiltId,"Average fraction of filtered basecalls within 50 bases of the indel exceeds $indelMaxWindowFilteredBasecallFrac"); | |
| 341 add_vcf_filter($vcf,$qFiltId,"Normal sample is not homozygous ref or sindel Q-score < $minQSINT, ie calls with NT!=ref or QSI_NT < $minQSINT"); | |
| 342 print $aFH $vcf->format_header(); | |
| 343 $is_first=0; | |
| 344 } | |
| 345 | |
| 346 while(my $x=$vcf->next_data_hash()) { | |
| 347 $vcf->add_format_field($x,'DP50'); | |
| 348 $vcf->add_format_field($x,'FDP50'); | |
| 349 $vcf->add_format_field($x,'SUBDP50'); | |
| 350 | |
| 351 my $norm=$x->{gtypes}->{NORMAL}; | |
| 352 my $tumr=$x->{gtypes}->{TUMOR}; | |
| 353 | |
| 354 my $chrom=$x->{CHROM}; | |
| 355 my $pos=int($x->{POS}); | |
| 356 | |
| 357 # get matching line from window file: | |
| 358 while((scalar(@a2)<2) or | |
| 359 (($a2[0] le $chrom) and (int($a2[1]) < $pos))) { | |
| 360 updateA2(@a2,$iwFH); | |
| 361 } | |
| 362 unless(scalar(@a2) and ($a2[0] eq $chrom) and (int($a2[1]) == $pos)) | |
| 363 { errorX("Can't find somatic indel window position.\nIndel line: " . $vcf->format_line($x) ); } | |
| 364 | |
| 365 # add window data to vcf record: | |
| 366 # | |
| 367 $norm->{DP50} = $a2[2]+$a2[3]; | |
| 368 $norm->{FDP50} = $a2[3]; | |
| 369 $norm->{SUBDP50} = $a2[4]; | |
| 370 $tumr->{DP50} = $a2[5]+$a2[6]; | |
| 371 $tumr->{FDP50} = $a2[6]; | |
| 372 $tumr->{SUBDP50} = $a2[7]; | |
| 373 | |
| 374 my %filters; | |
| 375 | |
| 376 # normal depth filter: | |
| 377 my $normalDP=$norm->{DP}; | |
| 378 if($isUseDepthFilter) { | |
| 379 $filters{$depthFiltId}=($normalDP > $depthFilterVal); | |
| 380 } | |
| 381 | |
| 382 # ref repeat | |
| 383 my $refRep=$x->{INFO}->{RC}; | |
| 384 $filters{$repeatFiltId}=(defined($refRep) and | |
| 385 ($refRep > $indelMaxRefRepeat)); | |
| 386 | |
| 387 # ihpol | |
| 388 my $iHpol=$x->{INFO}->{IHP}; | |
| 389 $filters{$iHpolFiltId}=(defined($iHpol) and | |
| 390 ($iHpol > $indelMaxIntHpolLength)); | |
| 391 | |
| 392 # base filt: | |
| 393 my $normWinFrac=( $norm->{DP50} ? $norm->{FDP50}/$norm->{DP50} : 0 ); | |
| 394 my $tumrWinFrac=( $tumr->{DP50} ? $tumr->{FDP50}/$tumr->{DP50} : 0 ); | |
| 395 $filters{$baseFiltId}=( ($normWinFrac >= $indelMaxWindowFilteredBasecallFrac) or | |
| 396 ($tumrWinFrac >= $indelMaxWindowFilteredBasecallFrac) ); | |
| 397 | |
| 398 # Q-val filter: | |
| 399 $filters{$qFiltId}=(($x->{INFO}->{NT} ne "ref") or | |
| 400 ($x->{INFO}->{QSI_NT} < $minQSINT)); | |
| 401 | |
| 402 $x->{FILTER} = $vcf->add_filter($x->{FILTER},%filters); | |
| 403 | |
| 404 print $aFH $vcf->format_line($x); | |
| 405 } | |
| 406 | |
| 407 $vcf->close(); | |
| 408 close($iFH); | |
| 409 close($iwFH); | |
| 410 } | |
| 411 | |
| 412 close($aFH); | |
| 413 } | |
| 414 | |
| 415 | |
| 416 | |
| 417 my @ssnvFiles; | |
| 418 my @sindelFiles; | |
| 419 for my $binId (@$binList) { | |
| 420 my $ssnvFile = File::Spec->catfile($chromDir,'bins',$binId,'somatic.snvs.unfiltered.vcf'); | |
| 421 my $sindelFile = File::Spec->catfile($chromDir,'bins',$binId,'somatic.indels.unfiltered.vcf'); | |
| 422 checkFile($ssnvFile,"bin snv file"); | |
| 423 checkFile($sindelFile,"bin indel file"); | |
| 424 push @ssnvFiles,$ssnvFile; | |
| 425 push @sindelFiles,$sindelFile; | |
| 426 } | |
| 427 | |
| 428 my $ssnvOutFile = File::Spec->catfile($chromDir,"somatic.snvs.vcf"); | |
| 429 filterSnvFileList(@ssnvFiles,$filterCoverage,$ssnvOutFile,$isUseDepthFilter); | |
| 430 | |
| 431 my $sindelOutFile = File::Spec->catfile($chromDir,"somatic.indels.vcf"); | |
| 432 filterIndelFileList(@sindelFiles,$filterCoverage,$sindelOutFile,$isUseDepthFilter); | |
| 433 | |
| 434 | |
| 435 1; |
