Mercurial > repos > mini > strelka
comparison strelka2/libexec/filterSomaticVariants.pl @ 0:7a9f20ca4ad5
Uploaded
author | mini |
---|---|
date | Thu, 25 Sep 2014 11:59:08 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:7a9f20ca4ad5 |
---|---|
1 #!/usr/bin/env perl | |
2 | |
3 =head1 LICENSE | |
4 | |
5 Strelka Workflow Software | |
6 Copyright (c) 2009-2013 Illumina, Inc. | |
7 | |
8 This software is provided under the terms and conditions of the | |
9 Illumina Open Source Software License 1. | |
10 | |
11 You should have received a copy of the Illumina Open Source | |
12 Software License 1 along with this program. If not, see | |
13 <https://github.com/downloads/sequencing/licenses/>. | |
14 | |
15 =head1 SYNOPSIS | |
16 | |
17 filterSomaticVariants.pl [options] | --help | |
18 | |
19 =head2 SUMMARY | |
20 | |
21 Aggregate and filter the variant caller results by chromosome | |
22 | |
23 =cut | |
24 | |
25 use warnings FATAL => 'all'; | |
26 use strict; | |
27 | |
28 use Carp; | |
29 $SIG{__DIE__} = \&Carp::confess; | |
30 | |
31 use File::Spec; | |
32 use Getopt::Long; | |
33 use Pod::Usage; | |
34 | |
35 my $baseDir; | |
36 my $libDir; | |
37 #my $vcftDir; | |
38 BEGIN { | |
39 my $thisDir=(File::Spec->splitpath($0))[1]; | |
40 $baseDir=File::Spec->catdir($thisDir,File::Spec->updir()); | |
41 $libDir=File::Spec->catdir($baseDir,'lib'); | |
42 #my $optDir=File::Spec->catdir($baseDir,'opt'); | |
43 #$vcftDir=File::Spec->catdir($optDir,'vcftools','lib','perl5','site_perl'); | |
44 } | |
45 use lib $libDir; | |
46 use Utils; | |
47 #use lib $vcftDir; | |
48 #use Vcf; | |
49 | |
50 | |
51 my $scriptName=(File::Spec->splitpath($0))[2]; | |
52 my $argCount=scalar(@ARGV); | |
53 my $cmdline = join(' ',$0,@ARGV); | |
54 | |
55 | |
56 my ($chrom, $configFile); | |
57 my $help; | |
58 | |
59 GetOptions( "chrom=s" => \$chrom, | |
60 "config=s" => \$configFile, | |
61 "help|h" => \$help) or pod2usage(2); | |
62 | |
63 pod2usage(2) if($help); | |
64 pod2usage(2) unless(defined($chrom)); | |
65 pod2usage(2) unless(defined($configFile)); | |
66 | |
67 | |
68 | |
69 # | |
70 # read config and validate values | |
71 # | |
72 checkFile($configFile,"configuration ini"); | |
73 my $config = parseConfigIni($configFile); | |
74 | |
75 my $chromSizeKey = "chrom_" . $chrom . "_size"; | |
76 my $chromKnownSizeKey = "chrom_" . $chrom . "_knownSize"; | |
77 | |
78 for (("outDir", $chromSizeKey, $chromKnownSizeKey)) { | |
79 errorX("Undefined configuration option: '$_'") unless(defined($config->{derived}{$_})); | |
80 } | |
81 for (qw(binSize ssnvQuality_LowerBound sindelQuality_LowerBound | |
82 isSkipDepthFilters depthFilterMultiple | |
83 snvMaxFilteredBasecallFrac snvMaxSpanningDeletionFrac | |
84 indelMaxRefRepeat indelMaxWindowFilteredBasecallFrac | |
85 indelMaxIntHpolLength)) { | |
86 errorX("Undefined configuration option: '$_'") unless(defined($config->{user}{$_})); | |
87 } | |
88 | |
89 my $outDir = $config->{derived}{outDir}; | |
90 my $chromDir = File::Spec->catdir($outDir,'chromosomes',$chrom); | |
91 checkDir($outDir,"output"); | |
92 checkDir($chromDir,"output chromosome"); | |
93 | |
94 my $userconfig = $config->{user}; | |
95 | |
96 my $binList = getBinList($config->{derived}{$chromSizeKey},$userconfig->{binSize}); | |
97 for my $binId (@$binList) { | |
98 my $dir = File::Spec->catdir($chromDir,'bins',$binId); | |
99 checkDir($dir,"input bin"); | |
100 } | |
101 | |
102 # | |
103 # parameters from user config file: | |
104 # | |
105 | |
106 # minimum passed ssnv_nt Q-score: | |
107 my $minQSSNT=$userconfig->{ssnvQuality_LowerBound}; | |
108 # minimum passed sindel_nt Q-score: | |
109 my $minQSINT=$userconfig->{sindelQuality_LowerBound}; | |
110 | |
111 # | |
112 # filtration parameters from user config file: | |
113 # | |
114 | |
115 # skip depth filters for targeted resequencing: | |
116 my $isUseDepthFilter=(! $userconfig->{isSkipDepthFilters}); | |
117 # multiple of the normal mean coverage to filter snvs and indels | |
118 my $depthFilterMultiple=$userconfig->{depthFilterMultiple}; | |
119 # max filtered basecall fraction for any sample | |
120 my $snvMaxFilteredBasecallFrac=$userconfig->{snvMaxFilteredBasecallFrac}; | |
121 # max snv spanning-deletion fraction for any sample | |
122 my $snvMaxSpanningDeletionFrac=$userconfig->{snvMaxSpanningDeletionFrac}; | |
123 # max indel reference repeat length | |
124 my $indelMaxRefRepeat=$userconfig->{indelMaxRefRepeat}; | |
125 # max indel window filter fraction | |
126 my $indelMaxWindowFilteredBasecallFrac=$userconfig->{indelMaxWindowFilteredBasecallFrac}; | |
127 # max indel interupted hompolymer length: | |
128 my $indelMaxIntHpolLength=$userconfig->{indelMaxIntHpolLength}; | |
129 | |
130 | |
131 # first we want the normal sample mean chromosome depth: | |
132 # | |
133 my $filterCoverage; | |
134 if($isUseDepthFilter) { | |
135 my $totalCoverage = 0; | |
136 for my $binId (@$binList) { | |
137 my $sFile = File::Spec->catfile($chromDir,'bins',$binId,'strelka.stats'); | |
138 checkFile($sFile,"strelka bin stats"); | |
139 open(my $sFH, '<', $sFile) | |
140 || errorX("Can't open file: '$sFile' $!"); | |
141 | |
142 my $is_found=0; | |
143 while(<$sFH>) { | |
144 next unless(/^NORMAL_NO_REF_N_COVERAGE\s/); | |
145 my $is_bad = 0; | |
146 if(not /sample_size:\s*(\d+)\s+/) { $is_bad=1; } | |
147 my $ss=$1; | |
148 # leave the regex for mean fairly loose to pick up scientific notation, etc.. | |
149 if(not /mean:\s*(\d[^\s]*|nan)\s+/) { $is_bad=1; } | |
150 my $mean=$1; | |
151 errorX("Unexpected format in file: '$sFile'") if($is_bad); | |
152 | |
153 my $coverage = ( $ss==0 ? 0 : int($ss*$mean) ); | |
154 | |
155 $totalCoverage += $coverage; | |
156 $is_found=1; | |
157 last; | |
158 } | |
159 close($sFH); | |
160 errorX("Unexpected format in file: '$sFile'") unless($is_found); | |
161 } | |
162 | |
163 my $chromKnownSize = $config->{derived}{$chromKnownSizeKey}; | |
164 my $normalMeanCoverage = ($totalCoverage/$chromKnownSize); | |
165 $filterCoverage = $normalMeanCoverage*$depthFilterMultiple; | |
166 } | |
167 | |
168 | |
169 # add filter description to vcf header unless it already exists | |
170 # return 1 if filter id already exists, client can decide if this is an error | |
171 # | |
172 sub add_vcf_filter($$$) { | |
173 my ($vcf,$id,$desc) = @_; | |
174 return 1 if(scalar(@{$vcf->get_header_line(key=>'FILTER', ID=>$id)})); | |
175 $vcf->add_header_line({key=>'FILTER', ID=>$id,Description=>$desc}); | |
176 return 0; | |
177 } | |
178 | |
179 | |
180 sub check_vcf_for_sample($$$) { | |
181 my ($vcf,$sample,$file) = @_; | |
182 my $is_found=0; | |
183 for ($vcf->get_samples()) { | |
184 $is_found=1 if($_ eq $sample); | |
185 } | |
186 errorX("Failed to find sample '$sample' in vcf file '$file'") unless($is_found); | |
187 } | |
188 | |
189 | |
190 | |
191 my $depthFiltId="DP"; | |
192 | |
193 | |
194 # Runs all post-call vcf filters: | |
195 # | |
196 sub filterSnvFileList(\@$$$) { | |
197 my ($ifiles,$depthFilterVal,$acceptFileName,$isUseDepthFilter) = @_; | |
198 | |
199 my $baseFiltId="BCNoise"; | |
200 my $spanFiltId="SpanDel"; | |
201 my $qFiltId="QSS_ref"; | |
202 | |
203 open(my $aFH,'>',$acceptFileName) | |
204 or errorX("Failed to open file: '$acceptFileName'"); | |
205 | |
206 my $is_first=1; | |
207 for my $ifile (@$ifiles) { | |
208 open(my $iFH,'<',"$ifile") | |
209 or errorX("Failed to open file: '$ifile'"); | |
210 my $vcf = Vcf->new(fh=>$iFH); | |
211 | |
212 # run some simple header validation on each vcf: | |
213 $vcf->parse_header(); | |
214 check_vcf_for_sample($vcf,'NORMAL',$ifile); | |
215 check_vcf_for_sample($vcf,'TUMOR',$ifile); | |
216 | |
217 if($is_first) { | |
218 # TODO: update vcf meta-data for chromosome-level filtered files | |
219 # | |
220 $vcf->remove_header_line(key=>"cmdline"); | |
221 $vcf->add_header_line({key=>"cmdline",value=>$cmdline}); | |
222 if($isUseDepthFilter) { | |
223 $vcf->add_header_line({key=>"maxDepth_$chrom",value=>$depthFilterVal}); | |
224 add_vcf_filter($vcf,$depthFiltId,"Greater than ${depthFilterMultiple}x chromosomal mean depth in Normal sample"); | |
225 } | |
226 add_vcf_filter($vcf,$baseFiltId,"Fraction of basecalls filtered at this site in either sample is at or above $snvMaxFilteredBasecallFrac"); | |
227 add_vcf_filter($vcf,$spanFiltId,"Fraction of reads crossing site with spanning deletions in either sample exceeeds $snvMaxSpanningDeletionFrac"); | |
228 add_vcf_filter($vcf,$qFiltId,"Normal sample is not homozygous ref or ssnv Q-score < $minQSSNT, ie calls with NT!=ref or QSS_NT < $minQSSNT"); | |
229 print $aFH $vcf->format_header(); | |
230 $is_first=0; | |
231 } | |
232 | |
233 while(my $x=$vcf->next_data_hash()) { | |
234 my $norm=$x->{gtypes}->{NORMAL}; | |
235 my $tumr=$x->{gtypes}->{TUMOR}; | |
236 | |
237 my %filters; | |
238 | |
239 # normal depth filter: | |
240 my $normalDP=$norm->{DP}; | |
241 if($isUseDepthFilter) { | |
242 $filters{$depthFiltId} = ($normalDP > $depthFilterVal); | |
243 } | |
244 | |
245 # filtered basecall fraction: | |
246 my $normal_filt=($normalDP>0 ? $norm->{FDP}/$normalDP : 0); | |
247 | |
248 my $tumorDP=$tumr->{DP}; | |
249 my $tumor_filt=($tumorDP>0 ? $tumr->{FDP}/$tumorDP : 0); | |
250 | |
251 $filters{$baseFiltId}=(($normal_filt >= $snvMaxFilteredBasecallFrac) or | |
252 ($tumor_filt >= $snvMaxFilteredBasecallFrac)); | |
253 | |
254 # spanning deletion fraction: | |
255 my $normalSDP=$norm->{SDP}; | |
256 my $normalSpanTot=($normalDP+$normalSDP); | |
257 my $normalSpanDelFrac=($normalSpanTot>0 ? ($normalSDP/$normalSpanTot) : 0); | |
258 | |
259 my $tumorSDP=$tumr->{SDP}; | |
260 my $tumorSpanTot=($tumorDP+$tumorSDP); | |
261 my $tumorSpanDelFrac=($tumorSpanTot>0 ? ($tumorSDP/$tumorSpanTot) : 0); | |
262 | |
263 $filters{$spanFiltId}=(($normalSpanDelFrac > $snvMaxSpanningDeletionFrac) or | |
264 ($tumorSpanDelFrac > $snvMaxSpanningDeletionFrac)); | |
265 | |
266 # Q-val filter: | |
267 $filters{$qFiltId}=(($x->{INFO}->{NT} ne "ref") or | |
268 ($x->{INFO}->{QSS_NT} < $minQSSNT)); | |
269 | |
270 $x->{FILTER} = $vcf->add_filter($x->{FILTER},%filters); | |
271 | |
272 print $aFH $vcf->format_line($x); | |
273 } | |
274 | |
275 $vcf->close(); | |
276 close($iFH); | |
277 } | |
278 | |
279 close($aFH); | |
280 } | |
281 | |
282 | |
283 | |
284 sub updateA2(\@$) { | |
285 my ($a2,$FH) = @_; | |
286 my $line=<$FH>; | |
287 unless(defined($line)) { errorX("Unexpected end of somatic indel window file"); } | |
288 chomp $line; | |
289 @$a2 = split("\t",$line); | |
290 unless(scalar(@$a2)) { errorX("Unexpected format in somatic indel window file"); } | |
291 } | |
292 | |
293 | |
294 | |
295 sub filterIndelFileList(\@$$$) { | |
296 my ($ifiles,$depthFilterVal,$acceptFileName,$isUseDepthFilter) = @_; | |
297 | |
298 my $repeatFiltId="Repeat"; | |
299 my $iHpolFiltId="iHpol"; | |
300 my $baseFiltId="BCNoise"; | |
301 my $qFiltId="QSI_ref"; | |
302 | |
303 open(my $aFH,'>',$acceptFileName) | |
304 or errorX("Failed to open file: '$acceptFileName'"); | |
305 | |
306 my $is_first=1; | |
307 for my $ifile (@$ifiles) { | |
308 open(my $iFH,'<',"$ifile") | |
309 or errorX("Failed to open somatic indel file: '$ifile'"); | |
310 | |
311 my $iwfile = $ifile . ".window"; | |
312 open(my $iwFH,'<',"$iwfile") | |
313 or errorX("Failed to open somatic indel window file: '$iwfile'"); | |
314 | |
315 my @a2; # hold window file data for one line in case we overstep... | |
316 | |
317 my $vcf = Vcf->new(fh=>$iFH); | |
318 | |
319 # run some simple header validation on each vcf: | |
320 $vcf->parse_header(); | |
321 check_vcf_for_sample($vcf,'NORMAL',$ifile); | |
322 check_vcf_for_sample($vcf,'TUMOR',$ifile); | |
323 | |
324 if($is_first) { | |
325 # TODO: update all vcf meta-data for chromosome-level filtered files | |
326 # | |
327 $vcf->remove_header_line(key=>"cmdline"); | |
328 $vcf->add_header_line({key=>"cmdline",value=>$cmdline}); | |
329 if($isUseDepthFilter) { | |
330 $vcf->add_header_line({key=>"maxDepth_$chrom",value=>$depthFilterVal}); | |
331 } | |
332 $vcf->add_header_line({key=>'FORMAT', ID=>'DP50',Number=>1,Type=>'Float',Description=>'Average tier1 read depth within 50 bases'}); | |
333 $vcf->add_header_line({key=>'FORMAT', ID=>'FDP50',Number=>1,Type=>'Float',Description=>'Average tier1 number of basecalls filtered from original read depth within 50 bases'}); | |
334 $vcf->add_header_line({key=>'FORMAT', ID=>'SUBDP50',Number=>1,Type=>'Float',Description=>'Average number of reads below tier1 mapping quality threshold aligned across sites within 50 bases'}); | |
335 if($isUseDepthFilter) { | |
336 add_vcf_filter($vcf,$depthFiltId,"Greater than ${depthFilterMultiple}x chromosomal mean depth in Normal sample"); | |
337 } | |
338 add_vcf_filter($vcf,$repeatFiltId,"Sequence repeat of more than ${indelMaxRefRepeat}x in the reference sequence"); | |
339 add_vcf_filter($vcf,$iHpolFiltId,"Indel overlaps an interupted homopolymer longer than ${indelMaxIntHpolLength}x in the reference sequence"); | |
340 add_vcf_filter($vcf,$baseFiltId,"Average fraction of filtered basecalls within 50 bases of the indel exceeds $indelMaxWindowFilteredBasecallFrac"); | |
341 add_vcf_filter($vcf,$qFiltId,"Normal sample is not homozygous ref or sindel Q-score < $minQSINT, ie calls with NT!=ref or QSI_NT < $minQSINT"); | |
342 print $aFH $vcf->format_header(); | |
343 $is_first=0; | |
344 } | |
345 | |
346 while(my $x=$vcf->next_data_hash()) { | |
347 $vcf->add_format_field($x,'DP50'); | |
348 $vcf->add_format_field($x,'FDP50'); | |
349 $vcf->add_format_field($x,'SUBDP50'); | |
350 | |
351 my $norm=$x->{gtypes}->{NORMAL}; | |
352 my $tumr=$x->{gtypes}->{TUMOR}; | |
353 | |
354 my $chrom=$x->{CHROM}; | |
355 my $pos=int($x->{POS}); | |
356 | |
357 # get matching line from window file: | |
358 while((scalar(@a2)<2) or | |
359 (($a2[0] le $chrom) and (int($a2[1]) < $pos))) { | |
360 updateA2(@a2,$iwFH); | |
361 } | |
362 unless(scalar(@a2) and ($a2[0] eq $chrom) and (int($a2[1]) == $pos)) | |
363 { errorX("Can't find somatic indel window position.\nIndel line: " . $vcf->format_line($x) ); } | |
364 | |
365 # add window data to vcf record: | |
366 # | |
367 $norm->{DP50} = $a2[2]+$a2[3]; | |
368 $norm->{FDP50} = $a2[3]; | |
369 $norm->{SUBDP50} = $a2[4]; | |
370 $tumr->{DP50} = $a2[5]+$a2[6]; | |
371 $tumr->{FDP50} = $a2[6]; | |
372 $tumr->{SUBDP50} = $a2[7]; | |
373 | |
374 my %filters; | |
375 | |
376 # normal depth filter: | |
377 my $normalDP=$norm->{DP}; | |
378 if($isUseDepthFilter) { | |
379 $filters{$depthFiltId}=($normalDP > $depthFilterVal); | |
380 } | |
381 | |
382 # ref repeat | |
383 my $refRep=$x->{INFO}->{RC}; | |
384 $filters{$repeatFiltId}=(defined($refRep) and | |
385 ($refRep > $indelMaxRefRepeat)); | |
386 | |
387 # ihpol | |
388 my $iHpol=$x->{INFO}->{IHP}; | |
389 $filters{$iHpolFiltId}=(defined($iHpol) and | |
390 ($iHpol > $indelMaxIntHpolLength)); | |
391 | |
392 # base filt: | |
393 my $normWinFrac=( $norm->{DP50} ? $norm->{FDP50}/$norm->{DP50} : 0 ); | |
394 my $tumrWinFrac=( $tumr->{DP50} ? $tumr->{FDP50}/$tumr->{DP50} : 0 ); | |
395 $filters{$baseFiltId}=( ($normWinFrac >= $indelMaxWindowFilteredBasecallFrac) or | |
396 ($tumrWinFrac >= $indelMaxWindowFilteredBasecallFrac) ); | |
397 | |
398 # Q-val filter: | |
399 $filters{$qFiltId}=(($x->{INFO}->{NT} ne "ref") or | |
400 ($x->{INFO}->{QSI_NT} < $minQSINT)); | |
401 | |
402 $x->{FILTER} = $vcf->add_filter($x->{FILTER},%filters); | |
403 | |
404 print $aFH $vcf->format_line($x); | |
405 } | |
406 | |
407 $vcf->close(); | |
408 close($iFH); | |
409 close($iwFH); | |
410 } | |
411 | |
412 close($aFH); | |
413 } | |
414 | |
415 | |
416 | |
417 my @ssnvFiles; | |
418 my @sindelFiles; | |
419 for my $binId (@$binList) { | |
420 my $ssnvFile = File::Spec->catfile($chromDir,'bins',$binId,'somatic.snvs.unfiltered.vcf'); | |
421 my $sindelFile = File::Spec->catfile($chromDir,'bins',$binId,'somatic.indels.unfiltered.vcf'); | |
422 checkFile($ssnvFile,"bin snv file"); | |
423 checkFile($sindelFile,"bin indel file"); | |
424 push @ssnvFiles,$ssnvFile; | |
425 push @sindelFiles,$sindelFile; | |
426 } | |
427 | |
428 my $ssnvOutFile = File::Spec->catfile($chromDir,"somatic.snvs.vcf"); | |
429 filterSnvFileList(@ssnvFiles,$filterCoverage,$ssnvOutFile,$isUseDepthFilter); | |
430 | |
431 my $sindelOutFile = File::Spec->catfile($chromDir,"somatic.indels.vcf"); | |
432 filterIndelFileList(@sindelFiles,$filterCoverage,$sindelOutFile,$isUseDepthFilter); | |
433 | |
434 | |
435 1; |