6
|
1 #!/usr/bin/env perl
|
|
2
|
|
3 =head1 LICENSE
|
|
4
|
|
5 Strelka Workflow Software
|
|
6 Copyright (c) 2009-2013 Illumina, Inc.
|
|
7
|
|
8 This software is provided under the terms and conditions of the
|
|
9 Illumina Open Source Software License 1.
|
|
10
|
|
11 You should have received a copy of the Illumina Open Source
|
|
12 Software License 1 along with this program. If not, see
|
|
13 <https://github.com/downloads/sequencing/licenses/>.
|
|
14
|
|
15 =head1 SYNOPSIS
|
|
16
|
|
17 filterSomaticVariants.pl [options] | --help
|
|
18
|
|
19 =head2 SUMMARY
|
|
20
|
|
21 Aggregate and filter the variant caller results by chromosome
|
|
22
|
|
23 =cut
|
|
24
|
|
25 use warnings FATAL => 'all';
|
|
26 use strict;
|
|
27
|
|
28 use Carp;
|
|
29 $SIG{__DIE__} = \&Carp::confess;
|
|
30
|
|
31 use File::Spec;
|
|
32 use Getopt::Long;
|
|
33 use Pod::Usage;
|
|
34
|
|
35 my $baseDir;
|
|
36 my $libDir;
|
|
37 BEGIN {
|
|
38 my $thisDir=(File::Spec->splitpath($0))[1];
|
|
39 $baseDir=File::Spec->catdir($thisDir,File::Spec->updir());
|
|
40 $libDir=File::Spec->catdir($baseDir,'lib');
|
|
41 }
|
|
42 use lib $libDir;
|
|
43 use Utils;
|
10
|
44 use Vcf;
|
|
45 print "all imported in filter";
|
6
|
46
|
|
47
|
|
48 my $scriptName=(File::Spec->splitpath($0))[2];
|
|
49 my $argCount=scalar(@ARGV);
|
|
50 my $cmdline = join(' ',$0,@ARGV);
|
|
51
|
|
52
|
|
53 my ($chrom, $configFile);
|
|
54 my $help;
|
|
55
|
|
56 GetOptions( "chrom=s" => \$chrom,
|
|
57 "config=s" => \$configFile,
|
|
58 "help|h" => \$help) or pod2usage(2);
|
|
59
|
|
60 pod2usage(2) if($help);
|
|
61 pod2usage(2) unless(defined($chrom));
|
|
62 pod2usage(2) unless(defined($configFile));
|
|
63
|
|
64
|
|
65
|
|
66 #
|
|
67 # read config and validate values
|
|
68 #
|
|
69 checkFile($configFile,"configuration ini");
|
|
70 my $config = parseConfigIni($configFile);
|
|
71
|
|
72 my $chromSizeKey = "chrom_" . $chrom . "_size";
|
|
73 my $chromKnownSizeKey = "chrom_" . $chrom . "_knownSize";
|
|
74
|
|
75 for (("outDir", $chromSizeKey, $chromKnownSizeKey)) {
|
|
76 errorX("Undefined configuration option: '$_'") unless(defined($config->{derived}{$_}));
|
|
77 }
|
|
78 for (qw(binSize ssnvQuality_LowerBound sindelQuality_LowerBound
|
|
79 isSkipDepthFilters depthFilterMultiple
|
|
80 snvMaxFilteredBasecallFrac snvMaxSpanningDeletionFrac
|
|
81 indelMaxRefRepeat indelMaxWindowFilteredBasecallFrac
|
|
82 indelMaxIntHpolLength)) {
|
|
83 errorX("Undefined configuration option: '$_'") unless(defined($config->{user}{$_}));
|
|
84 }
|
|
85
|
|
86 my $outDir = $config->{derived}{outDir};
|
|
87 my $chromDir = File::Spec->catdir($outDir,'chromosomes',$chrom);
|
|
88 checkDir($outDir,"output");
|
|
89 checkDir($chromDir,"output chromosome");
|
|
90
|
|
91 my $userconfig = $config->{user};
|
|
92
|
|
93 my $binList = getBinList($config->{derived}{$chromSizeKey},$userconfig->{binSize});
|
|
94 for my $binId (@$binList) {
|
|
95 my $dir = File::Spec->catdir($chromDir,'bins',$binId);
|
|
96 checkDir($dir,"input bin");
|
|
97 }
|
|
98
|
|
99 #
|
|
100 # parameters from user config file:
|
|
101 #
|
|
102
|
|
103 # minimum passed ssnv_nt Q-score:
|
|
104 my $minQSSNT=$userconfig->{ssnvQuality_LowerBound};
|
|
105 # minimum passed sindel_nt Q-score:
|
|
106 my $minQSINT=$userconfig->{sindelQuality_LowerBound};
|
|
107
|
|
108 #
|
|
109 # filtration parameters from user config file:
|
|
110 #
|
|
111
|
|
112 # skip depth filters for targeted resequencing:
|
|
113 my $isUseDepthFilter=(! $userconfig->{isSkipDepthFilters});
|
|
114 # multiple of the normal mean coverage to filter snvs and indels
|
|
115 my $depthFilterMultiple=$userconfig->{depthFilterMultiple};
|
|
116 # max filtered basecall fraction for any sample
|
|
117 my $snvMaxFilteredBasecallFrac=$userconfig->{snvMaxFilteredBasecallFrac};
|
|
118 # max snv spanning-deletion fraction for any sample
|
|
119 my $snvMaxSpanningDeletionFrac=$userconfig->{snvMaxSpanningDeletionFrac};
|
|
120 # max indel reference repeat length
|
|
121 my $indelMaxRefRepeat=$userconfig->{indelMaxRefRepeat};
|
|
122 # max indel window filter fraction
|
|
123 my $indelMaxWindowFilteredBasecallFrac=$userconfig->{indelMaxWindowFilteredBasecallFrac};
|
|
124 # max indel interupted hompolymer length:
|
|
125 my $indelMaxIntHpolLength=$userconfig->{indelMaxIntHpolLength};
|
|
126
|
|
127
|
|
128 # first we want the normal sample mean chromosome depth:
|
|
129 #
|
|
130 my $filterCoverage;
|
|
131 if($isUseDepthFilter) {
|
|
132 my $totalCoverage = 0;
|
|
133 for my $binId (@$binList) {
|
|
134 my $sFile = File::Spec->catfile($chromDir,'bins',$binId,'strelka.stats');
|
|
135 checkFile($sFile,"strelka bin stats");
|
|
136 open(my $sFH, '<', $sFile)
|
|
137 || errorX("Can't open file: '$sFile' $!");
|
|
138
|
|
139 my $is_found=0;
|
|
140 while(<$sFH>) {
|
|
141 next unless(/^NORMAL_NO_REF_N_COVERAGE\s/);
|
|
142 my $is_bad = 0;
|
|
143 if(not /sample_size:\s*(\d+)\s+/) { $is_bad=1; }
|
|
144 my $ss=$1;
|
|
145 # leave the regex for mean fairly loose to pick up scientific notation, etc..
|
|
146 if(not /mean:\s*(\d[^\s]*|nan)\s+/) { $is_bad=1; }
|
|
147 my $mean=$1;
|
|
148 errorX("Unexpected format in file: '$sFile'") if($is_bad);
|
|
149
|
|
150 my $coverage = ( $ss==0 ? 0 : int($ss*$mean) );
|
|
151
|
|
152 $totalCoverage += $coverage;
|
|
153 $is_found=1;
|
|
154 last;
|
|
155 }
|
|
156 close($sFH);
|
|
157 errorX("Unexpected format in file: '$sFile'") unless($is_found);
|
|
158 }
|
|
159
|
|
160 my $chromKnownSize = $config->{derived}{$chromKnownSizeKey};
|
|
161 my $normalMeanCoverage = ($totalCoverage/$chromKnownSize);
|
|
162 $filterCoverage = $normalMeanCoverage*$depthFilterMultiple;
|
|
163 }
|
|
164
|
|
165
|
|
166 # add filter description to vcf header unless it already exists
|
|
167 # return 1 if filter id already exists, client can decide if this is an error
|
|
168 #
|
|
169 sub add_vcf_filter($$$) {
|
|
170 my ($vcf,$id,$desc) = @_;
|
|
171 return 1 if(scalar(@{$vcf->get_header_line(key=>'FILTER', ID=>$id)}));
|
|
172 $vcf->add_header_line({key=>'FILTER', ID=>$id,Description=>$desc});
|
|
173 return 0;
|
|
174 }
|
|
175
|
|
176
|
|
177 sub check_vcf_for_sample($$$) {
|
|
178 my ($vcf,$sample,$file) = @_;
|
|
179 my $is_found=0;
|
|
180 for ($vcf->get_samples()) {
|
|
181 $is_found=1 if($_ eq $sample);
|
|
182 }
|
|
183 errorX("Failed to find sample '$sample' in vcf file '$file'") unless($is_found);
|
|
184 }
|
|
185
|
|
186
|
|
187
|
|
188 my $depthFiltId="DP";
|
|
189
|
|
190
|
|
191 # Runs all post-call vcf filters:
|
|
192 #
|
|
193 sub filterSnvFileList(\@$$$) {
|
|
194 my ($ifiles,$depthFilterVal,$acceptFileName,$isUseDepthFilter) = @_;
|
|
195
|
|
196 my $baseFiltId="BCNoise";
|
|
197 my $spanFiltId="SpanDel";
|
|
198 my $qFiltId="QSS_ref";
|
|
199
|
|
200 open(my $aFH,'>',$acceptFileName)
|
|
201 or errorX("Failed to open file: '$acceptFileName'");
|
|
202
|
|
203 my $is_first=1;
|
|
204 for my $ifile (@$ifiles) {
|
|
205 open(my $iFH,'<',"$ifile")
|
|
206 or errorX("Failed to open file: '$ifile'");
|
|
207 my $vcf = Vcf->new(fh=>$iFH);
|
|
208
|
|
209 # run some simple header validation on each vcf:
|
|
210 $vcf->parse_header();
|
|
211 check_vcf_for_sample($vcf,'NORMAL',$ifile);
|
|
212 check_vcf_for_sample($vcf,'TUMOR',$ifile);
|
|
213
|
|
214 if($is_first) {
|
|
215 # TODO: update vcf meta-data for chromosome-level filtered files
|
|
216 #
|
|
217 $vcf->remove_header_line(key=>"cmdline");
|
|
218 $vcf->add_header_line({key=>"cmdline",value=>$cmdline});
|
|
219 if($isUseDepthFilter) {
|
|
220 $vcf->add_header_line({key=>"maxDepth_$chrom",value=>$depthFilterVal});
|
|
221 add_vcf_filter($vcf,$depthFiltId,"Greater than ${depthFilterMultiple}x chromosomal mean depth in Normal sample");
|
|
222 }
|
|
223 add_vcf_filter($vcf,$baseFiltId,"Fraction of basecalls filtered at this site in either sample is at or above $snvMaxFilteredBasecallFrac");
|
|
224 add_vcf_filter($vcf,$spanFiltId,"Fraction of reads crossing site with spanning deletions in either sample exceeeds $snvMaxSpanningDeletionFrac");
|
|
225 add_vcf_filter($vcf,$qFiltId,"Normal sample is not homozygous ref or ssnv Q-score < $minQSSNT, ie calls with NT!=ref or QSS_NT < $minQSSNT");
|
|
226 print $aFH $vcf->format_header();
|
|
227 $is_first=0;
|
|
228 }
|
|
229
|
|
230 while(my $x=$vcf->next_data_hash()) {
|
|
231 my $norm=$x->{gtypes}->{NORMAL};
|
|
232 my $tumr=$x->{gtypes}->{TUMOR};
|
|
233
|
|
234 my %filters;
|
|
235
|
|
236 # normal depth filter:
|
|
237 my $normalDP=$norm->{DP};
|
|
238 if($isUseDepthFilter) {
|
|
239 $filters{$depthFiltId} = ($normalDP > $depthFilterVal);
|
|
240 }
|
|
241
|
|
242 # filtered basecall fraction:
|
|
243 my $normal_filt=($normalDP>0 ? $norm->{FDP}/$normalDP : 0);
|
|
244
|
|
245 my $tumorDP=$tumr->{DP};
|
|
246 my $tumor_filt=($tumorDP>0 ? $tumr->{FDP}/$tumorDP : 0);
|
|
247
|
|
248 $filters{$baseFiltId}=(($normal_filt >= $snvMaxFilteredBasecallFrac) or
|
|
249 ($tumor_filt >= $snvMaxFilteredBasecallFrac));
|
|
250
|
|
251 # spanning deletion fraction:
|
|
252 my $normalSDP=$norm->{SDP};
|
|
253 my $normalSpanTot=($normalDP+$normalSDP);
|
|
254 my $normalSpanDelFrac=($normalSpanTot>0 ? ($normalSDP/$normalSpanTot) : 0);
|
|
255
|
|
256 my $tumorSDP=$tumr->{SDP};
|
|
257 my $tumorSpanTot=($tumorDP+$tumorSDP);
|
|
258 my $tumorSpanDelFrac=($tumorSpanTot>0 ? ($tumorSDP/$tumorSpanTot) : 0);
|
|
259
|
|
260 $filters{$spanFiltId}=(($normalSpanDelFrac > $snvMaxSpanningDeletionFrac) or
|
|
261 ($tumorSpanDelFrac > $snvMaxSpanningDeletionFrac));
|
|
262
|
|
263 # Q-val filter:
|
|
264 $filters{$qFiltId}=(($x->{INFO}->{NT} ne "ref") or
|
|
265 ($x->{INFO}->{QSS_NT} < $minQSSNT));
|
|
266
|
|
267 $x->{FILTER} = $vcf->add_filter($x->{FILTER},%filters);
|
|
268
|
|
269 print $aFH $vcf->format_line($x);
|
|
270 }
|
|
271
|
|
272 $vcf->close();
|
|
273 close($iFH);
|
|
274 }
|
|
275
|
|
276 close($aFH);
|
|
277 }
|
|
278
|
|
279
|
|
280
|
|
281 sub updateA2(\@$) {
|
|
282 my ($a2,$FH) = @_;
|
|
283 my $line=<$FH>;
|
|
284 unless(defined($line)) { errorX("Unexpected end of somatic indel window file"); }
|
|
285 chomp $line;
|
|
286 @$a2 = split("\t",$line);
|
|
287 unless(scalar(@$a2)) { errorX("Unexpected format in somatic indel window file"); }
|
|
288 }
|
|
289
|
|
290
|
|
291
|
|
292 sub filterIndelFileList(\@$$$) {
|
|
293 my ($ifiles,$depthFilterVal,$acceptFileName,$isUseDepthFilter) = @_;
|
|
294
|
|
295 my $repeatFiltId="Repeat";
|
|
296 my $iHpolFiltId="iHpol";
|
|
297 my $baseFiltId="BCNoise";
|
|
298 my $qFiltId="QSI_ref";
|
|
299
|
|
300 open(my $aFH,'>',$acceptFileName)
|
|
301 or errorX("Failed to open file: '$acceptFileName'");
|
|
302
|
|
303 my $is_first=1;
|
|
304 for my $ifile (@$ifiles) {
|
|
305 open(my $iFH,'<',"$ifile")
|
|
306 or errorX("Failed to open somatic indel file: '$ifile'");
|
|
307
|
|
308 my $iwfile = $ifile . ".window";
|
|
309 open(my $iwFH,'<',"$iwfile")
|
|
310 or errorX("Failed to open somatic indel window file: '$iwfile'");
|
|
311
|
|
312 my @a2; # hold window file data for one line in case we overstep...
|
|
313
|
|
314 my $vcf = Vcf->new(fh=>$iFH);
|
|
315
|
|
316 # run some simple header validation on each vcf:
|
|
317 $vcf->parse_header();
|
|
318 check_vcf_for_sample($vcf,'NORMAL',$ifile);
|
|
319 check_vcf_for_sample($vcf,'TUMOR',$ifile);
|
|
320
|
|
321 if($is_first) {
|
|
322 # TODO: update all vcf meta-data for chromosome-level filtered files
|
|
323 #
|
|
324 $vcf->remove_header_line(key=>"cmdline");
|
|
325 $vcf->add_header_line({key=>"cmdline",value=>$cmdline});
|
|
326 if($isUseDepthFilter) {
|
|
327 $vcf->add_header_line({key=>"maxDepth_$chrom",value=>$depthFilterVal});
|
|
328 }
|
|
329 $vcf->add_header_line({key=>'FORMAT', ID=>'DP50',Number=>1,Type=>'Float',Description=>'Average tier1 read depth within 50 bases'});
|
|
330 $vcf->add_header_line({key=>'FORMAT', ID=>'FDP50',Number=>1,Type=>'Float',Description=>'Average tier1 number of basecalls filtered from original read depth within 50 bases'});
|
|
331 $vcf->add_header_line({key=>'FORMAT', ID=>'SUBDP50',Number=>1,Type=>'Float',Description=>'Average number of reads below tier1 mapping quality threshold aligned across sites within 50 bases'});
|
|
332 if($isUseDepthFilter) {
|
|
333 add_vcf_filter($vcf,$depthFiltId,"Greater than ${depthFilterMultiple}x chromosomal mean depth in Normal sample");
|
|
334 }
|
|
335 add_vcf_filter($vcf,$repeatFiltId,"Sequence repeat of more than ${indelMaxRefRepeat}x in the reference sequence");
|
|
336 add_vcf_filter($vcf,$iHpolFiltId,"Indel overlaps an interupted homopolymer longer than ${indelMaxIntHpolLength}x in the reference sequence");
|
|
337 add_vcf_filter($vcf,$baseFiltId,"Average fraction of filtered basecalls within 50 bases of the indel exceeds $indelMaxWindowFilteredBasecallFrac");
|
|
338 add_vcf_filter($vcf,$qFiltId,"Normal sample is not homozygous ref or sindel Q-score < $minQSINT, ie calls with NT!=ref or QSI_NT < $minQSINT");
|
|
339 print $aFH $vcf->format_header();
|
|
340 $is_first=0;
|
|
341 }
|
|
342
|
|
343 while(my $x=$vcf->next_data_hash()) {
|
|
344 $vcf->add_format_field($x,'DP50');
|
|
345 $vcf->add_format_field($x,'FDP50');
|
|
346 $vcf->add_format_field($x,'SUBDP50');
|
|
347
|
|
348 my $norm=$x->{gtypes}->{NORMAL};
|
|
349 my $tumr=$x->{gtypes}->{TUMOR};
|
|
350
|
|
351 my $chrom=$x->{CHROM};
|
|
352 my $pos=int($x->{POS});
|
|
353
|
|
354 # get matching line from window file:
|
|
355 while((scalar(@a2)<2) or
|
|
356 (($a2[0] le $chrom) and (int($a2[1]) < $pos))) {
|
|
357 updateA2(@a2,$iwFH);
|
|
358 }
|
|
359 unless(scalar(@a2) and ($a2[0] eq $chrom) and (int($a2[1]) == $pos))
|
|
360 { errorX("Can't find somatic indel window position.\nIndel line: " . $vcf->format_line($x) ); }
|
|
361
|
|
362 # add window data to vcf record:
|
|
363 #
|
|
364 $norm->{DP50} = $a2[2]+$a2[3];
|
|
365 $norm->{FDP50} = $a2[3];
|
|
366 $norm->{SUBDP50} = $a2[4];
|
|
367 $tumr->{DP50} = $a2[5]+$a2[6];
|
|
368 $tumr->{FDP50} = $a2[6];
|
|
369 $tumr->{SUBDP50} = $a2[7];
|
|
370
|
|
371 my %filters;
|
|
372
|
|
373 # normal depth filter:
|
|
374 my $normalDP=$norm->{DP};
|
|
375 if($isUseDepthFilter) {
|
|
376 $filters{$depthFiltId}=($normalDP > $depthFilterVal);
|
|
377 }
|
|
378
|
|
379 # ref repeat
|
|
380 my $refRep=$x->{INFO}->{RC};
|
|
381 $filters{$repeatFiltId}=(defined($refRep) and
|
|
382 ($refRep > $indelMaxRefRepeat));
|
|
383
|
|
384 # ihpol
|
|
385 my $iHpol=$x->{INFO}->{IHP};
|
|
386 $filters{$iHpolFiltId}=(defined($iHpol) and
|
|
387 ($iHpol > $indelMaxIntHpolLength));
|
|
388
|
|
389 # base filt:
|
|
390 my $normWinFrac=( $norm->{DP50} ? $norm->{FDP50}/$norm->{DP50} : 0 );
|
|
391 my $tumrWinFrac=( $tumr->{DP50} ? $tumr->{FDP50}/$tumr->{DP50} : 0 );
|
|
392 $filters{$baseFiltId}=( ($normWinFrac >= $indelMaxWindowFilteredBasecallFrac) or
|
|
393 ($tumrWinFrac >= $indelMaxWindowFilteredBasecallFrac) );
|
|
394
|
|
395 # Q-val filter:
|
|
396 $filters{$qFiltId}=(($x->{INFO}->{NT} ne "ref") or
|
|
397 ($x->{INFO}->{QSI_NT} < $minQSINT));
|
|
398
|
|
399 $x->{FILTER} = $vcf->add_filter($x->{FILTER},%filters);
|
|
400
|
|
401 print $aFH $vcf->format_line($x);
|
|
402 }
|
|
403
|
|
404 $vcf->close();
|
|
405 close($iFH);
|
|
406 close($iwFH);
|
|
407 }
|
|
408
|
|
409 close($aFH);
|
|
410 }
|
|
411
|
|
412
|
|
413
|
|
414 my @ssnvFiles;
|
|
415 my @sindelFiles;
|
|
416 for my $binId (@$binList) {
|
|
417 my $ssnvFile = File::Spec->catfile($chromDir,'bins',$binId,'somatic.snvs.unfiltered.vcf');
|
|
418 my $sindelFile = File::Spec->catfile($chromDir,'bins',$binId,'somatic.indels.unfiltered.vcf');
|
|
419 checkFile($ssnvFile,"bin snv file");
|
|
420 checkFile($sindelFile,"bin indel file");
|
|
421 push @ssnvFiles,$ssnvFile;
|
|
422 push @sindelFiles,$sindelFile;
|
|
423 }
|
|
424
|
|
425 my $ssnvOutFile = File::Spec->catfile($chromDir,"somatic.snvs.vcf");
|
|
426 filterSnvFileList(@ssnvFiles,$filterCoverage,$ssnvOutFile,$isUseDepthFilter);
|
|
427
|
|
428 my $sindelOutFile = File::Spec->catfile($chromDir,"somatic.indels.vcf");
|
|
429 filterIndelFileList(@sindelFiles,$filterCoverage,$sindelOutFile,$isUseDepthFilter);
|
|
430
|
|
431
|
|
432 1;
|