comparison libexec/filterSomaticVariants.pl @ 6:87568e5a7d4f

Testing strelka version 0.0.1
author mini
date Fri, 26 Sep 2014 13:24:13 +0200
parents
children 0e8e6011082b
comparison
equal deleted inserted replaced
5:07cbbd662111 6:87568e5a7d4f
1 #!/usr/bin/env perl
2
3 =head1 LICENSE
4
5 Strelka Workflow Software
6 Copyright (c) 2009-2013 Illumina, Inc.
7
8 This software is provided under the terms and conditions of the
9 Illumina Open Source Software License 1.
10
11 You should have received a copy of the Illumina Open Source
12 Software License 1 along with this program. If not, see
13 <https://github.com/downloads/sequencing/licenses/>.
14
15 =head1 SYNOPSIS
16
17 filterSomaticVariants.pl [options] | --help
18
19 =head2 SUMMARY
20
21 Aggregate and filter the variant caller results by chromosome
22
23 =cut
24
25 use warnings FATAL => 'all';
26 use strict;
27
28 use Carp;
29 $SIG{__DIE__} = \&Carp::confess;
30
31 use File::Spec;
32 use Getopt::Long;
33 use Pod::Usage;
34
35 my $baseDir;
36 my $libDir;
37 #my $vcftDir;
38 BEGIN {
39 my $thisDir=(File::Spec->splitpath($0))[1];
40 $baseDir=File::Spec->catdir($thisDir,File::Spec->updir());
41 $libDir=File::Spec->catdir($baseDir,'lib');
42 #my $optDir=File::Spec->catdir($baseDir,'opt');
43 #$vcftDir=File::Spec->catdir($optDir,'vcftools','lib','perl5','site_perl');
44 }
45 use lib $libDir;
46 use Utils;
47 #use lib $vcftDir;
48 #use Vcf;
49
50
51 my $scriptName=(File::Spec->splitpath($0))[2];
52 my $argCount=scalar(@ARGV);
53 my $cmdline = join(' ',$0,@ARGV);
54
55
56 my ($chrom, $configFile);
57 my $help;
58
59 GetOptions( "chrom=s" => \$chrom,
60 "config=s" => \$configFile,
61 "help|h" => \$help) or pod2usage(2);
62
63 pod2usage(2) if($help);
64 pod2usage(2) unless(defined($chrom));
65 pod2usage(2) unless(defined($configFile));
66
67
68
69 #
70 # read config and validate values
71 #
72 checkFile($configFile,"configuration ini");
73 my $config = parseConfigIni($configFile);
74
75 my $chromSizeKey = "chrom_" . $chrom . "_size";
76 my $chromKnownSizeKey = "chrom_" . $chrom . "_knownSize";
77
78 for (("outDir", $chromSizeKey, $chromKnownSizeKey)) {
79 errorX("Undefined configuration option: '$_'") unless(defined($config->{derived}{$_}));
80 }
81 for (qw(binSize ssnvQuality_LowerBound sindelQuality_LowerBound
82 isSkipDepthFilters depthFilterMultiple
83 snvMaxFilteredBasecallFrac snvMaxSpanningDeletionFrac
84 indelMaxRefRepeat indelMaxWindowFilteredBasecallFrac
85 indelMaxIntHpolLength)) {
86 errorX("Undefined configuration option: '$_'") unless(defined($config->{user}{$_}));
87 }
88
89 my $outDir = $config->{derived}{outDir};
90 my $chromDir = File::Spec->catdir($outDir,'chromosomes',$chrom);
91 checkDir($outDir,"output");
92 checkDir($chromDir,"output chromosome");
93
94 my $userconfig = $config->{user};
95
96 my $binList = getBinList($config->{derived}{$chromSizeKey},$userconfig->{binSize});
97 for my $binId (@$binList) {
98 my $dir = File::Spec->catdir($chromDir,'bins',$binId);
99 checkDir($dir,"input bin");
100 }
101
102 #
103 # parameters from user config file:
104 #
105
106 # minimum passed ssnv_nt Q-score:
107 my $minQSSNT=$userconfig->{ssnvQuality_LowerBound};
108 # minimum passed sindel_nt Q-score:
109 my $minQSINT=$userconfig->{sindelQuality_LowerBound};
110
111 #
112 # filtration parameters from user config file:
113 #
114
115 # skip depth filters for targeted resequencing:
116 my $isUseDepthFilter=(! $userconfig->{isSkipDepthFilters});
117 # multiple of the normal mean coverage to filter snvs and indels
118 my $depthFilterMultiple=$userconfig->{depthFilterMultiple};
119 # max filtered basecall fraction for any sample
120 my $snvMaxFilteredBasecallFrac=$userconfig->{snvMaxFilteredBasecallFrac};
121 # max snv spanning-deletion fraction for any sample
122 my $snvMaxSpanningDeletionFrac=$userconfig->{snvMaxSpanningDeletionFrac};
123 # max indel reference repeat length
124 my $indelMaxRefRepeat=$userconfig->{indelMaxRefRepeat};
125 # max indel window filter fraction
126 my $indelMaxWindowFilteredBasecallFrac=$userconfig->{indelMaxWindowFilteredBasecallFrac};
127 # max indel interupted hompolymer length:
128 my $indelMaxIntHpolLength=$userconfig->{indelMaxIntHpolLength};
129
130
131 # first we want the normal sample mean chromosome depth:
132 #
133 my $filterCoverage;
134 if($isUseDepthFilter) {
135 my $totalCoverage = 0;
136 for my $binId (@$binList) {
137 my $sFile = File::Spec->catfile($chromDir,'bins',$binId,'strelka.stats');
138 checkFile($sFile,"strelka bin stats");
139 open(my $sFH, '<', $sFile)
140 || errorX("Can't open file: '$sFile' $!");
141
142 my $is_found=0;
143 while(<$sFH>) {
144 next unless(/^NORMAL_NO_REF_N_COVERAGE\s/);
145 my $is_bad = 0;
146 if(not /sample_size:\s*(\d+)\s+/) { $is_bad=1; }
147 my $ss=$1;
148 # leave the regex for mean fairly loose to pick up scientific notation, etc..
149 if(not /mean:\s*(\d[^\s]*|nan)\s+/) { $is_bad=1; }
150 my $mean=$1;
151 errorX("Unexpected format in file: '$sFile'") if($is_bad);
152
153 my $coverage = ( $ss==0 ? 0 : int($ss*$mean) );
154
155 $totalCoverage += $coverage;
156 $is_found=1;
157 last;
158 }
159 close($sFH);
160 errorX("Unexpected format in file: '$sFile'") unless($is_found);
161 }
162
163 my $chromKnownSize = $config->{derived}{$chromKnownSizeKey};
164 my $normalMeanCoverage = ($totalCoverage/$chromKnownSize);
165 $filterCoverage = $normalMeanCoverage*$depthFilterMultiple;
166 }
167
168
169 # add filter description to vcf header unless it already exists
170 # return 1 if filter id already exists, client can decide if this is an error
171 #
172 sub add_vcf_filter($$$) {
173 my ($vcf,$id,$desc) = @_;
174 return 1 if(scalar(@{$vcf->get_header_line(key=>'FILTER', ID=>$id)}));
175 $vcf->add_header_line({key=>'FILTER', ID=>$id,Description=>$desc});
176 return 0;
177 }
178
179
180 sub check_vcf_for_sample($$$) {
181 my ($vcf,$sample,$file) = @_;
182 my $is_found=0;
183 for ($vcf->get_samples()) {
184 $is_found=1 if($_ eq $sample);
185 }
186 errorX("Failed to find sample '$sample' in vcf file '$file'") unless($is_found);
187 }
188
189
190
191 my $depthFiltId="DP";
192
193
194 # Runs all post-call vcf filters:
195 #
196 sub filterSnvFileList(\@$$$) {
197 my ($ifiles,$depthFilterVal,$acceptFileName,$isUseDepthFilter) = @_;
198
199 my $baseFiltId="BCNoise";
200 my $spanFiltId="SpanDel";
201 my $qFiltId="QSS_ref";
202
203 open(my $aFH,'>',$acceptFileName)
204 or errorX("Failed to open file: '$acceptFileName'");
205
206 my $is_first=1;
207 for my $ifile (@$ifiles) {
208 open(my $iFH,'<',"$ifile")
209 or errorX("Failed to open file: '$ifile'");
210 my $vcf = Vcf->new(fh=>$iFH);
211
212 # run some simple header validation on each vcf:
213 $vcf->parse_header();
214 check_vcf_for_sample($vcf,'NORMAL',$ifile);
215 check_vcf_for_sample($vcf,'TUMOR',$ifile);
216
217 if($is_first) {
218 # TODO: update vcf meta-data for chromosome-level filtered files
219 #
220 $vcf->remove_header_line(key=>"cmdline");
221 $vcf->add_header_line({key=>"cmdline",value=>$cmdline});
222 if($isUseDepthFilter) {
223 $vcf->add_header_line({key=>"maxDepth_$chrom",value=>$depthFilterVal});
224 add_vcf_filter($vcf,$depthFiltId,"Greater than ${depthFilterMultiple}x chromosomal mean depth in Normal sample");
225 }
226 add_vcf_filter($vcf,$baseFiltId,"Fraction of basecalls filtered at this site in either sample is at or above $snvMaxFilteredBasecallFrac");
227 add_vcf_filter($vcf,$spanFiltId,"Fraction of reads crossing site with spanning deletions in either sample exceeeds $snvMaxSpanningDeletionFrac");
228 add_vcf_filter($vcf,$qFiltId,"Normal sample is not homozygous ref or ssnv Q-score < $minQSSNT, ie calls with NT!=ref or QSS_NT < $minQSSNT");
229 print $aFH $vcf->format_header();
230 $is_first=0;
231 }
232
233 while(my $x=$vcf->next_data_hash()) {
234 my $norm=$x->{gtypes}->{NORMAL};
235 my $tumr=$x->{gtypes}->{TUMOR};
236
237 my %filters;
238
239 # normal depth filter:
240 my $normalDP=$norm->{DP};
241 if($isUseDepthFilter) {
242 $filters{$depthFiltId} = ($normalDP > $depthFilterVal);
243 }
244
245 # filtered basecall fraction:
246 my $normal_filt=($normalDP>0 ? $norm->{FDP}/$normalDP : 0);
247
248 my $tumorDP=$tumr->{DP};
249 my $tumor_filt=($tumorDP>0 ? $tumr->{FDP}/$tumorDP : 0);
250
251 $filters{$baseFiltId}=(($normal_filt >= $snvMaxFilteredBasecallFrac) or
252 ($tumor_filt >= $snvMaxFilteredBasecallFrac));
253
254 # spanning deletion fraction:
255 my $normalSDP=$norm->{SDP};
256 my $normalSpanTot=($normalDP+$normalSDP);
257 my $normalSpanDelFrac=($normalSpanTot>0 ? ($normalSDP/$normalSpanTot) : 0);
258
259 my $tumorSDP=$tumr->{SDP};
260 my $tumorSpanTot=($tumorDP+$tumorSDP);
261 my $tumorSpanDelFrac=($tumorSpanTot>0 ? ($tumorSDP/$tumorSpanTot) : 0);
262
263 $filters{$spanFiltId}=(($normalSpanDelFrac > $snvMaxSpanningDeletionFrac) or
264 ($tumorSpanDelFrac > $snvMaxSpanningDeletionFrac));
265
266 # Q-val filter:
267 $filters{$qFiltId}=(($x->{INFO}->{NT} ne "ref") or
268 ($x->{INFO}->{QSS_NT} < $minQSSNT));
269
270 $x->{FILTER} = $vcf->add_filter($x->{FILTER},%filters);
271
272 print $aFH $vcf->format_line($x);
273 }
274
275 $vcf->close();
276 close($iFH);
277 }
278
279 close($aFH);
280 }
281
282
283
284 sub updateA2(\@$) {
285 my ($a2,$FH) = @_;
286 my $line=<$FH>;
287 unless(defined($line)) { errorX("Unexpected end of somatic indel window file"); }
288 chomp $line;
289 @$a2 = split("\t",$line);
290 unless(scalar(@$a2)) { errorX("Unexpected format in somatic indel window file"); }
291 }
292
293
294
295 sub filterIndelFileList(\@$$$) {
296 my ($ifiles,$depthFilterVal,$acceptFileName,$isUseDepthFilter) = @_;
297
298 my $repeatFiltId="Repeat";
299 my $iHpolFiltId="iHpol";
300 my $baseFiltId="BCNoise";
301 my $qFiltId="QSI_ref";
302
303 open(my $aFH,'>',$acceptFileName)
304 or errorX("Failed to open file: '$acceptFileName'");
305
306 my $is_first=1;
307 for my $ifile (@$ifiles) {
308 open(my $iFH,'<',"$ifile")
309 or errorX("Failed to open somatic indel file: '$ifile'");
310
311 my $iwfile = $ifile . ".window";
312 open(my $iwFH,'<',"$iwfile")
313 or errorX("Failed to open somatic indel window file: '$iwfile'");
314
315 my @a2; # hold window file data for one line in case we overstep...
316
317 my $vcf = Vcf->new(fh=>$iFH);
318
319 # run some simple header validation on each vcf:
320 $vcf->parse_header();
321 check_vcf_for_sample($vcf,'NORMAL',$ifile);
322 check_vcf_for_sample($vcf,'TUMOR',$ifile);
323
324 if($is_first) {
325 # TODO: update all vcf meta-data for chromosome-level filtered files
326 #
327 $vcf->remove_header_line(key=>"cmdline");
328 $vcf->add_header_line({key=>"cmdline",value=>$cmdline});
329 if($isUseDepthFilter) {
330 $vcf->add_header_line({key=>"maxDepth_$chrom",value=>$depthFilterVal});
331 }
332 $vcf->add_header_line({key=>'FORMAT', ID=>'DP50',Number=>1,Type=>'Float',Description=>'Average tier1 read depth within 50 bases'});
333 $vcf->add_header_line({key=>'FORMAT', ID=>'FDP50',Number=>1,Type=>'Float',Description=>'Average tier1 number of basecalls filtered from original read depth within 50 bases'});
334 $vcf->add_header_line({key=>'FORMAT', ID=>'SUBDP50',Number=>1,Type=>'Float',Description=>'Average number of reads below tier1 mapping quality threshold aligned across sites within 50 bases'});
335 if($isUseDepthFilter) {
336 add_vcf_filter($vcf,$depthFiltId,"Greater than ${depthFilterMultiple}x chromosomal mean depth in Normal sample");
337 }
338 add_vcf_filter($vcf,$repeatFiltId,"Sequence repeat of more than ${indelMaxRefRepeat}x in the reference sequence");
339 add_vcf_filter($vcf,$iHpolFiltId,"Indel overlaps an interupted homopolymer longer than ${indelMaxIntHpolLength}x in the reference sequence");
340 add_vcf_filter($vcf,$baseFiltId,"Average fraction of filtered basecalls within 50 bases of the indel exceeds $indelMaxWindowFilteredBasecallFrac");
341 add_vcf_filter($vcf,$qFiltId,"Normal sample is not homozygous ref or sindel Q-score < $minQSINT, ie calls with NT!=ref or QSI_NT < $minQSINT");
342 print $aFH $vcf->format_header();
343 $is_first=0;
344 }
345
346 while(my $x=$vcf->next_data_hash()) {
347 $vcf->add_format_field($x,'DP50');
348 $vcf->add_format_field($x,'FDP50');
349 $vcf->add_format_field($x,'SUBDP50');
350
351 my $norm=$x->{gtypes}->{NORMAL};
352 my $tumr=$x->{gtypes}->{TUMOR};
353
354 my $chrom=$x->{CHROM};
355 my $pos=int($x->{POS});
356
357 # get matching line from window file:
358 while((scalar(@a2)<2) or
359 (($a2[0] le $chrom) and (int($a2[1]) < $pos))) {
360 updateA2(@a2,$iwFH);
361 }
362 unless(scalar(@a2) and ($a2[0] eq $chrom) and (int($a2[1]) == $pos))
363 { errorX("Can't find somatic indel window position.\nIndel line: " . $vcf->format_line($x) ); }
364
365 # add window data to vcf record:
366 #
367 $norm->{DP50} = $a2[2]+$a2[3];
368 $norm->{FDP50} = $a2[3];
369 $norm->{SUBDP50} = $a2[4];
370 $tumr->{DP50} = $a2[5]+$a2[6];
371 $tumr->{FDP50} = $a2[6];
372 $tumr->{SUBDP50} = $a2[7];
373
374 my %filters;
375
376 # normal depth filter:
377 my $normalDP=$norm->{DP};
378 if($isUseDepthFilter) {
379 $filters{$depthFiltId}=($normalDP > $depthFilterVal);
380 }
381
382 # ref repeat
383 my $refRep=$x->{INFO}->{RC};
384 $filters{$repeatFiltId}=(defined($refRep) and
385 ($refRep > $indelMaxRefRepeat));
386
387 # ihpol
388 my $iHpol=$x->{INFO}->{IHP};
389 $filters{$iHpolFiltId}=(defined($iHpol) and
390 ($iHpol > $indelMaxIntHpolLength));
391
392 # base filt:
393 my $normWinFrac=( $norm->{DP50} ? $norm->{FDP50}/$norm->{DP50} : 0 );
394 my $tumrWinFrac=( $tumr->{DP50} ? $tumr->{FDP50}/$tumr->{DP50} : 0 );
395 $filters{$baseFiltId}=( ($normWinFrac >= $indelMaxWindowFilteredBasecallFrac) or
396 ($tumrWinFrac >= $indelMaxWindowFilteredBasecallFrac) );
397
398 # Q-val filter:
399 $filters{$qFiltId}=(($x->{INFO}->{NT} ne "ref") or
400 ($x->{INFO}->{QSI_NT} < $minQSINT));
401
402 $x->{FILTER} = $vcf->add_filter($x->{FILTER},%filters);
403
404 print $aFH $vcf->format_line($x);
405 }
406
407 $vcf->close();
408 close($iFH);
409 close($iwFH);
410 }
411
412 close($aFH);
413 }
414
415
416
417 my @ssnvFiles;
418 my @sindelFiles;
419 for my $binId (@$binList) {
420 my $ssnvFile = File::Spec->catfile($chromDir,'bins',$binId,'somatic.snvs.unfiltered.vcf');
421 my $sindelFile = File::Spec->catfile($chromDir,'bins',$binId,'somatic.indels.unfiltered.vcf');
422 checkFile($ssnvFile,"bin snv file");
423 checkFile($sindelFile,"bin indel file");
424 push @ssnvFiles,$ssnvFile;
425 push @sindelFiles,$sindelFile;
426 }
427
428 my $ssnvOutFile = File::Spec->catfile($chromDir,"somatic.snvs.vcf");
429 filterSnvFileList(@ssnvFiles,$filterCoverage,$ssnvOutFile,$isUseDepthFilter);
430
431 my $sindelOutFile = File::Spec->catfile($chromDir,"somatic.indels.vcf");
432 filterIndelFileList(@sindelFiles,$filterCoverage,$sindelOutFile,$isUseDepthFilter);
433
434
435 1;