comparison rare_triage @ 0:0d7a85ddac86 default tip

initial commit of project
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:44:41 -0600
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:0d7a85ddac86
1 #!/usr/bin/env perl
2
3 use strict;
4 use warnings;
5
6 my $trio = 0;
7 if(@ARGV == 1 and $ARGV[0] eq "-v"){
8 print "Version 0.1"; exit;
9 }
10 if($ARGV[0] eq "-t"){
11 shift @ARGV;
12 $trio = 1;
13 }
14
15 @ARGV == 8 or @ARGV == 9 or die "Usage: $0 [-t(rio)] <input hgvs annotated variant table.txt> <novel output.txt> <rare output.txt> <maf reporting threshold> <max. gene DNA length caveat cutoff> <report non-coding variants too (true|false)> <num samples allowed to be missing> <dominant model (true|false)> [known false positive novels.txt]\n";
16
17 my %known_fps; # a priori known false positives
18 if(@ARGV == 9){
19 open(FP, $ARGV[8])
20 or die "Cannot open $ARGV[8] for reading: $!\n";
21 while(<FP>){
22 next if /^\s*#/;
23 chomp;
24 my @F = split /\t/, $_;
25 $known_fps{"$F[0]:$F[1]"} = 1;
26 }
27 close(FP);
28 pop @ARGV;
29 }
30
31 my $maybe_dominant = pop @ARGV; # i.e. should novel non-benign het calls be considered across the cohort?
32 $maybe_dominant = $maybe_dominant =~ /^(?:t(?:rue)?|1|y(?:es)?)/i;
33 my $num_allowed_missing = pop @ARGV; # report rare events even if not universal, as long as coverage is missing in the non-conforming samples
34 my $all_variants = pop @ARGV; # restrict to protein-coding only?
35 $all_variants = $all_variants =~ /^(?:t(?:rue)?|1|y(?:es)?)/i;
36 my $gene_length_threshold = pop @ARGV;
37 my $maf_threshold = pop @ARGV;
38
39 my %gene_vars;
40 my %gene2chr;
41 my %gene2desc;
42 my %pos2info;
43 my %pos2hgvs;
44 my %sample_count;
45 open(IN, $ARGV[0])
46 or die "Cannot open $ARGV[0] for reading: $!\n";
47 open(NOVEL, ">$ARGV[1]")
48 or die "Cannot open $ARGV[1] for writing: $!\n";
49 open(RARE, ">$ARGV[2]")
50 or die "Cannot open $ARGV[2] for writing: $!\n";
51 <IN>; # header
52 while(<IN>){
53 chomp;
54 my @F = split /\t/, $_;
55 my $chr = $F[0];
56 my $pos = $F[1];
57 next if exists $known_fps{"$chr:$pos"};
58 my $var_depth = $F[6];
59 my $total_depth = $F[7];
60 my @rsid_list = split /,/, $F[10];
61 my @dbsnp_ver_list = split /,/, $F[11];
62 my @maf_list = split /,/, $F[12];
63 my $rare_event = 0;
64 for (my $i = 0; $i <= $#maf_list; $i++){
65 next if $maf_list[$i] eq "-";
66 if($maf_list[$i] eq "NA" or $maf_list[$i] <= $maf_threshold){
67 $rare_event = 1 if $rsid_list[$i] eq "novel" or $rsid_list[$i] =~ /^rs/ and $dbsnp_ver_list[$i] >= 132;
68 # print "$chr:$pos $maf_list[$i] $rsid_list[$i] $dbsnp_ver_list[$i] : $rare_event\n";
69 last;
70 }
71 }
72 next unless $rare_event;
73 my $sift = $F[13];
74 my $polyphen = $F[15];
75 my $dna_hgvs = $F[18];
76 next if $F[9] =~ /UTR/;
77 my $transcript_id = $F[19];
78 my $transcript_length = $F[20];
79 my $prot_hgvs = $F[21];
80 next if $prot_hgvs =~ /=$/;
81 my $zygosity_list = $F[22];
82 my $sample_list = $F[$#F];
83 my $phase_list = $F[$#F-1];
84 my $caveats = $F[$#F-2];
85 my $desc = $F[$#F-3];
86 my $tissues = $F[$#F-5];
87 my $gene = $F[17];
88 next if not defined $gene;
89 #for my $gene (split /;/, $gene_list){
90 next if not $gene =~ /\S/; # only work with named genes
91 $gene2chr{$gene} = $chr if not exists $gene2chr{$gene};
92 $gene2desc{$gene} = $desc if not exists $gene2desc{$gene};
93 $pos2info{"$chr:$pos"} = [join(",",@maf_list), $sift, $polyphen, $caveats, $var_depth, $total_depth, $transcript_length, join(",",@rsid_list)] if not exists $pos2info{"$chr:$pos"};
94 $pos2hgvs{"$chr:$pos"}->{$dna_hgvs} = [] if not exists $pos2hgvs{"$chr:$pos"}->{$dna_hgvs};
95 $pos2hgvs{"$chr:$pos"}->{$prot_hgvs} = [] if not exists $pos2hgvs{"$chr:$pos"}->{$dna_hgvs};
96 push @{$pos2hgvs{"$chr:$pos"}->{$dna_hgvs}}, $transcript_id;
97 push @{$pos2hgvs{"$chr:$pos"}->{$prot_hgvs}}, $transcript_id;
98 $gene_vars{$gene} = {} if not exists $gene_vars{$gene};
99 my @zygosities = split /; /, $zygosity_list;
100 my @phases = split /; /, $phase_list;
101 my @samples = split /; /, $sample_list;
102 for(my $i = 0; $i <= $#samples; $i++){
103 my $sample = $samples[$i];
104 my $phase = $phases[$i];
105 my $zygosity = $zygosities[$i];
106 $gene_vars{$gene}->{$sample} = {} if not exists $gene_vars{$gene}->{$sample};
107 $gene_vars{$gene}->{$sample}->{$pos} = [$zygosity, $phase];
108 $sample_count{$sample}++;
109 # }
110 }
111 }
112 close(IN);
113
114 print NOVEL "Gene Name\tChr/Pos\tIn samples\tEvent Type\tMAF\tSIFT\tPolyPhen2\tGenotyping Caveats\tTranscript Length\tdbSNP ID\tHGVS DNA\tHGVS AA\n";
115 print RARE "Gene Name\tChr/Pos\tIn samples\tEvent Type\tMAF\tSIFT\tPolyPhen2\tGenotyping Caveats\tTranscript Length\tidbSNP ID\tHGVS DNA\tHGVS AA\n";
116 my @sample_names = $trio ? qw(Father Mother Child) : sort keys %sample_count;
117 for my $gene (sort keys %gene_vars){
118 #next unless scalar(keys %{$gene_vars{$gene}}) == scalar(@sample_names); # require complete gene penetrance
119 my $chr = $gene2chr{$gene};
120 my %rare_phase;
121 my %rare_events;
122 my %rare_samples;
123 my %sample_counts;
124 # For trios, it's assumed to be in the order: father, mother, child
125 for my $sample_index (0..$#sample_names){
126 my $sample = $sample_names[$sample_index];
127 next unless exists $gene_vars{$gene}->{$sample};
128 for my $pos (keys %{$gene_vars{$gene}->{$sample}}){
129 next unless $all_variants or grep /p\.|c\.\d+_\d+\+\d|c\.\d+\+[12]_|c\.\d+-[12]_|c\.\d+-\d+_\d+$/, keys %{$pos2hgvs{"$chr:$pos"}};
130 print "Checking $chr:$pos\n";
131 # For now, only look for homo rare or compound het (known from explicit haplotype phasing data provided, or trio)
132 if($gene_vars{$gene}->{$sample}->{$pos}->[0] =~ /homo/){
133 my $maf_list = $pos2info{"$chr:$pos"}->[0];
134 for my $maf (split /,/, $maf_list){
135 next if $maf eq "-";
136 if($trio){ # only report if not homozyguous in either parent
137 next if exists $gene_vars{$gene}->{$sample_names[0]} and exists $gene_vars{$gene}->{$sample_names[0]}->{$pos} and $gene_vars{$gene}->{$sample_names[0]}->{$pos}->[0] =~ /homo/ or
138 exists $gene_vars{$gene}->{$sample_names[1]} and exists $gene_vars{$gene}->{$sample_names[1]}->{$pos} and $gene_vars{$gene}->{$sample_names[1]}->{$pos}->[0] =~ /homo/;
139 }
140 if($maf eq "NA"){
141 if($pos2info{"$chr:$pos"}->[1] eq "1"){ # SIFT says its benign
142 $rare_events{$pos} = "benign homo novel variant";
143 }
144 else{
145 $rare_events{$pos} = "homo novel variant";
146 }
147 $rare_samples{$pos} = [] unless exists $rare_samples{$pos};
148 push @{$rare_samples{$pos}}, $sample;
149 $sample_counts{$sample}++;
150 last;
151 }
152 # if here, it's rare, not novel
153 next if is_benign(\%pos2info, $chr, $pos);
154 # if here, rare and maybe not benign
155 $rare_events{$pos} = "homo rare variant";
156 $rare_samples{$pos} = [] unless exists $rare_samples{$pos};
157 push @{$rare_samples{$pos}}, $sample;
158 $sample_counts{$sample}++;
159 }
160 } # end homo
161 # het novel (if dominant model enabled), possibly non-benign
162 elsif($maybe_dominant and $pos2info{"$chr:$pos"}->[0] =~ /NA/ and $pos2info{"$chr:$pos"}->[7] =~ /novel/){
163 next if $pos2info{"$chr:$pos"}->[3] =~ /non-unique/; # or is_benign(\%pos2info, $chr, $pos); # has mapping caveat or is almost definitely benign
164 if($trio){ # only report if not present in either parent
165 next if exists $gene_vars{$gene}->{$sample_names[0]} and exists $gene_vars{$gene}->{$sample_names[0]}->{$pos} or
166 exists $gene_vars{$gene}->{$sample_names[1]} and exists $gene_vars{$gene}->{$sample_names[1]}->{$pos};
167 }
168 $rare_events{$pos} = "het novel variant, possibly non-benign under dominant disease model";
169 $rare_samples{$pos} = [] unless exists $rare_samples{$pos};
170 push @{$rare_samples{$pos}}, $sample;
171 $sample_counts{$sample}++;
172 }
173 # check if two rare het variants are trans in a sample, using phasing info
174 # the phase info should look like
175 # A-chrX:134986768-134986769
176 # B-chrX:134986768-134986769
177 # or, in the case of trios we have
178 # Mother-GENENAME:1-length
179 # Father-GENENAME:1-length
180 # todo: handle case where more than one type of event happens at one position
181 elsif($trio or $gene_vars{$gene}->{$sample}->{$pos}->[1] =~ /^(\S+?)-(\S+):(\d+)-(\d+)/){
182 #next if is_benign(\%pos2info, $chr, $pos); # don't care about benign rare hets
183 my $novel = $pos2info{"$chr:$pos"}->[0] =~ /NA/;
184 my $phase_group = $trio ? get_phase_group($gene_vars{$gene}->{$sample_names[0]}, $gene_vars{$gene}->{$sample_names[1]}, $pos) : $1; # if trio, need to determine of if the allele came from the father or mother
185 #next if $phase_group eq "Either";
186 my $phase_chr = $trio ? $gene2chr{$gene} : $2;
187 my $phase_start = $trio ? 1 : $3;
188 my $phase_end = $trio ? $pos2info{"$chr:$pos"}->[6] : $4;
189 if(exists $rare_phase{"$phase_chr:$phase_start-$phase_end"}){
190 my $other_group = $rare_phase{"$phase_chr:$phase_start-$phase_end"}->[0];
191 next if $other_group eq $phase_group; # rare events on same allele...ignore for now as LD artifact (todo: revisit this???)
192 my $other_pos = $rare_phase{"$phase_chr:$phase_start-$phase_end"}->[1];
193 my $other_novel = $rare_phase{"$phase_chr:$phase_start-$phase_end"}->[2];
194 my $other_sample = $rare_phase{"$phase_chr:$phase_start-$phase_end"}->[3];
195 $rare_events{$pos} = "het ".($novel ? "novel":"rare")." variant phased trans with het ".($other_novel ? "novel":"rare")." variant at $phase_chr:$other_pos";
196 $rare_events{$other_pos} = "het ".($novel ? "novel":"rare")." variant phased trans with het ".($other_novel ? "novel":"rare")." variant at $phase_chr:$pos";
197 $rare_samples{$pos} = [] unless exists $rare_samples{$pos};
198 $rare_samples{$other_pos} = [] unless exists $rare_samples{$pos};
199 push @{$rare_samples{$pos}}, $sample;
200 push @{$rare_samples{$other_pos}}, $other_sample;
201 # todo: could have more than two rare trans hets...how to report? (all reported aganist first one right now)
202 $sample_counts{$sample}++;
203 $sample_counts{$other_sample}++;
204 }
205 else{
206 $rare_phase{"$phase_chr:$phase_start-$phase_end"} = [$phase_group, $pos, $novel, $sample]; # note as first rare allele, in case we find another one later in the same gene
207 }
208 }
209 }
210 }
211 # Don't have rare events in all samples?
212 next if $trio and not exists $sample_counts{"Child"};
213 my $num_missing = scalar(@sample_names) - scalar(keys %sample_counts);
214 my $reason = "";
215 if(not $trio and $num_missing){
216 next if $num_allowed_missing < $num_missing;
217 # todo: check if missing from a sample due to low coverage
218 my @missing_list;
219 for my $name (@sample_names){
220 push @missing_list, $name unless exists $sample_counts{$name};
221 }
222 $reason = "Rare events for this gene are not found in all cohort samples (missing ". join(", ", @missing_list).") / ";
223 # next;
224 }
225
226 my $novelty = 0;
227 my $num_caveats = 0;
228 my $assume_dominance = 0;
229 my $benign = 0;
230 my $gene_length;
231 my @protein_events;
232 for my $pos (sort {$a <=> $b} keys %rare_events){
233 $novelty++ if $rare_events{$pos} eq "homo novel variant" or $rare_events{$pos} =~ /^het novel/;
234 $assume_dominance++ if $rare_events{$pos} =~ /dominant/;
235 $benign++ if $rare_events{$pos} =~ /benign/;
236 $num_caveats++ if $pos2info{"$chr:$pos"}->[3] =~ /\S/;
237 $gene_length = $pos2info{"$chr:$pos"}->[6] if not defined $gene_length;
238 my @dna_info;
239 my @prot_info;
240 for my $hgvs (sort keys %{$pos2hgvs{"$chr:$pos"}}){
241 if($hgvs =~ /^p\./){
242 push @prot_info, join(",", @{$pos2hgvs{"$chr:$pos"}->{$hgvs}}).":$hgvs";
243 }
244 else{
245 push @dna_info, join(",", @{$pos2hgvs{"$chr:$pos"}->{$hgvs}}).":$hgvs";
246 }
247 }
248 push @protein_events, "\t$pos\t". join("/", @{$rare_samples{$pos}}). "\t". $rare_events{$pos}. "\t". join("\t", @{$pos2info{"$chr:$pos"}}).
249 "\t". join(";", @dna_info). "\t". join(";", @prot_info). "\n";
250 }
251 if($novelty){
252 if($gene_length > $gene_length_threshold){
253 $reason .= "Exceptionally long gene, prone to random novel events at heterogeneous loci";
254 }
255 elsif($num_caveats){
256 if($num_caveats+1 >= @protein_events or $num_caveats >= 0.9*@protein_events){
257 $reason .= "Most variant calls for this gene have caveats";
258 }
259 elsif($assume_dominance){
260 my $num_events += scalar(@protein_events)-$assume_dominance;
261 if($num_caveats+1 >= $num_events or $num_caveats >= 0.9*$num_events){
262 $reason .= "Apart from novel hets, most variant calls for this gene have caveats";
263 }
264 }
265 }
266 }
267 elsif($benign == @protein_events){
268 $reason .= "The gene probably only contains benign variants";
269 }
270 else{
271 $reason .= "Variants found in the general population, albeit rarely";
272 }
273
274 # fix formatting if missing sample is the only problem
275 $reason =~ s( / $)();
276
277 if($novelty and not $reason){
278 print NOVEL "$gene\t$gene2chr{$gene}\t$gene2desc{$gene}\n", @protein_events;
279 }
280 else{
281 print RARE "$gene\t$gene2chr{$gene}\t$gene2desc{$gene}\t$reason\n", @protein_events;
282 }
283 }
284
285 sub get_phase_group{
286 my ($father_variants, $mother_variants, $pos) = @_;
287
288 if(defined $father_variants and exists $father_variants->{$pos}){
289 if(defined $mother_variants->{$pos} and exists $mother_variants->{$pos}){
290 return "Either"; # in future, trio-based phasing that models meiotic recombination may help here (e.g PHASE).
291 }
292 else{
293 return "Father";
294 }
295 }
296 elsif(defined $mother_variants->{$pos} and exists $mother_variants->{$pos}){
297 return "Mother";
298 }
299 else{
300 return "De_novo";
301 }
302 }
303
304 sub is_benign{
305 my ($pos2info, $chr, $pos) = @_;
306 if($pos2info->{"$chr:$pos"}->[1] eq "1"){
307 return 1; # sift thinks it's completely benign
308 }
309 elsif($pos2info->{"$chr:$pos"}->[1] eq "NA"){
310 if($pos2info->{"$chr:$pos"}->[2] ne "benign"){ # sift doesn't know, and polyphen thinks it might be bad
311 return 0;
312 }
313 }
314 elsif($pos2info->{"$chr:$pos"}->[1] < 0.06){ # sift thinks it might be bad
315 return 0;
316 }
317 elsif($pos2info->{"$chr:$pos"}->[2] ne "benign"){ # polyphen thinks it might be bad
318 return 0;
319 }
320 return 1; # nobody thinks it's bad
321 }