Mercurial > repos > yusuf > triage_rare_genotypes
comparison rare_triage @ 0:0d7a85ddac86 default tip
initial commit of project
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:44:41 -0600 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:0d7a85ddac86 |
---|---|
1 #!/usr/bin/env perl | |
2 | |
3 use strict; | |
4 use warnings; | |
5 | |
6 my $trio = 0; | |
7 if(@ARGV == 1 and $ARGV[0] eq "-v"){ | |
8 print "Version 0.1"; exit; | |
9 } | |
10 if($ARGV[0] eq "-t"){ | |
11 shift @ARGV; | |
12 $trio = 1; | |
13 } | |
14 | |
15 @ARGV == 8 or @ARGV == 9 or die "Usage: $0 [-t(rio)] <input hgvs annotated variant table.txt> <novel output.txt> <rare output.txt> <maf reporting threshold> <max. gene DNA length caveat cutoff> <report non-coding variants too (true|false)> <num samples allowed to be missing> <dominant model (true|false)> [known false positive novels.txt]\n"; | |
16 | |
17 my %known_fps; # a priori known false positives | |
18 if(@ARGV == 9){ | |
19 open(FP, $ARGV[8]) | |
20 or die "Cannot open $ARGV[8] for reading: $!\n"; | |
21 while(<FP>){ | |
22 next if /^\s*#/; | |
23 chomp; | |
24 my @F = split /\t/, $_; | |
25 $known_fps{"$F[0]:$F[1]"} = 1; | |
26 } | |
27 close(FP); | |
28 pop @ARGV; | |
29 } | |
30 | |
31 my $maybe_dominant = pop @ARGV; # i.e. should novel non-benign het calls be considered across the cohort? | |
32 $maybe_dominant = $maybe_dominant =~ /^(?:t(?:rue)?|1|y(?:es)?)/i; | |
33 my $num_allowed_missing = pop @ARGV; # report rare events even if not universal, as long as coverage is missing in the non-conforming samples | |
34 my $all_variants = pop @ARGV; # restrict to protein-coding only? | |
35 $all_variants = $all_variants =~ /^(?:t(?:rue)?|1|y(?:es)?)/i; | |
36 my $gene_length_threshold = pop @ARGV; | |
37 my $maf_threshold = pop @ARGV; | |
38 | |
39 my %gene_vars; | |
40 my %gene2chr; | |
41 my %gene2desc; | |
42 my %pos2info; | |
43 my %pos2hgvs; | |
44 my %sample_count; | |
45 open(IN, $ARGV[0]) | |
46 or die "Cannot open $ARGV[0] for reading: $!\n"; | |
47 open(NOVEL, ">$ARGV[1]") | |
48 or die "Cannot open $ARGV[1] for writing: $!\n"; | |
49 open(RARE, ">$ARGV[2]") | |
50 or die "Cannot open $ARGV[2] for writing: $!\n"; | |
51 <IN>; # header | |
52 while(<IN>){ | |
53 chomp; | |
54 my @F = split /\t/, $_; | |
55 my $chr = $F[0]; | |
56 my $pos = $F[1]; | |
57 next if exists $known_fps{"$chr:$pos"}; | |
58 my $var_depth = $F[6]; | |
59 my $total_depth = $F[7]; | |
60 my @rsid_list = split /,/, $F[10]; | |
61 my @dbsnp_ver_list = split /,/, $F[11]; | |
62 my @maf_list = split /,/, $F[12]; | |
63 my $rare_event = 0; | |
64 for (my $i = 0; $i <= $#maf_list; $i++){ | |
65 next if $maf_list[$i] eq "-"; | |
66 if($maf_list[$i] eq "NA" or $maf_list[$i] <= $maf_threshold){ | |
67 $rare_event = 1 if $rsid_list[$i] eq "novel" or $rsid_list[$i] =~ /^rs/ and $dbsnp_ver_list[$i] >= 132; | |
68 # print "$chr:$pos $maf_list[$i] $rsid_list[$i] $dbsnp_ver_list[$i] : $rare_event\n"; | |
69 last; | |
70 } | |
71 } | |
72 next unless $rare_event; | |
73 my $sift = $F[13]; | |
74 my $polyphen = $F[15]; | |
75 my $dna_hgvs = $F[18]; | |
76 next if $F[9] =~ /UTR/; | |
77 my $transcript_id = $F[19]; | |
78 my $transcript_length = $F[20]; | |
79 my $prot_hgvs = $F[21]; | |
80 next if $prot_hgvs =~ /=$/; | |
81 my $zygosity_list = $F[22]; | |
82 my $sample_list = $F[$#F]; | |
83 my $phase_list = $F[$#F-1]; | |
84 my $caveats = $F[$#F-2]; | |
85 my $desc = $F[$#F-3]; | |
86 my $tissues = $F[$#F-5]; | |
87 my $gene = $F[17]; | |
88 next if not defined $gene; | |
89 #for my $gene (split /;/, $gene_list){ | |
90 next if not $gene =~ /\S/; # only work with named genes | |
91 $gene2chr{$gene} = $chr if not exists $gene2chr{$gene}; | |
92 $gene2desc{$gene} = $desc if not exists $gene2desc{$gene}; | |
93 $pos2info{"$chr:$pos"} = [join(",",@maf_list), $sift, $polyphen, $caveats, $var_depth, $total_depth, $transcript_length, join(",",@rsid_list)] if not exists $pos2info{"$chr:$pos"}; | |
94 $pos2hgvs{"$chr:$pos"}->{$dna_hgvs} = [] if not exists $pos2hgvs{"$chr:$pos"}->{$dna_hgvs}; | |
95 $pos2hgvs{"$chr:$pos"}->{$prot_hgvs} = [] if not exists $pos2hgvs{"$chr:$pos"}->{$dna_hgvs}; | |
96 push @{$pos2hgvs{"$chr:$pos"}->{$dna_hgvs}}, $transcript_id; | |
97 push @{$pos2hgvs{"$chr:$pos"}->{$prot_hgvs}}, $transcript_id; | |
98 $gene_vars{$gene} = {} if not exists $gene_vars{$gene}; | |
99 my @zygosities = split /; /, $zygosity_list; | |
100 my @phases = split /; /, $phase_list; | |
101 my @samples = split /; /, $sample_list; | |
102 for(my $i = 0; $i <= $#samples; $i++){ | |
103 my $sample = $samples[$i]; | |
104 my $phase = $phases[$i]; | |
105 my $zygosity = $zygosities[$i]; | |
106 $gene_vars{$gene}->{$sample} = {} if not exists $gene_vars{$gene}->{$sample}; | |
107 $gene_vars{$gene}->{$sample}->{$pos} = [$zygosity, $phase]; | |
108 $sample_count{$sample}++; | |
109 # } | |
110 } | |
111 } | |
112 close(IN); | |
113 | |
114 print NOVEL "Gene Name\tChr/Pos\tIn samples\tEvent Type\tMAF\tSIFT\tPolyPhen2\tGenotyping Caveats\tTranscript Length\tdbSNP ID\tHGVS DNA\tHGVS AA\n"; | |
115 print RARE "Gene Name\tChr/Pos\tIn samples\tEvent Type\tMAF\tSIFT\tPolyPhen2\tGenotyping Caveats\tTranscript Length\tidbSNP ID\tHGVS DNA\tHGVS AA\n"; | |
116 my @sample_names = $trio ? qw(Father Mother Child) : sort keys %sample_count; | |
117 for my $gene (sort keys %gene_vars){ | |
118 #next unless scalar(keys %{$gene_vars{$gene}}) == scalar(@sample_names); # require complete gene penetrance | |
119 my $chr = $gene2chr{$gene}; | |
120 my %rare_phase; | |
121 my %rare_events; | |
122 my %rare_samples; | |
123 my %sample_counts; | |
124 # For trios, it's assumed to be in the order: father, mother, child | |
125 for my $sample_index (0..$#sample_names){ | |
126 my $sample = $sample_names[$sample_index]; | |
127 next unless exists $gene_vars{$gene}->{$sample}; | |
128 for my $pos (keys %{$gene_vars{$gene}->{$sample}}){ | |
129 next unless $all_variants or grep /p\.|c\.\d+_\d+\+\d|c\.\d+\+[12]_|c\.\d+-[12]_|c\.\d+-\d+_\d+$/, keys %{$pos2hgvs{"$chr:$pos"}}; | |
130 print "Checking $chr:$pos\n"; | |
131 # For now, only look for homo rare or compound het (known from explicit haplotype phasing data provided, or trio) | |
132 if($gene_vars{$gene}->{$sample}->{$pos}->[0] =~ /homo/){ | |
133 my $maf_list = $pos2info{"$chr:$pos"}->[0]; | |
134 for my $maf (split /,/, $maf_list){ | |
135 next if $maf eq "-"; | |
136 if($trio){ # only report if not homozyguous in either parent | |
137 next if exists $gene_vars{$gene}->{$sample_names[0]} and exists $gene_vars{$gene}->{$sample_names[0]}->{$pos} and $gene_vars{$gene}->{$sample_names[0]}->{$pos}->[0] =~ /homo/ or | |
138 exists $gene_vars{$gene}->{$sample_names[1]} and exists $gene_vars{$gene}->{$sample_names[1]}->{$pos} and $gene_vars{$gene}->{$sample_names[1]}->{$pos}->[0] =~ /homo/; | |
139 } | |
140 if($maf eq "NA"){ | |
141 if($pos2info{"$chr:$pos"}->[1] eq "1"){ # SIFT says its benign | |
142 $rare_events{$pos} = "benign homo novel variant"; | |
143 } | |
144 else{ | |
145 $rare_events{$pos} = "homo novel variant"; | |
146 } | |
147 $rare_samples{$pos} = [] unless exists $rare_samples{$pos}; | |
148 push @{$rare_samples{$pos}}, $sample; | |
149 $sample_counts{$sample}++; | |
150 last; | |
151 } | |
152 # if here, it's rare, not novel | |
153 next if is_benign(\%pos2info, $chr, $pos); | |
154 # if here, rare and maybe not benign | |
155 $rare_events{$pos} = "homo rare variant"; | |
156 $rare_samples{$pos} = [] unless exists $rare_samples{$pos}; | |
157 push @{$rare_samples{$pos}}, $sample; | |
158 $sample_counts{$sample}++; | |
159 } | |
160 } # end homo | |
161 # het novel (if dominant model enabled), possibly non-benign | |
162 elsif($maybe_dominant and $pos2info{"$chr:$pos"}->[0] =~ /NA/ and $pos2info{"$chr:$pos"}->[7] =~ /novel/){ | |
163 next if $pos2info{"$chr:$pos"}->[3] =~ /non-unique/; # or is_benign(\%pos2info, $chr, $pos); # has mapping caveat or is almost definitely benign | |
164 if($trio){ # only report if not present in either parent | |
165 next if exists $gene_vars{$gene}->{$sample_names[0]} and exists $gene_vars{$gene}->{$sample_names[0]}->{$pos} or | |
166 exists $gene_vars{$gene}->{$sample_names[1]} and exists $gene_vars{$gene}->{$sample_names[1]}->{$pos}; | |
167 } | |
168 $rare_events{$pos} = "het novel variant, possibly non-benign under dominant disease model"; | |
169 $rare_samples{$pos} = [] unless exists $rare_samples{$pos}; | |
170 push @{$rare_samples{$pos}}, $sample; | |
171 $sample_counts{$sample}++; | |
172 } | |
173 # check if two rare het variants are trans in a sample, using phasing info | |
174 # the phase info should look like | |
175 # A-chrX:134986768-134986769 | |
176 # B-chrX:134986768-134986769 | |
177 # or, in the case of trios we have | |
178 # Mother-GENENAME:1-length | |
179 # Father-GENENAME:1-length | |
180 # todo: handle case where more than one type of event happens at one position | |
181 elsif($trio or $gene_vars{$gene}->{$sample}->{$pos}->[1] =~ /^(\S+?)-(\S+):(\d+)-(\d+)/){ | |
182 #next if is_benign(\%pos2info, $chr, $pos); # don't care about benign rare hets | |
183 my $novel = $pos2info{"$chr:$pos"}->[0] =~ /NA/; | |
184 my $phase_group = $trio ? get_phase_group($gene_vars{$gene}->{$sample_names[0]}, $gene_vars{$gene}->{$sample_names[1]}, $pos) : $1; # if trio, need to determine of if the allele came from the father or mother | |
185 #next if $phase_group eq "Either"; | |
186 my $phase_chr = $trio ? $gene2chr{$gene} : $2; | |
187 my $phase_start = $trio ? 1 : $3; | |
188 my $phase_end = $trio ? $pos2info{"$chr:$pos"}->[6] : $4; | |
189 if(exists $rare_phase{"$phase_chr:$phase_start-$phase_end"}){ | |
190 my $other_group = $rare_phase{"$phase_chr:$phase_start-$phase_end"}->[0]; | |
191 next if $other_group eq $phase_group; # rare events on same allele...ignore for now as LD artifact (todo: revisit this???) | |
192 my $other_pos = $rare_phase{"$phase_chr:$phase_start-$phase_end"}->[1]; | |
193 my $other_novel = $rare_phase{"$phase_chr:$phase_start-$phase_end"}->[2]; | |
194 my $other_sample = $rare_phase{"$phase_chr:$phase_start-$phase_end"}->[3]; | |
195 $rare_events{$pos} = "het ".($novel ? "novel":"rare")." variant phased trans with het ".($other_novel ? "novel":"rare")." variant at $phase_chr:$other_pos"; | |
196 $rare_events{$other_pos} = "het ".($novel ? "novel":"rare")." variant phased trans with het ".($other_novel ? "novel":"rare")." variant at $phase_chr:$pos"; | |
197 $rare_samples{$pos} = [] unless exists $rare_samples{$pos}; | |
198 $rare_samples{$other_pos} = [] unless exists $rare_samples{$pos}; | |
199 push @{$rare_samples{$pos}}, $sample; | |
200 push @{$rare_samples{$other_pos}}, $other_sample; | |
201 # todo: could have more than two rare trans hets...how to report? (all reported aganist first one right now) | |
202 $sample_counts{$sample}++; | |
203 $sample_counts{$other_sample}++; | |
204 } | |
205 else{ | |
206 $rare_phase{"$phase_chr:$phase_start-$phase_end"} = [$phase_group, $pos, $novel, $sample]; # note as first rare allele, in case we find another one later in the same gene | |
207 } | |
208 } | |
209 } | |
210 } | |
211 # Don't have rare events in all samples? | |
212 next if $trio and not exists $sample_counts{"Child"}; | |
213 my $num_missing = scalar(@sample_names) - scalar(keys %sample_counts); | |
214 my $reason = ""; | |
215 if(not $trio and $num_missing){ | |
216 next if $num_allowed_missing < $num_missing; | |
217 # todo: check if missing from a sample due to low coverage | |
218 my @missing_list; | |
219 for my $name (@sample_names){ | |
220 push @missing_list, $name unless exists $sample_counts{$name}; | |
221 } | |
222 $reason = "Rare events for this gene are not found in all cohort samples (missing ". join(", ", @missing_list).") / "; | |
223 # next; | |
224 } | |
225 | |
226 my $novelty = 0; | |
227 my $num_caveats = 0; | |
228 my $assume_dominance = 0; | |
229 my $benign = 0; | |
230 my $gene_length; | |
231 my @protein_events; | |
232 for my $pos (sort {$a <=> $b} keys %rare_events){ | |
233 $novelty++ if $rare_events{$pos} eq "homo novel variant" or $rare_events{$pos} =~ /^het novel/; | |
234 $assume_dominance++ if $rare_events{$pos} =~ /dominant/; | |
235 $benign++ if $rare_events{$pos} =~ /benign/; | |
236 $num_caveats++ if $pos2info{"$chr:$pos"}->[3] =~ /\S/; | |
237 $gene_length = $pos2info{"$chr:$pos"}->[6] if not defined $gene_length; | |
238 my @dna_info; | |
239 my @prot_info; | |
240 for my $hgvs (sort keys %{$pos2hgvs{"$chr:$pos"}}){ | |
241 if($hgvs =~ /^p\./){ | |
242 push @prot_info, join(",", @{$pos2hgvs{"$chr:$pos"}->{$hgvs}}).":$hgvs"; | |
243 } | |
244 else{ | |
245 push @dna_info, join(",", @{$pos2hgvs{"$chr:$pos"}->{$hgvs}}).":$hgvs"; | |
246 } | |
247 } | |
248 push @protein_events, "\t$pos\t". join("/", @{$rare_samples{$pos}}). "\t". $rare_events{$pos}. "\t". join("\t", @{$pos2info{"$chr:$pos"}}). | |
249 "\t". join(";", @dna_info). "\t". join(";", @prot_info). "\n"; | |
250 } | |
251 if($novelty){ | |
252 if($gene_length > $gene_length_threshold){ | |
253 $reason .= "Exceptionally long gene, prone to random novel events at heterogeneous loci"; | |
254 } | |
255 elsif($num_caveats){ | |
256 if($num_caveats+1 >= @protein_events or $num_caveats >= 0.9*@protein_events){ | |
257 $reason .= "Most variant calls for this gene have caveats"; | |
258 } | |
259 elsif($assume_dominance){ | |
260 my $num_events += scalar(@protein_events)-$assume_dominance; | |
261 if($num_caveats+1 >= $num_events or $num_caveats >= 0.9*$num_events){ | |
262 $reason .= "Apart from novel hets, most variant calls for this gene have caveats"; | |
263 } | |
264 } | |
265 } | |
266 } | |
267 elsif($benign == @protein_events){ | |
268 $reason .= "The gene probably only contains benign variants"; | |
269 } | |
270 else{ | |
271 $reason .= "Variants found in the general population, albeit rarely"; | |
272 } | |
273 | |
274 # fix formatting if missing sample is the only problem | |
275 $reason =~ s( / $)(); | |
276 | |
277 if($novelty and not $reason){ | |
278 print NOVEL "$gene\t$gene2chr{$gene}\t$gene2desc{$gene}\n", @protein_events; | |
279 } | |
280 else{ | |
281 print RARE "$gene\t$gene2chr{$gene}\t$gene2desc{$gene}\t$reason\n", @protein_events; | |
282 } | |
283 } | |
284 | |
285 sub get_phase_group{ | |
286 my ($father_variants, $mother_variants, $pos) = @_; | |
287 | |
288 if(defined $father_variants and exists $father_variants->{$pos}){ | |
289 if(defined $mother_variants->{$pos} and exists $mother_variants->{$pos}){ | |
290 return "Either"; # in future, trio-based phasing that models meiotic recombination may help here (e.g PHASE). | |
291 } | |
292 else{ | |
293 return "Father"; | |
294 } | |
295 } | |
296 elsif(defined $mother_variants->{$pos} and exists $mother_variants->{$pos}){ | |
297 return "Mother"; | |
298 } | |
299 else{ | |
300 return "De_novo"; | |
301 } | |
302 } | |
303 | |
304 sub is_benign{ | |
305 my ($pos2info, $chr, $pos) = @_; | |
306 if($pos2info->{"$chr:$pos"}->[1] eq "1"){ | |
307 return 1; # sift thinks it's completely benign | |
308 } | |
309 elsif($pos2info->{"$chr:$pos"}->[1] eq "NA"){ | |
310 if($pos2info->{"$chr:$pos"}->[2] ne "benign"){ # sift doesn't know, and polyphen thinks it might be bad | |
311 return 0; | |
312 } | |
313 } | |
314 elsif($pos2info->{"$chr:$pos"}->[1] < 0.06){ # sift thinks it might be bad | |
315 return 0; | |
316 } | |
317 elsif($pos2info->{"$chr:$pos"}->[2] ne "benign"){ # polyphen thinks it might be bad | |
318 return 0; | |
319 } | |
320 return 1; # nobody thinks it's bad | |
321 } |