# HG changeset patch # User Yusuf Ali # Date 1427312681 21600 # Node ID 0d7a85ddac869463c9daced7a5d2f1e0f7570ae1 initial commit of project diff -r 000000000000 -r 0d7a85ddac86 TriageRareGenotypes.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/TriageRareGenotypes.xml Wed Mar 25 13:44:41 2015 -0600 @@ -0,0 +1,28 @@ + + + + from combined cohort HGVS table + rare_triage -v + rare_triage $input_hgvs_table $output_novel_gene_table $output_rare_gene_table $rare_freq $long_gene_threshold $non_coding + + + + + + + + + + + + + + + + + + +This tool takes a combined HGVS table of genotype calls for a cohort, and generates two tables. The first lists genes with novel homozygous variants in all samples, allowing for locus heterogeneity. The second table lists genes with marginal calls (e.g. lots of genotyping caveats), or very rare minor allele frequencies in the general population. + + + diff -r 000000000000 -r 0d7a85ddac86 rare_triage --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rare_triage Wed Mar 25 13:44:41 2015 -0600 @@ -0,0 +1,321 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +my $trio = 0; +if(@ARGV == 1 and $ARGV[0] eq "-v"){ + print "Version 0.1"; exit; +} +if($ARGV[0] eq "-t"){ + shift @ARGV; + $trio = 1; +} + +@ARGV == 8 or @ARGV == 9 or die "Usage: $0 [-t(rio)] [known false positive novels.txt]\n"; + +my %known_fps; # a priori known false positives +if(@ARGV == 9){ + open(FP, $ARGV[8]) + or die "Cannot open $ARGV[8] for reading: $!\n"; + while(){ + next if /^\s*#/; + chomp; + my @F = split /\t/, $_; + $known_fps{"$F[0]:$F[1]"} = 1; + } + close(FP); + pop @ARGV; +} + +my $maybe_dominant = pop @ARGV; # i.e. should novel non-benign het calls be considered across the cohort? +$maybe_dominant = $maybe_dominant =~ /^(?:t(?:rue)?|1|y(?:es)?)/i; +my $num_allowed_missing = pop @ARGV; # report rare events even if not universal, as long as coverage is missing in the non-conforming samples +my $all_variants = pop @ARGV; # restrict to protein-coding only? +$all_variants = $all_variants =~ /^(?:t(?:rue)?|1|y(?:es)?)/i; +my $gene_length_threshold = pop @ARGV; +my $maf_threshold = pop @ARGV; + +my %gene_vars; +my %gene2chr; +my %gene2desc; +my %pos2info; +my %pos2hgvs; +my %sample_count; +open(IN, $ARGV[0]) + or die "Cannot open $ARGV[0] for reading: $!\n"; +open(NOVEL, ">$ARGV[1]") + or die "Cannot open $ARGV[1] for writing: $!\n"; +open(RARE, ">$ARGV[2]") + or die "Cannot open $ARGV[2] for writing: $!\n"; +; # header +while(){ + chomp; + my @F = split /\t/, $_; + my $chr = $F[0]; + my $pos = $F[1]; + next if exists $known_fps{"$chr:$pos"}; + my $var_depth = $F[6]; + my $total_depth = $F[7]; + my @rsid_list = split /,/, $F[10]; + my @dbsnp_ver_list = split /,/, $F[11]; + my @maf_list = split /,/, $F[12]; + my $rare_event = 0; + for (my $i = 0; $i <= $#maf_list; $i++){ + next if $maf_list[$i] eq "-"; + if($maf_list[$i] eq "NA" or $maf_list[$i] <= $maf_threshold){ + $rare_event = 1 if $rsid_list[$i] eq "novel" or $rsid_list[$i] =~ /^rs/ and $dbsnp_ver_list[$i] >= 132; +# print "$chr:$pos $maf_list[$i] $rsid_list[$i] $dbsnp_ver_list[$i] : $rare_event\n"; + last; + } + } + next unless $rare_event; + my $sift = $F[13]; + my $polyphen = $F[15]; + my $dna_hgvs = $F[18]; + next if $F[9] =~ /UTR/; + my $transcript_id = $F[19]; + my $transcript_length = $F[20]; + my $prot_hgvs = $F[21]; + next if $prot_hgvs =~ /=$/; + my $zygosity_list = $F[22]; + my $sample_list = $F[$#F]; + my $phase_list = $F[$#F-1]; + my $caveats = $F[$#F-2]; + my $desc = $F[$#F-3]; + my $tissues = $F[$#F-5]; + my $gene = $F[17]; + next if not defined $gene; + #for my $gene (split /;/, $gene_list){ + next if not $gene =~ /\S/; # only work with named genes + $gene2chr{$gene} = $chr if not exists $gene2chr{$gene}; + $gene2desc{$gene} = $desc if not exists $gene2desc{$gene}; + $pos2info{"$chr:$pos"} = [join(",",@maf_list), $sift, $polyphen, $caveats, $var_depth, $total_depth, $transcript_length, join(",",@rsid_list)] if not exists $pos2info{"$chr:$pos"}; + $pos2hgvs{"$chr:$pos"}->{$dna_hgvs} = [] if not exists $pos2hgvs{"$chr:$pos"}->{$dna_hgvs}; + $pos2hgvs{"$chr:$pos"}->{$prot_hgvs} = [] if not exists $pos2hgvs{"$chr:$pos"}->{$dna_hgvs}; + push @{$pos2hgvs{"$chr:$pos"}->{$dna_hgvs}}, $transcript_id; + push @{$pos2hgvs{"$chr:$pos"}->{$prot_hgvs}}, $transcript_id; + $gene_vars{$gene} = {} if not exists $gene_vars{$gene}; + my @zygosities = split /; /, $zygosity_list; + my @phases = split /; /, $phase_list; + my @samples = split /; /, $sample_list; + for(my $i = 0; $i <= $#samples; $i++){ + my $sample = $samples[$i]; + my $phase = $phases[$i]; + my $zygosity = $zygosities[$i]; + $gene_vars{$gene}->{$sample} = {} if not exists $gene_vars{$gene}->{$sample}; + $gene_vars{$gene}->{$sample}->{$pos} = [$zygosity, $phase]; + $sample_count{$sample}++; + # } + } +} +close(IN); + +print NOVEL "Gene Name\tChr/Pos\tIn samples\tEvent Type\tMAF\tSIFT\tPolyPhen2\tGenotyping Caveats\tTranscript Length\tdbSNP ID\tHGVS DNA\tHGVS AA\n"; +print RARE "Gene Name\tChr/Pos\tIn samples\tEvent Type\tMAF\tSIFT\tPolyPhen2\tGenotyping Caveats\tTranscript Length\tidbSNP ID\tHGVS DNA\tHGVS AA\n"; +my @sample_names = $trio ? qw(Father Mother Child) : sort keys %sample_count; +for my $gene (sort keys %gene_vars){ + #next unless scalar(keys %{$gene_vars{$gene}}) == scalar(@sample_names); # require complete gene penetrance + my $chr = $gene2chr{$gene}; + my %rare_phase; + my %rare_events; + my %rare_samples; + my %sample_counts; + # For trios, it's assumed to be in the order: father, mother, child + for my $sample_index (0..$#sample_names){ + my $sample = $sample_names[$sample_index]; + next unless exists $gene_vars{$gene}->{$sample}; + for my $pos (keys %{$gene_vars{$gene}->{$sample}}){ + next unless $all_variants or grep /p\.|c\.\d+_\d+\+\d|c\.\d+\+[12]_|c\.\d+-[12]_|c\.\d+-\d+_\d+$/, keys %{$pos2hgvs{"$chr:$pos"}}; + print "Checking $chr:$pos\n"; + # For now, only look for homo rare or compound het (known from explicit haplotype phasing data provided, or trio) + if($gene_vars{$gene}->{$sample}->{$pos}->[0] =~ /homo/){ + my $maf_list = $pos2info{"$chr:$pos"}->[0]; + for my $maf (split /,/, $maf_list){ + next if $maf eq "-"; + if($trio){ # only report if not homozyguous in either parent + next if exists $gene_vars{$gene}->{$sample_names[0]} and exists $gene_vars{$gene}->{$sample_names[0]}->{$pos} and $gene_vars{$gene}->{$sample_names[0]}->{$pos}->[0] =~ /homo/ or + exists $gene_vars{$gene}->{$sample_names[1]} and exists $gene_vars{$gene}->{$sample_names[1]}->{$pos} and $gene_vars{$gene}->{$sample_names[1]}->{$pos}->[0] =~ /homo/; + } + if($maf eq "NA"){ + if($pos2info{"$chr:$pos"}->[1] eq "1"){ # SIFT says its benign + $rare_events{$pos} = "benign homo novel variant"; + } + else{ + $rare_events{$pos} = "homo novel variant"; + } + $rare_samples{$pos} = [] unless exists $rare_samples{$pos}; + push @{$rare_samples{$pos}}, $sample; + $sample_counts{$sample}++; + last; + } + # if here, it's rare, not novel + next if is_benign(\%pos2info, $chr, $pos); + # if here, rare and maybe not benign + $rare_events{$pos} = "homo rare variant"; + $rare_samples{$pos} = [] unless exists $rare_samples{$pos}; + push @{$rare_samples{$pos}}, $sample; + $sample_counts{$sample}++; + } + } # end homo + # het novel (if dominant model enabled), possibly non-benign + elsif($maybe_dominant and $pos2info{"$chr:$pos"}->[0] =~ /NA/ and $pos2info{"$chr:$pos"}->[7] =~ /novel/){ + next if $pos2info{"$chr:$pos"}->[3] =~ /non-unique/; # or is_benign(\%pos2info, $chr, $pos); # has mapping caveat or is almost definitely benign + if($trio){ # only report if not present in either parent + next if exists $gene_vars{$gene}->{$sample_names[0]} and exists $gene_vars{$gene}->{$sample_names[0]}->{$pos} or + exists $gene_vars{$gene}->{$sample_names[1]} and exists $gene_vars{$gene}->{$sample_names[1]}->{$pos}; + } + $rare_events{$pos} = "het novel variant, possibly non-benign under dominant disease model"; + $rare_samples{$pos} = [] unless exists $rare_samples{$pos}; + push @{$rare_samples{$pos}}, $sample; + $sample_counts{$sample}++; + } + # check if two rare het variants are trans in a sample, using phasing info + # the phase info should look like + # A-chrX:134986768-134986769 + # B-chrX:134986768-134986769 + # or, in the case of trios we have + # Mother-GENENAME:1-length + # Father-GENENAME:1-length + # todo: handle case where more than one type of event happens at one position + elsif($trio or $gene_vars{$gene}->{$sample}->{$pos}->[1] =~ /^(\S+?)-(\S+):(\d+)-(\d+)/){ + #next if is_benign(\%pos2info, $chr, $pos); # don't care about benign rare hets + my $novel = $pos2info{"$chr:$pos"}->[0] =~ /NA/; + my $phase_group = $trio ? get_phase_group($gene_vars{$gene}->{$sample_names[0]}, $gene_vars{$gene}->{$sample_names[1]}, $pos) : $1; # if trio, need to determine of if the allele came from the father or mother + #next if $phase_group eq "Either"; + my $phase_chr = $trio ? $gene2chr{$gene} : $2; + my $phase_start = $trio ? 1 : $3; + my $phase_end = $trio ? $pos2info{"$chr:$pos"}->[6] : $4; + if(exists $rare_phase{"$phase_chr:$phase_start-$phase_end"}){ + my $other_group = $rare_phase{"$phase_chr:$phase_start-$phase_end"}->[0]; + next if $other_group eq $phase_group; # rare events on same allele...ignore for now as LD artifact (todo: revisit this???) + my $other_pos = $rare_phase{"$phase_chr:$phase_start-$phase_end"}->[1]; + my $other_novel = $rare_phase{"$phase_chr:$phase_start-$phase_end"}->[2]; + my $other_sample = $rare_phase{"$phase_chr:$phase_start-$phase_end"}->[3]; + $rare_events{$pos} = "het ".($novel ? "novel":"rare")." variant phased trans with het ".($other_novel ? "novel":"rare")." variant at $phase_chr:$other_pos"; + $rare_events{$other_pos} = "het ".($novel ? "novel":"rare")." variant phased trans with het ".($other_novel ? "novel":"rare")." variant at $phase_chr:$pos"; + $rare_samples{$pos} = [] unless exists $rare_samples{$pos}; + $rare_samples{$other_pos} = [] unless exists $rare_samples{$pos}; + push @{$rare_samples{$pos}}, $sample; + push @{$rare_samples{$other_pos}}, $other_sample; + # todo: could have more than two rare trans hets...how to report? (all reported aganist first one right now) + $sample_counts{$sample}++; + $sample_counts{$other_sample}++; + } + else{ + $rare_phase{"$phase_chr:$phase_start-$phase_end"} = [$phase_group, $pos, $novel, $sample]; # note as first rare allele, in case we find another one later in the same gene + } + } + } + } + # Don't have rare events in all samples? + next if $trio and not exists $sample_counts{"Child"}; + my $num_missing = scalar(@sample_names) - scalar(keys %sample_counts); + my $reason = ""; + if(not $trio and $num_missing){ + next if $num_allowed_missing < $num_missing; + # todo: check if missing from a sample due to low coverage + my @missing_list; + for my $name (@sample_names){ + push @missing_list, $name unless exists $sample_counts{$name}; + } + $reason = "Rare events for this gene are not found in all cohort samples (missing ". join(", ", @missing_list).") / "; + # next; + } + + my $novelty = 0; + my $num_caveats = 0; + my $assume_dominance = 0; + my $benign = 0; + my $gene_length; + my @protein_events; + for my $pos (sort {$a <=> $b} keys %rare_events){ + $novelty++ if $rare_events{$pos} eq "homo novel variant" or $rare_events{$pos} =~ /^het novel/; + $assume_dominance++ if $rare_events{$pos} =~ /dominant/; + $benign++ if $rare_events{$pos} =~ /benign/; + $num_caveats++ if $pos2info{"$chr:$pos"}->[3] =~ /\S/; + $gene_length = $pos2info{"$chr:$pos"}->[6] if not defined $gene_length; + my @dna_info; + my @prot_info; + for my $hgvs (sort keys %{$pos2hgvs{"$chr:$pos"}}){ + if($hgvs =~ /^p\./){ + push @prot_info, join(",", @{$pos2hgvs{"$chr:$pos"}->{$hgvs}}).":$hgvs"; + } + else{ + push @dna_info, join(",", @{$pos2hgvs{"$chr:$pos"}->{$hgvs}}).":$hgvs"; + } + } + push @protein_events, "\t$pos\t". join("/", @{$rare_samples{$pos}}). "\t". $rare_events{$pos}. "\t". join("\t", @{$pos2info{"$chr:$pos"}}). + "\t". join(";", @dna_info). "\t". join(";", @prot_info). "\n"; + } + if($novelty){ + if($gene_length > $gene_length_threshold){ + $reason .= "Exceptionally long gene, prone to random novel events at heterogeneous loci"; + } + elsif($num_caveats){ + if($num_caveats+1 >= @protein_events or $num_caveats >= 0.9*@protein_events){ + $reason .= "Most variant calls for this gene have caveats"; + } + elsif($assume_dominance){ + my $num_events += scalar(@protein_events)-$assume_dominance; + if($num_caveats+1 >= $num_events or $num_caveats >= 0.9*$num_events){ + $reason .= "Apart from novel hets, most variant calls for this gene have caveats"; + } + } + } + } + elsif($benign == @protein_events){ + $reason .= "The gene probably only contains benign variants"; + } + else{ + $reason .= "Variants found in the general population, albeit rarely"; + } + + # fix formatting if missing sample is the only problem + $reason =~ s( / $)(); + + if($novelty and not $reason){ + print NOVEL "$gene\t$gene2chr{$gene}\t$gene2desc{$gene}\n", @protein_events; + } + else{ + print RARE "$gene\t$gene2chr{$gene}\t$gene2desc{$gene}\t$reason\n", @protein_events; + } +} + +sub get_phase_group{ + my ($father_variants, $mother_variants, $pos) = @_; + + if(defined $father_variants and exists $father_variants->{$pos}){ + if(defined $mother_variants->{$pos} and exists $mother_variants->{$pos}){ + return "Either"; # in future, trio-based phasing that models meiotic recombination may help here (e.g PHASE). + } + else{ + return "Father"; + } + } + elsif(defined $mother_variants->{$pos} and exists $mother_variants->{$pos}){ + return "Mother"; + } + else{ + return "De_novo"; + } +} + +sub is_benign{ + my ($pos2info, $chr, $pos) = @_; + if($pos2info->{"$chr:$pos"}->[1] eq "1"){ + return 1; # sift thinks it's completely benign + } + elsif($pos2info->{"$chr:$pos"}->[1] eq "NA"){ + if($pos2info->{"$chr:$pos"}->[2] ne "benign"){ # sift doesn't know, and polyphen thinks it might be bad + return 0; + } + } + elsif($pos2info->{"$chr:$pos"}->[1] < 0.06){ # sift thinks it might be bad + return 0; + } + elsif($pos2info->{"$chr:$pos"}->[2] ne "benign"){ # polyphen thinks it might be bad + return 0; + } + return 1; # nobody thinks it's bad +}