Mercurial > repos > yusuf > triage_rare_genotypes
view rare_triage @ 0:0d7a85ddac86 default tip
initial commit of project
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:44:41 -0600 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env perl use strict; use warnings; my $trio = 0; if(@ARGV == 1 and $ARGV[0] eq "-v"){ print "Version 0.1"; exit; } if($ARGV[0] eq "-t"){ shift @ARGV; $trio = 1; } @ARGV == 8 or @ARGV == 9 or die "Usage: $0 [-t(rio)] <input hgvs annotated variant table.txt> <novel output.txt> <rare output.txt> <maf reporting threshold> <max. gene DNA length caveat cutoff> <report non-coding variants too (true|false)> <num samples allowed to be missing> <dominant model (true|false)> [known false positive novels.txt]\n"; my %known_fps; # a priori known false positives if(@ARGV == 9){ open(FP, $ARGV[8]) or die "Cannot open $ARGV[8] for reading: $!\n"; while(<FP>){ next if /^\s*#/; chomp; my @F = split /\t/, $_; $known_fps{"$F[0]:$F[1]"} = 1; } close(FP); pop @ARGV; } my $maybe_dominant = pop @ARGV; # i.e. should novel non-benign het calls be considered across the cohort? $maybe_dominant = $maybe_dominant =~ /^(?:t(?:rue)?|1|y(?:es)?)/i; my $num_allowed_missing = pop @ARGV; # report rare events even if not universal, as long as coverage is missing in the non-conforming samples my $all_variants = pop @ARGV; # restrict to protein-coding only? $all_variants = $all_variants =~ /^(?:t(?:rue)?|1|y(?:es)?)/i; my $gene_length_threshold = pop @ARGV; my $maf_threshold = pop @ARGV; my %gene_vars; my %gene2chr; my %gene2desc; my %pos2info; my %pos2hgvs; my %sample_count; open(IN, $ARGV[0]) or die "Cannot open $ARGV[0] for reading: $!\n"; open(NOVEL, ">$ARGV[1]") or die "Cannot open $ARGV[1] for writing: $!\n"; open(RARE, ">$ARGV[2]") or die "Cannot open $ARGV[2] for writing: $!\n"; <IN>; # header while(<IN>){ chomp; my @F = split /\t/, $_; my $chr = $F[0]; my $pos = $F[1]; next if exists $known_fps{"$chr:$pos"}; my $var_depth = $F[6]; my $total_depth = $F[7]; my @rsid_list = split /,/, $F[10]; my @dbsnp_ver_list = split /,/, $F[11]; my @maf_list = split /,/, $F[12]; my $rare_event = 0; for (my $i = 0; $i <= $#maf_list; $i++){ next if $maf_list[$i] eq "-"; if($maf_list[$i] eq "NA" or $maf_list[$i] <= $maf_threshold){ $rare_event = 1 if $rsid_list[$i] eq "novel" or $rsid_list[$i] =~ /^rs/ and $dbsnp_ver_list[$i] >= 132; # print "$chr:$pos $maf_list[$i] $rsid_list[$i] $dbsnp_ver_list[$i] : $rare_event\n"; last; } } next unless $rare_event; my $sift = $F[13]; my $polyphen = $F[15]; my $dna_hgvs = $F[18]; next if $F[9] =~ /UTR/; my $transcript_id = $F[19]; my $transcript_length = $F[20]; my $prot_hgvs = $F[21]; next if $prot_hgvs =~ /=$/; my $zygosity_list = $F[22]; my $sample_list = $F[$#F]; my $phase_list = $F[$#F-1]; my $caveats = $F[$#F-2]; my $desc = $F[$#F-3]; my $tissues = $F[$#F-5]; my $gene = $F[17]; next if not defined $gene; #for my $gene (split /;/, $gene_list){ next if not $gene =~ /\S/; # only work with named genes $gene2chr{$gene} = $chr if not exists $gene2chr{$gene}; $gene2desc{$gene} = $desc if not exists $gene2desc{$gene}; $pos2info{"$chr:$pos"} = [join(",",@maf_list), $sift, $polyphen, $caveats, $var_depth, $total_depth, $transcript_length, join(",",@rsid_list)] if not exists $pos2info{"$chr:$pos"}; $pos2hgvs{"$chr:$pos"}->{$dna_hgvs} = [] if not exists $pos2hgvs{"$chr:$pos"}->{$dna_hgvs}; $pos2hgvs{"$chr:$pos"}->{$prot_hgvs} = [] if not exists $pos2hgvs{"$chr:$pos"}->{$dna_hgvs}; push @{$pos2hgvs{"$chr:$pos"}->{$dna_hgvs}}, $transcript_id; push @{$pos2hgvs{"$chr:$pos"}->{$prot_hgvs}}, $transcript_id; $gene_vars{$gene} = {} if not exists $gene_vars{$gene}; my @zygosities = split /; /, $zygosity_list; my @phases = split /; /, $phase_list; my @samples = split /; /, $sample_list; for(my $i = 0; $i <= $#samples; $i++){ my $sample = $samples[$i]; my $phase = $phases[$i]; my $zygosity = $zygosities[$i]; $gene_vars{$gene}->{$sample} = {} if not exists $gene_vars{$gene}->{$sample}; $gene_vars{$gene}->{$sample}->{$pos} = [$zygosity, $phase]; $sample_count{$sample}++; # } } } close(IN); print NOVEL "Gene Name\tChr/Pos\tIn samples\tEvent Type\tMAF\tSIFT\tPolyPhen2\tGenotyping Caveats\tTranscript Length\tdbSNP ID\tHGVS DNA\tHGVS AA\n"; print RARE "Gene Name\tChr/Pos\tIn samples\tEvent Type\tMAF\tSIFT\tPolyPhen2\tGenotyping Caveats\tTranscript Length\tidbSNP ID\tHGVS DNA\tHGVS AA\n"; my @sample_names = $trio ? qw(Father Mother Child) : sort keys %sample_count; for my $gene (sort keys %gene_vars){ #next unless scalar(keys %{$gene_vars{$gene}}) == scalar(@sample_names); # require complete gene penetrance my $chr = $gene2chr{$gene}; my %rare_phase; my %rare_events; my %rare_samples; my %sample_counts; # For trios, it's assumed to be in the order: father, mother, child for my $sample_index (0..$#sample_names){ my $sample = $sample_names[$sample_index]; next unless exists $gene_vars{$gene}->{$sample}; for my $pos (keys %{$gene_vars{$gene}->{$sample}}){ next unless $all_variants or grep /p\.|c\.\d+_\d+\+\d|c\.\d+\+[12]_|c\.\d+-[12]_|c\.\d+-\d+_\d+$/, keys %{$pos2hgvs{"$chr:$pos"}}; print "Checking $chr:$pos\n"; # For now, only look for homo rare or compound het (known from explicit haplotype phasing data provided, or trio) if($gene_vars{$gene}->{$sample}->{$pos}->[0] =~ /homo/){ my $maf_list = $pos2info{"$chr:$pos"}->[0]; for my $maf (split /,/, $maf_list){ next if $maf eq "-"; if($trio){ # only report if not homozyguous in either parent next if exists $gene_vars{$gene}->{$sample_names[0]} and exists $gene_vars{$gene}->{$sample_names[0]}->{$pos} and $gene_vars{$gene}->{$sample_names[0]}->{$pos}->[0] =~ /homo/ or exists $gene_vars{$gene}->{$sample_names[1]} and exists $gene_vars{$gene}->{$sample_names[1]}->{$pos} and $gene_vars{$gene}->{$sample_names[1]}->{$pos}->[0] =~ /homo/; } if($maf eq "NA"){ if($pos2info{"$chr:$pos"}->[1] eq "1"){ # SIFT says its benign $rare_events{$pos} = "benign homo novel variant"; } else{ $rare_events{$pos} = "homo novel variant"; } $rare_samples{$pos} = [] unless exists $rare_samples{$pos}; push @{$rare_samples{$pos}}, $sample; $sample_counts{$sample}++; last; } # if here, it's rare, not novel next if is_benign(\%pos2info, $chr, $pos); # if here, rare and maybe not benign $rare_events{$pos} = "homo rare variant"; $rare_samples{$pos} = [] unless exists $rare_samples{$pos}; push @{$rare_samples{$pos}}, $sample; $sample_counts{$sample}++; } } # end homo # het novel (if dominant model enabled), possibly non-benign elsif($maybe_dominant and $pos2info{"$chr:$pos"}->[0] =~ /NA/ and $pos2info{"$chr:$pos"}->[7] =~ /novel/){ next if $pos2info{"$chr:$pos"}->[3] =~ /non-unique/; # or is_benign(\%pos2info, $chr, $pos); # has mapping caveat or is almost definitely benign if($trio){ # only report if not present in either parent next if exists $gene_vars{$gene}->{$sample_names[0]} and exists $gene_vars{$gene}->{$sample_names[0]}->{$pos} or exists $gene_vars{$gene}->{$sample_names[1]} and exists $gene_vars{$gene}->{$sample_names[1]}->{$pos}; } $rare_events{$pos} = "het novel variant, possibly non-benign under dominant disease model"; $rare_samples{$pos} = [] unless exists $rare_samples{$pos}; push @{$rare_samples{$pos}}, $sample; $sample_counts{$sample}++; } # check if two rare het variants are trans in a sample, using phasing info # the phase info should look like # A-chrX:134986768-134986769 # B-chrX:134986768-134986769 # or, in the case of trios we have # Mother-GENENAME:1-length # Father-GENENAME:1-length # todo: handle case where more than one type of event happens at one position elsif($trio or $gene_vars{$gene}->{$sample}->{$pos}->[1] =~ /^(\S+?)-(\S+):(\d+)-(\d+)/){ #next if is_benign(\%pos2info, $chr, $pos); # don't care about benign rare hets my $novel = $pos2info{"$chr:$pos"}->[0] =~ /NA/; my $phase_group = $trio ? get_phase_group($gene_vars{$gene}->{$sample_names[0]}, $gene_vars{$gene}->{$sample_names[1]}, $pos) : $1; # if trio, need to determine of if the allele came from the father or mother #next if $phase_group eq "Either"; my $phase_chr = $trio ? $gene2chr{$gene} : $2; my $phase_start = $trio ? 1 : $3; my $phase_end = $trio ? $pos2info{"$chr:$pos"}->[6] : $4; if(exists $rare_phase{"$phase_chr:$phase_start-$phase_end"}){ my $other_group = $rare_phase{"$phase_chr:$phase_start-$phase_end"}->[0]; next if $other_group eq $phase_group; # rare events on same allele...ignore for now as LD artifact (todo: revisit this???) my $other_pos = $rare_phase{"$phase_chr:$phase_start-$phase_end"}->[1]; my $other_novel = $rare_phase{"$phase_chr:$phase_start-$phase_end"}->[2]; my $other_sample = $rare_phase{"$phase_chr:$phase_start-$phase_end"}->[3]; $rare_events{$pos} = "het ".($novel ? "novel":"rare")." variant phased trans with het ".($other_novel ? "novel":"rare")." variant at $phase_chr:$other_pos"; $rare_events{$other_pos} = "het ".($novel ? "novel":"rare")." variant phased trans with het ".($other_novel ? "novel":"rare")." variant at $phase_chr:$pos"; $rare_samples{$pos} = [] unless exists $rare_samples{$pos}; $rare_samples{$other_pos} = [] unless exists $rare_samples{$pos}; push @{$rare_samples{$pos}}, $sample; push @{$rare_samples{$other_pos}}, $other_sample; # todo: could have more than two rare trans hets...how to report? (all reported aganist first one right now) $sample_counts{$sample}++; $sample_counts{$other_sample}++; } else{ $rare_phase{"$phase_chr:$phase_start-$phase_end"} = [$phase_group, $pos, $novel, $sample]; # note as first rare allele, in case we find another one later in the same gene } } } } # Don't have rare events in all samples? next if $trio and not exists $sample_counts{"Child"}; my $num_missing = scalar(@sample_names) - scalar(keys %sample_counts); my $reason = ""; if(not $trio and $num_missing){ next if $num_allowed_missing < $num_missing; # todo: check if missing from a sample due to low coverage my @missing_list; for my $name (@sample_names){ push @missing_list, $name unless exists $sample_counts{$name}; } $reason = "Rare events for this gene are not found in all cohort samples (missing ". join(", ", @missing_list).") / "; # next; } my $novelty = 0; my $num_caveats = 0; my $assume_dominance = 0; my $benign = 0; my $gene_length; my @protein_events; for my $pos (sort {$a <=> $b} keys %rare_events){ $novelty++ if $rare_events{$pos} eq "homo novel variant" or $rare_events{$pos} =~ /^het novel/; $assume_dominance++ if $rare_events{$pos} =~ /dominant/; $benign++ if $rare_events{$pos} =~ /benign/; $num_caveats++ if $pos2info{"$chr:$pos"}->[3] =~ /\S/; $gene_length = $pos2info{"$chr:$pos"}->[6] if not defined $gene_length; my @dna_info; my @prot_info; for my $hgvs (sort keys %{$pos2hgvs{"$chr:$pos"}}){ if($hgvs =~ /^p\./){ push @prot_info, join(",", @{$pos2hgvs{"$chr:$pos"}->{$hgvs}}).":$hgvs"; } else{ push @dna_info, join(",", @{$pos2hgvs{"$chr:$pos"}->{$hgvs}}).":$hgvs"; } } push @protein_events, "\t$pos\t". join("/", @{$rare_samples{$pos}}). "\t". $rare_events{$pos}. "\t". join("\t", @{$pos2info{"$chr:$pos"}}). "\t". join(";", @dna_info). "\t". join(";", @prot_info). "\n"; } if($novelty){ if($gene_length > $gene_length_threshold){ $reason .= "Exceptionally long gene, prone to random novel events at heterogeneous loci"; } elsif($num_caveats){ if($num_caveats+1 >= @protein_events or $num_caveats >= 0.9*@protein_events){ $reason .= "Most variant calls for this gene have caveats"; } elsif($assume_dominance){ my $num_events += scalar(@protein_events)-$assume_dominance; if($num_caveats+1 >= $num_events or $num_caveats >= 0.9*$num_events){ $reason .= "Apart from novel hets, most variant calls for this gene have caveats"; } } } } elsif($benign == @protein_events){ $reason .= "The gene probably only contains benign variants"; } else{ $reason .= "Variants found in the general population, albeit rarely"; } # fix formatting if missing sample is the only problem $reason =~ s( / $)(); if($novelty and not $reason){ print NOVEL "$gene\t$gene2chr{$gene}\t$gene2desc{$gene}\n", @protein_events; } else{ print RARE "$gene\t$gene2chr{$gene}\t$gene2desc{$gene}\t$reason\n", @protein_events; } } sub get_phase_group{ my ($father_variants, $mother_variants, $pos) = @_; if(defined $father_variants and exists $father_variants->{$pos}){ if(defined $mother_variants->{$pos} and exists $mother_variants->{$pos}){ return "Either"; # in future, trio-based phasing that models meiotic recombination may help here (e.g PHASE). } else{ return "Father"; } } elsif(defined $mother_variants->{$pos} and exists $mother_variants->{$pos}){ return "Mother"; } else{ return "De_novo"; } } sub is_benign{ my ($pos2info, $chr, $pos) = @_; if($pos2info->{"$chr:$pos"}->[1] eq "1"){ return 1; # sift thinks it's completely benign } elsif($pos2info->{"$chr:$pos"}->[1] eq "NA"){ if($pos2info->{"$chr:$pos"}->[2] ne "benign"){ # sift doesn't know, and polyphen thinks it might be bad return 0; } } elsif($pos2info->{"$chr:$pos"}->[1] < 0.06){ # sift thinks it might be bad return 0; } elsif($pos2info->{"$chr:$pos"}->[2] ne "benign"){ # polyphen thinks it might be bad return 0; } return 1; # nobody thinks it's bad }