view rare_triage @ 0:0d7a85ddac86 default tip

initial commit of project
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:44:41 -0600
parents
children
line wrap: on
line source

#!/usr/bin/env perl

use strict;
use warnings;

my $trio = 0;
if(@ARGV == 1 and $ARGV[0] eq "-v"){
  print "Version 0.1"; exit;
} 
if($ARGV[0] eq "-t"){
  shift @ARGV;
  $trio = 1;
} 

@ARGV == 8 or @ARGV == 9 or die "Usage: $0 [-t(rio)] <input hgvs annotated variant table.txt> <novel output.txt> <rare output.txt> <maf reporting threshold> <max. gene DNA length caveat cutoff> <report non-coding variants too (true|false)> <num samples allowed to be missing> <dominant model (true|false)> [known false positive novels.txt]\n";

my %known_fps; # a priori known false positives
if(@ARGV == 9){
  open(FP, $ARGV[8])
    or die "Cannot open $ARGV[8] for reading: $!\n";
  while(<FP>){
    next if /^\s*#/;
    chomp;
    my @F = split /\t/, $_;
    $known_fps{"$F[0]:$F[1]"} = 1;
  }
  close(FP);
  pop @ARGV;
}

my $maybe_dominant = pop @ARGV; # i.e. should novel non-benign het calls be considered across the cohort?
$maybe_dominant = $maybe_dominant =~ /^(?:t(?:rue)?|1|y(?:es)?)/i;
my $num_allowed_missing = pop @ARGV; # report rare events even if not universal, as long as coverage is missing in the non-conforming samples
my $all_variants = pop @ARGV; # restrict to protein-coding only?
$all_variants = $all_variants =~ /^(?:t(?:rue)?|1|y(?:es)?)/i;
my $gene_length_threshold = pop @ARGV;
my $maf_threshold = pop @ARGV;

my %gene_vars;
my %gene2chr;
my %gene2desc;
my %pos2info;
my %pos2hgvs;
my %sample_count;
open(IN, $ARGV[0])
  or die "Cannot open $ARGV[0] for reading: $!\n";
open(NOVEL, ">$ARGV[1]")
  or die "Cannot open $ARGV[1] for writing: $!\n";
open(RARE, ">$ARGV[2]")
  or die "Cannot open $ARGV[2] for writing: $!\n";
<IN>; # header
while(<IN>){
  chomp;
  my @F = split /\t/, $_;
  my $chr = $F[0];
  my $pos = $F[1];
  next if exists $known_fps{"$chr:$pos"};
  my $var_depth = $F[6];
  my $total_depth = $F[7];
  my @rsid_list = split /,/, $F[10];
  my @dbsnp_ver_list = split /,/, $F[11];
  my @maf_list = split /,/, $F[12];
  my $rare_event = 0;
  for (my $i = 0; $i <= $#maf_list; $i++){
    next if $maf_list[$i] eq "-";
    if($maf_list[$i] eq "NA" or $maf_list[$i] <= $maf_threshold){ 
      $rare_event = 1 if $rsid_list[$i] eq "novel" or $rsid_list[$i] =~ /^rs/ and $dbsnp_ver_list[$i] >= 132;
#      print "$chr:$pos $maf_list[$i] $rsid_list[$i] $dbsnp_ver_list[$i] : $rare_event\n";
      last;
    }
  }
  next unless $rare_event;
  my $sift = $F[13];
  my $polyphen = $F[15];
  my $dna_hgvs = $F[18];
  next if $F[9] =~ /UTR/;
  my $transcript_id = $F[19];
  my $transcript_length = $F[20];
  my $prot_hgvs = $F[21];
  next if $prot_hgvs =~ /=$/;
  my $zygosity_list = $F[22];
  my $sample_list = $F[$#F];
  my $phase_list = $F[$#F-1];
  my $caveats = $F[$#F-2];
  my $desc = $F[$#F-3];
  my $tissues = $F[$#F-5];
  my $gene = $F[17];
  next if not defined $gene;
  #for my $gene (split /;/, $gene_list){
  next if not $gene =~ /\S/; # only work with named genes
    $gene2chr{$gene} = $chr if not exists $gene2chr{$gene};
    $gene2desc{$gene} = $desc if not exists $gene2desc{$gene};
    $pos2info{"$chr:$pos"} = [join(",",@maf_list), $sift, $polyphen, $caveats, $var_depth, $total_depth, $transcript_length, join(",",@rsid_list)] if not exists $pos2info{"$chr:$pos"};
    $pos2hgvs{"$chr:$pos"}->{$dna_hgvs} = [] if not exists $pos2hgvs{"$chr:$pos"}->{$dna_hgvs};
    $pos2hgvs{"$chr:$pos"}->{$prot_hgvs} = [] if not exists $pos2hgvs{"$chr:$pos"}->{$dna_hgvs};
    push @{$pos2hgvs{"$chr:$pos"}->{$dna_hgvs}}, $transcript_id;
    push @{$pos2hgvs{"$chr:$pos"}->{$prot_hgvs}}, $transcript_id;
    $gene_vars{$gene} = {} if not exists $gene_vars{$gene};
    my @zygosities = split /; /, $zygosity_list;
    my @phases = split /; /, $phase_list;
    my @samples = split /; /, $sample_list;
    for(my $i = 0; $i <= $#samples; $i++){
      my $sample = $samples[$i];
      my $phase = $phases[$i];
      my $zygosity = $zygosities[$i];
      $gene_vars{$gene}->{$sample} = {} if not exists $gene_vars{$gene}->{$sample};
      $gene_vars{$gene}->{$sample}->{$pos} = [$zygosity, $phase];
      $sample_count{$sample}++;
  #  }
 }
}
close(IN);

print NOVEL "Gene Name\tChr/Pos\tIn samples\tEvent Type\tMAF\tSIFT\tPolyPhen2\tGenotyping Caveats\tTranscript Length\tdbSNP ID\tHGVS DNA\tHGVS AA\n";
print RARE "Gene Name\tChr/Pos\tIn samples\tEvent Type\tMAF\tSIFT\tPolyPhen2\tGenotyping Caveats\tTranscript Length\tidbSNP ID\tHGVS DNA\tHGVS AA\n";
my @sample_names = $trio ? qw(Father Mother Child) : sort keys %sample_count;
for my $gene (sort keys %gene_vars){
  #next unless scalar(keys %{$gene_vars{$gene}}) == scalar(@sample_names); # require complete gene penetrance
  my $chr = $gene2chr{$gene};
  my %rare_phase;
  my %rare_events;
  my %rare_samples;
  my %sample_counts;
  # For trios, it's assumed to be in the order: father, mother, child
  for my $sample_index (0..$#sample_names){
    my $sample = $sample_names[$sample_index];
    next unless exists $gene_vars{$gene}->{$sample}; 
    for my $pos (keys %{$gene_vars{$gene}->{$sample}}){
      next unless $all_variants or grep /p\.|c\.\d+_\d+\+\d|c\.\d+\+[12]_|c\.\d+-[12]_|c\.\d+-\d+_\d+$/, keys %{$pos2hgvs{"$chr:$pos"}};
      print "Checking $chr:$pos\n";
      # For now, only look for homo rare or compound het (known from explicit haplotype phasing data provided, or trio)
      if($gene_vars{$gene}->{$sample}->{$pos}->[0] =~ /homo/){ 
        my $maf_list = $pos2info{"$chr:$pos"}->[0];
        for my $maf (split /,/, $maf_list){
          next if $maf eq "-";
          if($trio){ # only report if not homozyguous in either parent
            next if exists $gene_vars{$gene}->{$sample_names[0]} and exists $gene_vars{$gene}->{$sample_names[0]}->{$pos} and $gene_vars{$gene}->{$sample_names[0]}->{$pos}->[0] =~ /homo/ or 
                    exists $gene_vars{$gene}->{$sample_names[1]} and exists $gene_vars{$gene}->{$sample_names[1]}->{$pos} and $gene_vars{$gene}->{$sample_names[1]}->{$pos}->[0] =~ /homo/;
          }
          if($maf eq "NA"){
            if($pos2info{"$chr:$pos"}->[1] eq "1"){ # SIFT says its benign
              $rare_events{$pos} = "benign homo novel variant";
            }
            else{
              $rare_events{$pos} = "homo novel variant";
            }
            $rare_samples{$pos} = [] unless exists $rare_samples{$pos};
            push @{$rare_samples{$pos}}, $sample;
            $sample_counts{$sample}++;
            last;
          }
          # if here, it's rare, not novel
          next if is_benign(\%pos2info, $chr, $pos);
          # if here, rare and maybe not benign
          $rare_events{$pos} = "homo rare variant";
          $rare_samples{$pos} = [] unless exists $rare_samples{$pos};
          push @{$rare_samples{$pos}}, $sample;
          $sample_counts{$sample}++;
        }
      } # end homo
      # het novel (if dominant model enabled), possibly non-benign
      elsif($maybe_dominant and $pos2info{"$chr:$pos"}->[0] =~ /NA/ and $pos2info{"$chr:$pos"}->[7] =~ /novel/){
        next if $pos2info{"$chr:$pos"}->[3] =~ /non-unique/; # or is_benign(\%pos2info, $chr, $pos); # has mapping caveat or is almost definitely benign
        if($trio){ # only report if not present in either parent
            next if exists $gene_vars{$gene}->{$sample_names[0]} and exists $gene_vars{$gene}->{$sample_names[0]}->{$pos} or  
                    exists $gene_vars{$gene}->{$sample_names[1]} and exists $gene_vars{$gene}->{$sample_names[1]}->{$pos};
        }
        $rare_events{$pos} = "het novel variant, possibly non-benign under dominant disease model";
        $rare_samples{$pos} = [] unless exists $rare_samples{$pos};
        push @{$rare_samples{$pos}}, $sample;
        $sample_counts{$sample}++;
      }
      # check if two rare het variants are trans in a sample, using phasing info
      # the phase info should look like 
      # A-chrX:134986768-134986769
      # B-chrX:134986768-134986769
      # or, in the case of trios we have
      # Mother-GENENAME:1-length
      # Father-GENENAME:1-length
      # todo: handle case where more than one type of event happens at one position
      elsif($trio or $gene_vars{$gene}->{$sample}->{$pos}->[1] =~ /^(\S+?)-(\S+):(\d+)-(\d+)/){
        #next if is_benign(\%pos2info, $chr, $pos); # don't care about benign rare hets
        my $novel = $pos2info{"$chr:$pos"}->[0] =~ /NA/;
        my $phase_group = $trio ? get_phase_group($gene_vars{$gene}->{$sample_names[0]}, $gene_vars{$gene}->{$sample_names[1]}, $pos) : $1; # if trio, need to determine of if the allele came from the father or mother
        #next if $phase_group eq "Either";
        my $phase_chr = $trio ? $gene2chr{$gene} : $2;
        my $phase_start = $trio ? 1 : $3;
        my $phase_end = $trio ? $pos2info{"$chr:$pos"}->[6] : $4;
        if(exists $rare_phase{"$phase_chr:$phase_start-$phase_end"}){
          my $other_group = $rare_phase{"$phase_chr:$phase_start-$phase_end"}->[0];
          next if $other_group eq $phase_group; # rare events on same allele...ignore for now as LD artifact (todo: revisit this???)
          my $other_pos = $rare_phase{"$phase_chr:$phase_start-$phase_end"}->[1];
          my $other_novel = $rare_phase{"$phase_chr:$phase_start-$phase_end"}->[2];
          my $other_sample = $rare_phase{"$phase_chr:$phase_start-$phase_end"}->[3];
          $rare_events{$pos} = "het ".($novel ? "novel":"rare")." variant phased trans with het ".($other_novel ? "novel":"rare")." variant at $phase_chr:$other_pos";
          $rare_events{$other_pos} = "het ".($novel ? "novel":"rare")." variant phased trans with het ".($other_novel ? "novel":"rare")." variant at $phase_chr:$pos";
          $rare_samples{$pos} = [] unless exists $rare_samples{$pos};
          $rare_samples{$other_pos} = [] unless exists $rare_samples{$pos};
          push @{$rare_samples{$pos}}, $sample;
          push @{$rare_samples{$other_pos}}, $other_sample;
          # todo: could have more than two rare trans hets...how to report? (all reported aganist first one right now)
          $sample_counts{$sample}++;
          $sample_counts{$other_sample}++;
        }
        else{
          $rare_phase{"$phase_chr:$phase_start-$phase_end"} = [$phase_group, $pos, $novel, $sample];  # note as first rare allele, in case we find another one later in the same gene
        }
      }
    }
  }
  # Don't have rare events in all samples?
  next if $trio and not exists $sample_counts{"Child"};
  my $num_missing = scalar(@sample_names) - scalar(keys %sample_counts);
  my $reason = "";
  if(not $trio and $num_missing){
    next if $num_allowed_missing < $num_missing;
    # todo: check if missing from a sample due to low coverage
    my @missing_list;
    for my $name (@sample_names){
      push @missing_list, $name unless exists $sample_counts{$name};
    }
    $reason = "Rare events for this gene are not found in all cohort samples (missing ". join(", ", @missing_list).") / ";
    # next;
  }

  my $novelty = 0;
  my $num_caveats = 0;
  my $assume_dominance = 0;
  my $benign = 0;
  my $gene_length;
  my @protein_events;
  for my $pos (sort {$a <=> $b} keys %rare_events){
    $novelty++ if $rare_events{$pos} eq "homo novel variant" or $rare_events{$pos} =~ /^het novel/; 
    $assume_dominance++ if $rare_events{$pos} =~ /dominant/; 
    $benign++ if $rare_events{$pos} =~ /benign/; 
    $num_caveats++ if $pos2info{"$chr:$pos"}->[3] =~ /\S/; 
    $gene_length = $pos2info{"$chr:$pos"}->[6] if not defined $gene_length; 
    my @dna_info;
    my @prot_info;
    for my $hgvs (sort keys %{$pos2hgvs{"$chr:$pos"}}){
      if($hgvs =~ /^p\./){
        push @prot_info, join(",", @{$pos2hgvs{"$chr:$pos"}->{$hgvs}}).":$hgvs";
      }
      else{
        push @dna_info, join(",", @{$pos2hgvs{"$chr:$pos"}->{$hgvs}}).":$hgvs";
      }
    }
    push @protein_events, "\t$pos\t". join("/", @{$rare_samples{$pos}}). "\t". $rare_events{$pos}. "\t". join("\t", @{$pos2info{"$chr:$pos"}}). 
                                "\t". join(";", @dna_info). "\t". join(";", @prot_info). "\n";
  }
  if($novelty){
    if($gene_length > $gene_length_threshold){
      $reason .= "Exceptionally long gene, prone to random novel events at heterogeneous loci";
    }
    elsif($num_caveats){
      if($num_caveats+1 >= @protein_events or $num_caveats >= 0.9*@protein_events){
        $reason .= "Most variant calls for this gene have caveats";
      }
      elsif($assume_dominance){
        my $num_events += scalar(@protein_events)-$assume_dominance;
        if($num_caveats+1 >= $num_events or $num_caveats >= 0.9*$num_events){
          $reason .= "Apart from novel hets, most variant calls for this gene have caveats";
        }
      }
    }
  }
  elsif($benign == @protein_events){
    $reason .= "The gene probably only contains benign variants";
  }
  else{
    $reason .= "Variants found in the general population, albeit rarely";
  }

  # fix formatting if missing sample is the only problem
  $reason =~ s( / $)();

  if($novelty and not $reason){
    print NOVEL "$gene\t$gene2chr{$gene}\t$gene2desc{$gene}\n", @protein_events;
  }
  else{
    print RARE "$gene\t$gene2chr{$gene}\t$gene2desc{$gene}\t$reason\n", @protein_events;
  }
}

sub get_phase_group{
  my ($father_variants, $mother_variants, $pos) = @_;

  if(defined $father_variants and exists $father_variants->{$pos}){
    if(defined $mother_variants->{$pos} and exists $mother_variants->{$pos}){
      return "Either"; # in future, trio-based phasing that models meiotic recombination may help here (e.g PHASE).
    }
    else{
      return "Father";
    }
  }
  elsif(defined $mother_variants->{$pos} and exists $mother_variants->{$pos}){
    return "Mother";
  }
  else{
    return "De_novo";
  }
}

sub is_benign{
  my ($pos2info, $chr, $pos) = @_;
  if($pos2info->{"$chr:$pos"}->[1] eq "1"){
    return 1; # sift thinks it's completely benign
  }
  elsif($pos2info->{"$chr:$pos"}->[1] eq "NA"){
    if($pos2info->{"$chr:$pos"}->[2] ne "benign"){ # sift doesn't know, and polyphen thinks it might be bad
      return 0;
    }
  }
  elsif($pos2info->{"$chr:$pos"}->[1] < 0.06){ # sift thinks it might be bad
    return 0;
  }
  elsif($pos2info->{"$chr:$pos"}->[2] ne "benign"){ # polyphen thinks it might be bad
    return 0;
  }
  return 1; # nobody thinks it's bad
}