view hgvs_table_annotate @ 0:9977d1935a07 default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:10:47 -0600
parents
children
line wrap: on
line source

#!/usr/bin/env perl

use Bio::DB::Sam;
use DBI;
use IPC::Open2;
use Set::IntervalTree;
use strict;
use warnings;
use vars qw($quiet %chr2variant_locs %polyphen_info %chr2polyphen_vcf_lines %chr2gerp_vcf_lines %range2genes $fasta_index $UNAFFECTED $tmpfile);
use File::Basename;
$UNAFFECTED = -299999; # sentinel for splice site predictions
my $current_directory = dirname(__FILE__);
if(@ARGV == 1 and $ARGV[0] eq "-v"){
  print "Version 1.0\n";
  exit;
}

#configuration file stuff
my %config;
my $tool_dir = shift @ARGV;
if(not -e "$tool_dir/annotate_hgvs.loc"){
  system("cp $current_directory/tool-data/annotate_hgvs.loc $tool_dir/annotate_hgvs.loc");
}
open CONFIG, '<', "$tool_dir/annotate_hgvs.loc";
while(<CONFIG>){
  next if $_ =~ /^#/;
  (my $key, my $value) = split(/\s+/,$_);
  $config{$key} = $value;
}
close CONFIG;
my $dbs_dir = $config{"dbs_dir"};

$quiet = 0;
if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){
  $quiet = 1;
  shift @ARGV;
}

@ARGV == 9 or @ARGV == 10 or die "Usage: $0 -q <SIFT dir> <POLYPHEN2 file> <GERP dir> <tissuedb file> <pathway file> <interpro domains.bed> <omim morbidmap> <input hgvs table.txt> <output hgvs table.annotated.txt> [reference.fasta]\n Where if the reference fasta is provided, we predict cryptic splicing effects (using MaxEntScan and GeneSplicer)\n";

my $sift_dir = $dbs_dir . shift @ARGV;
my $polyphen_file = $dbs_dir . shift @ARGV;
my $gerp_dir = $dbs_dir . shift @ARGV;
my $tissue_file = $dbs_dir . shift @ARGV;
my $pathway_file = $dbs_dir. shift @ARGV;
my $interpro_bed_file = $dbs_dir . shift @ARGV;
my $morbidmap_file = $dbs_dir . shift @ARGV;
my $hgvs_table_file = shift @ARGV;
my $outfile = shift @ARGV;
my $predict_cryptic_splicings = @ARGV ?  ($dbs_dir . shift @ARGV) : undef;
my $flanking_bases = 500; # assume this is ROI around a gene
my @lines;
 
sub polyphen_gerp_info($$$){
    my($gerp_dir,$polyphen_file,$info_key) = @_;

    my($chr,$pos,$ref,$variant) = split /:/, $info_key;

    # is this the first call for this chromosome? If so, retrieve the VCF lines for it en masse
    if(not exists $chr2polyphen_vcf_lines{$chr}){
      ($chr2polyphen_vcf_lines{$chr}, $chr2gerp_vcf_lines{$chr}) = retrieve_vcf_lines($gerp_dir,$polyphen_file,$chr);
    }

    my @results;
    if(not $ref or not $variant){ # can only get data for SNPs
        @results = qw(NA NA NA);
    }
    #print STDERR "ref = $ref and variant = $variant for  $info_key\n";
    elsif(length($ref) == 1 and length($variant) == 1){
      #print STDERR "Grabbing poly/gerp for $chr,$pos,$ref,$variant\n";
      @results = polyphen_info($chr,$pos,$variant);
      $results[2] = gerp_info($chr,$pos);
    }

    # It may be that a novel MNP variant is a bunch of known SNPs in a row.  In this case, 
    # report the list of variants
    elsif(length($ref) == length($variant) and $ref ne "NA"){
        my (@poly_scores, @poly_calls, @gerp_scores);
        for(my $i = 0; $i < length($ref); $i++){
            my $refbase = substr($ref, $i, 1);
            my $newbase = substr($variant, $i, 1);
            if($newbase eq $refbase){
              push @poly_scores, "-";
              push @poly_calls, "-";
              push @gerp_scores, "-";
              next;
            }
            my @base_results = polyphen_info($chr,$pos+$i,$refbase,$newbase);
            push @poly_scores, $base_results[0];
            push @poly_calls, $base_results[1];
            $base_results[2] = gerp_info($chr,$pos+$i);
            push @gerp_scores, $base_results[2];
        }
        $results[0] = join(",", @poly_scores); 
        $results[1] = join(",", @poly_calls); 
        $results[2] = join(",", @gerp_scores); 
    }
    else{
        @results = qw(NA NA NA);
    }

    return @results;
}

sub get_variant_key($$$$){
  # params chr pos ref variant
  my $c = $_[0];
  $c =~ s/^chr//;
  my $prop_info_key = join(":", $c, @_[1..3]);
  $chr2variant_locs{$c} = [] unless exists $chr2variant_locs{$c};
  push @{$chr2variant_locs{$c}}, $_[1];
  return $prop_info_key;
}

sub retrieve_vcf_lines($$$){
  my ($gerp_dir, $polyphen_file, $chr) = @_;

  my (%polyphen_lines, %gerp_lines);

  if(not exists $chr2variant_locs{$chr}){
    return ({}, {}); # no data requested for this chromosome
  }

  # build up the request
  my @tabix_regions;
  my @var_locs = grep /^\d+$/, @{$chr2variant_locs{$chr}}; # grep keeps point mutations only
  if(not @var_locs){
    return ({}, {}); # no point mutations 
  }

  # sort by variant start location
  for my $var_loc (sort {$a <=> $b} @var_locs){
    push @tabix_regions, $chr.":".$var_loc."-".$var_loc;
  }
  my $regions = join(" ", @tabix_regions);

  # retrieve the data
  die "Cannot find Polyphen2 VCF file $polyphen_file\n" if not -e $polyphen_file;
  open(VCF, "tabix $polyphen_file $regions |")
    or die "Cannot run tabix on $polyphen_file: $!\n";
  while(<VCF>){
    if(/^\S+\t(\d+)/ and grep {$_ eq $1} @var_locs){ # take only main columns to save room, if possible
      chomp;
      $polyphen_lines{$1} = [] unless exists $polyphen_lines{$1};
      push @{$polyphen_lines{$1}}, $_;
    }
  }
  close(VCF);

  my $vcf_file = "$gerp_dir/chr$chr.tab.gz";
  if(not -e $vcf_file){
    warn "Cannot find GERP VCF file $vcf_file\n" if not $quiet;
    return ({}, {});
  }
  open(VCF, "tabix $vcf_file $regions |")
    or die "Cannot run tabix on $vcf_file: $!\n";
  while(<VCF>){
    if(/^(\S+\t(\d+)(?:\t\S+){2})/ and grep {$_ eq $2} @var_locs){ # take only main columns to save room, if possible
      $gerp_lines{$2} = [] unless exists $gerp_lines{$2};
      push @{$gerp_lines{$2}}, $1;
    }
  }
  close(VCF);

  return (\%polyphen_lines, \%gerp_lines);
}

sub polyphen_info($$$){
  my ($chr, $pos, $variant_base) = @_;
    
  my $key = "$chr:$pos:$variant_base";
  if(exists $polyphen_info{$key}){
      return @{$polyphen_info{$key}};
  }

  #print STDERR "Checking for polyphen data for $chr, $pos, $variant_base\n";
  if(exists $chr2polyphen_vcf_lines{$chr}->{$pos}){
    for(@{$chr2polyphen_vcf_lines{$chr}->{$pos}}){
      #print STDERR "Checking $chr, $pos, $variant_base against $_";
      my @fields = split /\t/, $_;
      if($fields[4] eq $variant_base){
	  if($fields[6] eq "D"){
	      $polyphen_info{$key} = [$fields[5],"deleterious"];
              last;
	  }
	  elsif($fields[6] eq "P"){
	      $polyphen_info{$key} = [$fields[5],"poss. del."];
              last;
	  }
	  elsif($fields[6] eq "B"){
	      $polyphen_info{$key} = [$fields[5],"benign"];
              last;
	  }
	  else{
	      $polyphen_info{$key} = [$fields[5],$fields[6]];
              last;
	  }
      }
    }
    if(not exists $polyphen_info{$key}){
      $polyphen_info{$key} = ["NA", "NA"];
    }
  }
  else{
    $polyphen_info{$key} = ["NA", "NA"];
  }
  return @{$polyphen_info{$key}};
}

sub gerp_info($$){
    my ($chr,$pos) = @_;

    if(exists $chr2gerp_vcf_lines{$chr}->{$pos}){
      for(@{$chr2gerp_vcf_lines{$chr}->{$pos}}){
        my @fields = split /\t/, $_;
        return $fields[3];
      }
    }
    return "NA";
}

sub predict_splicing{
  my ($chr, $pos, $ref, $var, $strand, $exon_edge_distance, $splice_type) = @_;
  
  my $disruption_level = 0;
  my @disruption_details;
  # The methods being used only look for up to a 30 base context, so only 
  # need to check for obliterated acceptors/donors if they are nearby
  $exon_edge_distance-- if $exon_edge_distance > 0; # +1 is actually in the right spot, so decrement
  my $edge_offset = $strand eq "-" ? -1*$exon_edge_distance : $exon_edge_distance; # flip offset sign if the call was on the negative strand (since +/- was relative to the transcript, not the genome coordinates)
  my $annotated_detected = 0;
  my ($genesplicer_orig, $genesplicer_orig_pos) = call_genesplicer($splice_type, $chr, $strand, $pos-$edge_offset, $ref);
  if($genesplicer_orig != 0){
    $annotated_detected = 1;
  }
  my ($maxentscan_orig, $maxentscan_orig_pos) = call_maxentscan($splice_type, $chr, $strand, $pos-$edge_offset, $ref);
  if(not $annotated_detected and $maxentscan_orig){
    $annotated_detected = 1;
  }
  my ($genesplicer_mod, $genesplicer_mod_pos) = call_genesplicer($splice_type, $chr, $strand, $pos-$edge_offset, $ref, $var, $exon_edge_distance);
  if($genesplicer_mod != $UNAFFECTED){ # was the mod within the range where this software could detect up or downstream effects on the annotated splice site?
    if($genesplicer_orig_pos != $genesplicer_mod_pos){
      $disruption_level+=0.75;
      if($genesplicer_mod_pos == -1){
        push @disruption_details, "Variant obliterates GeneSplicer predicted splice site from the reference sequence ($chr:$genesplicer_orig_pos, orig score $genesplicer_orig)";
      }
      elsif($genesplicer_orig_pos != -1){
        push @disruption_details, "Variant moves GeneSplicer preferred splice site from annotated location $chr:$genesplicer_orig_pos to $chr:$genesplicer_mod_pos (scores: orig=$genesplicer_orig, new=$genesplicer_mod)";
      }
    }
    elsif($genesplicer_orig-$genesplicer_mod > 1){
      $disruption_level+=0.5;
      push @disruption_details, "Variant significantly lowers GeneSplicer score for original $splice_type splice site located at $chr:$genesplicer_mod_pos ($genesplicer_mod vs. $genesplicer_orig)";
    }
    elsif($genesplicer_orig > $genesplicer_mod){
      $disruption_level+=0.25;
      push @disruption_details, "Variant slightly lowers GeneSplicer score for original $splice_type splice site located at $chr:$genesplicer_mod_pos ($genesplicer_mod vs. $genesplicer_orig)";
    }
    $genesplicer_orig = $genesplicer_mod; # so that splice site comparison vs. cryptic is accurate
  }
  my ($maxentscan_mod, $maxentscan_mod_pos) = call_maxentscan($splice_type, $chr, $strand, $pos-$edge_offset, $ref, $var, $exon_edge_distance);
  if($maxentscan_mod != $UNAFFECTED){
    if($maxentscan_mod_pos != $maxentscan_orig_pos){
      $disruption_level+=1;
      if($maxentscan_mod_pos == -1){
        push @disruption_details, "Variant obliterates MaxEntScan predicted splice site from the reference sequence ($chr:$maxentscan_orig_pos, orig score $maxentscan_orig)";
      }
      elsif($maxentscan_orig_pos != -1){
        push @disruption_details, "Variant moves MaxEntScan preferred splice site from annotated location $chr:$maxentscan_orig_pos to $chr:$maxentscan_mod_pos (scores: orig=$maxentscan_orig, new=$maxentscan_mod)";
      }
    }
    elsif($maxentscan_orig-$maxentscan_mod > 1){
      $disruption_level+=0.5;
      push @disruption_details, "Variant significantly lowers MaxEntScan score for original $splice_type splice site located at $chr:$maxentscan_mod_pos ($maxentscan_mod vs. $maxentscan_orig)";
    }
    elsif($maxentscan_orig>$maxentscan_mod){
      $disruption_level+=0.25;
      push @disruption_details, "Variant slightly lowers MaxEntScan score for original $splice_type splice site located at $chr:$maxentscan_mod_pos ($maxentscan_mod vs. $maxentscan_orig)";
    }
    $maxentscan_orig = $maxentscan_mod;
  }

  # Regardless of location, see if there is an increased chance of creating a cryptic splicing site
  my ($genesplicer_control, $genesplicer_control_pos) = call_genesplicer($splice_type, $chr, $strand, $pos, $ref);
  my ($genesplicer_cryptic, $genesplicer_cryptic_pos)  = call_genesplicer($splice_type, $chr, $strand, $pos, $ref, $var);
  if(defined $genesplicer_orig and $genesplicer_control < $genesplicer_orig and $genesplicer_orig < $genesplicer_cryptic){
    $disruption_level+=0.75;
    push @disruption_details, "Variant causes GeneSplicer score for a potential cryptic $splice_type splice site at $chr:$genesplicer_cryptic_pos to become higher than the annotated splice site $chr:$genesplicer_orig_pos (orig=$genesplicer_orig vs. var=$genesplicer_cryptic)";
  }
  if($genesplicer_control - $genesplicer_cryptic < -1){
    $disruption_level+=0.5;
    push @disruption_details, "Variant raises GeneSplicer score for a potential cryptic $splice_type splice site (orig=$genesplicer_control vs. var=$genesplicer_cryptic)";
  }
  my ($maxentscan_control, $maxentscan_control_pos) = call_maxentscan($splice_type, $chr, $strand, $pos, $ref);
  my ($maxentscan_cryptic, $maxentscan_cryptic_pos) = call_maxentscan($splice_type, $chr, $strand, $pos, $ref, $var);
  if(defined $maxentscan_orig and $maxentscan_control < $maxentscan_orig and $maxentscan_orig < $maxentscan_cryptic){
    $disruption_level++;
    push @disruption_details, "Variant causes MaxEntScan score for a potential cryptic $splice_type splice site at $chr:$maxentscan_cryptic_pos to become higher than the annotated splice site $chr:$maxentscan_orig_pos (orig=$maxentscan_orig vs. var=$maxentscan_cryptic)";
  }
  if($maxentscan_control - $maxentscan_cryptic < -1 and $maxentscan_cryptic > 0){
    $disruption_level+=0.5;
    push @disruption_details, "Variant raises MaxEntScan score for a potential cryptic $splice_type splice site (orig=$maxentscan_control vs. var=$maxentscan_cryptic)";
  }

  # If neither method predicted the annotated donor site, and the alternate site isn't predicted either, note that so the user doesn't think the mod is definite okay
  if(not @disruption_details and $maxentscan_orig < 0 and $genesplicer_orig < 0){
    $disruption_level+=0.5;
    push @disruption_details, "Neither the annotated splice site nor any potential cryptic site introduced by the variant are predicted by either GeneSplicer or MaxEntScan, therefore the alt splicing potential for this variant should be evaluated manually"; 
  }
 
  return ($disruption_level, join("; ", @disruption_details));
}

sub call_genesplicer{
  my($splice_type, $chr, $strand, $pos, $ref, $var, $var_offset) = @_;

  if($splice_type eq "acceptor"){
    $pos += $strand eq  "-" ? 2: -2;
  }
  my $num_flanking = 110; # seems to croak if less than 160bp provided
  my $indel_size = defined $var ? length($var) - length($ref) : 0; # - is del, + is ins
  my $site_window = defined $var_offset ? 2 : ($indel_size > 0 ? 29+$indel_size : 29); # exact position if looking for exisitng site effect, otherwise 10 or 10 + the insert size 
  my $cmd = "genesplicer";
  my $start = $pos - $num_flanking;
  $start = 1 if $start < 1;
  my $stop = $pos + $num_flanking + ($indel_size < 0 ? -1*$indel_size:0); # add extra bases if a del so we still have 2*$num_flanking after the edit
  $stop++; # end coordinate is exclusive
  return ($UNAFFECTED, -1) if defined $var_offset and ($pos+$var_offset < $start or $pos+$var_offset > $stop); # variant is outside of range we can detect affect on original splice site
  my $seq = $fasta_index->fetch("$chr:$start-$stop");
  if(defined $var){
    if(defined $var_offset){ # substitution away from the site of interest
      substr($seq, $num_flanking+$var_offset, length($ref)) = $var;
    }
    else{ # substitution at the site of interest
      substr($seq, $num_flanking, length($ref)) = $var;
    }
  }
  if($strand eq "-"){
    $seq = reverse($seq);
    $seq =~ tr/ACGTacgt/TGCAtgca/;
  }
  $tmpfile = "$$.fasta";
  open(TMP, ">$tmpfile") or die "Could not open $tmpfile for writing: $!\n";
  print TMP ">$chr $strand $pos $ref",($var?" $var":""), ($var_offset?" $var_offset":""),"\n$seq\n";
  substr($seq, $num_flanking, 2) = lc(substr($seq, $num_flanking, 2));
  #print STDERR "$chr $strand $pos $ref $var $var_offset: $seq\n";
  close(TMP);
  #print STDERR "$cmd $tmpfile /export/achri_data/programs/GeneSplicer/human\n";
  #<STDIN>;
  open(GS, "$cmd $tmpfile human 2> /dev/null|")
    or die "Could not run $cmd: $!\n";
  my $highest_score = 0;
  my $highest_position = -1;
  while(<GS>){
    # End5 End3 Score "confidence" splice_site_type
    # E.g. :
    # 202 203 2.994010 Medium donor
    chomp;
    my @F = split /\s+/, $_;
    next if $F[1] < $F[0]; # wrong strand
    if($splice_type eq $F[4]){
      if(abs($F[0]-$num_flanking-1) <= $site_window){ # right type and approximate location
        #print STDERR "*$_\n";
        if($F[2] > $highest_score){
          $highest_score = $F[2];
          # last bit accounts for indels in the position offset
          $highest_position = $pos + ($strand eq "-" ? -1: 1)*(-$num_flanking + $F[0] - 1) + (($F[0] <= $num_flanking && $strand eq "+" or $F[0] >= $num_flanking && $strand eq "-") ? -$indel_size : 0);
        }
      }
    } 
    #else{print STDERR $_,"\n";}
  }
  close(GS);
  return ($highest_score, $highest_position);
}

sub call_maxentscan{
  my($splice_type, $chr, $strand, $pos, $ref, $var, $var_offset) = @_;

  my $indel_size = defined $var ? length($var) - length($ref) : 0; # - is del, + is ins
  my $cmd;
  my ($num_flanking5, $num_flanking3);
  my ($region5, $region3);
  my ($start, $stop);
  my $highest_score = -9999;
  my $highest_position = -1;
  # note that donor and acceptor are very different beasts for MaxEntScan
  if($splice_type eq "acceptor"){
    if($strand eq "-"){
      $pos += 2;
    }
    else{
      $pos -= 2;
    }
    $cmd = "score3.pl";
    $num_flanking5 = 18;
    $num_flanking3 = 4;
  }
  else{ # donor
    $cmd = "score5.pl";
    $num_flanking5 = 3; 
    $num_flanking3 = 5;
  }
  # when var_offset is defined, we are looking only at the original splice site so no need to look in the general region
  $region5 = defined $var_offset ? 0 : $num_flanking5; # when var_offset is defined, we are looking only at the original splice site so no need to look in the general region
  $region3 = defined $var_offset ? 0 : $num_flanking3;
  if($strand eq "-"){
    $start = $pos - $num_flanking3 - $region3;
    $stop = $pos + $num_flanking5 + $region5 + ($indel_size < 0 ? -1*$indel_size:0);
    #print STDERR "Fetching $chr:$start-$stop ($pos - $num_flanking3 - $region3)\n";
  }
  else{
    $start = $pos - $num_flanking5 - $region5;
    $stop = $pos + $num_flanking3 + $region3 + ($indel_size < 0 ? -1*$indel_size:0); # add extra bases if a del so we still have 23 or 9 bases as the case may be
    #print STDERR "Fetching $chr:$start-$stop ($pos - $num_flanking5 - $region5)\n";
  }
  if(defined $var_offset and (
     $strand eq "+" and ($pos+$var_offset < $start or $pos+$var_offset > $stop) or 
     $strand eq "-" and ($pos-$var_offset < $start or $pos-$var_offset > $stop))){ # variant is outside of range we can detect affect on original splice site
    return ($UNAFFECTED, -1) 
  }

  my $seq = $fasta_index->fetch("$chr:$start-$stop");
  my @subseqs;
  if(defined $var){
      if(defined $var_offset){ # substitution away from the site of interest
        #print STDERR "Subbing in string of length ", length($seq), "near splice site $ref -> $var at ", ($strand eq "-" ? "$region3+$num_flanking3-$var_offset":"$region5+$num_flanking5+$var_offset"), "\n";
        substr($seq, ($strand eq "-" ? $region3+$num_flanking3-$var_offset:$region5+$num_flanking5+$var_offset), length($ref)) = $var;
      }
      else{ # substitution at the site of interest
        substr($seq, ($strand eq "-" ? $region3+$num_flanking3:$region5+$num_flanking5), length($ref)) = $var;
      }
  }
  if($strand eq "-"){
      $seq = reverse($seq);
      $seq =~ tr/ACGTacgt/TGCAtgca/;
  }
  # get all of the 23 or 9 base pair subsequences, depending on whether it's donors or acceptors we are looking for
  for(my $i = 0; $i+$num_flanking5+$num_flanking3+1 <= length($seq); $i++){
      push @subseqs, substr($seq, $i, $num_flanking5+$num_flanking3+1);
  }
  my $pid = open2(\*CHILD_OUT, \*CHILD_IN, "$cmd -");
  $pid or die "Could not run $cmd: $!\n";
  print CHILD_IN join("\n",@subseqs);
  close(CHILD_IN);
  while(<CHILD_OUT>){
      chomp;
      # out is just "seq[tab]score"
      my @F = split /\s+/, $_;
      if($F[1] > $highest_score){
        #print STDERR ">";
        $highest_score = $F[1];
        # since we printed the subseqs in order, $. (containing the line number of the prediction output) gives us the offset in the original seq
        $highest_position = $start + ($strand eq "-" ? length($seq) - $num_flanking5 - $. : $num_flanking5 + $. -1)+(($. <= $num_flanking5+$region5 && $strand eq "+" or $. >= $num_flanking5+$region5 && $strand eq "-") ? -$indel_size : 0);
      }
      #print STDERR $_,"\n";
  }
  close(CHILD_OUT);
  waitpid($pid, 0);

  return ($highest_score, $highest_position);
}

sub get_range_info($$$$){
   my ($chr2ranges, $chr, $start, $stop) = @_;

   if(not exists $chr2ranges->{$chr}){
     return {};
   }
   my $range_key = "$chr:$start:$stop";

   # Use a binary search to find the leftmost range of interest
   my $left_index = 0;
   my $right_index = $#{$chr2ranges->{$chr}};
   my $cursor = int($right_index/2);
   while($left_index != $right_index and $left_index != $cursor and $right_index != $cursor){
     # need to go left
     if($chr2ranges->{$chr}->[$cursor]->[0] >= $start){
       $right_index = $cursor;
       $cursor = int(($left_index+$cursor)/2);
     }
     # need to go to the right
     elsif($chr2ranges->{$chr}->[$cursor]->[1] <= $stop){
       $left_index = $cursor;
       $cursor = int(($right_index+$cursor)/2);
     }
     else{
       $right_index = $cursor;
       $cursor--; # in the range, go left until not in the range any more
     }
   }

   my %names;
   for (; $cursor <= $#{$chr2ranges->{$chr}}; $cursor++){
      my $range_ref = $chr2ranges->{$chr}->[$cursor];
      if($range_ref->[0] > $stop){
        last;
      }
      elsif($range_ref->[0] <= $stop and $range_ref->[1] >= $start){
        $names{$range_ref->[2]}++;
      }
   }

   #print STDERR "Names for range $chr:$start-$stop are ", join("/", keys %names), "\n";
   return \%names;  
}

if(not -d $sift_dir){
    die "The specified SIFT directory $sift_dir does not exist\n";
}	

print STDERR "Reading in OMIM morbid map...\n" unless $quiet;
die "Data file $morbidmap_file does not exist, aborting.\n" if not -e $morbidmap_file;
my %gene2morbids;
open(TAB, $morbidmap_file)
  or die "Cannot open OMIM morbid map file $morbidmap_file for reading: $!\n";
while(<TAB>){
  my @fields = split /\|/, $_;
  #print STDERR "got ", ($#fields+1), " fields in $_\n";
  #my @fields = split /\|/, $_;
  my $morbid = $fields[0];
  $morbid =~ s/\s*\(\d+\)$//; # get rid of trailing parenthetical number
  my $omim_id = $fields[2];
  for my $genename (split /, /, $fields[1]){
    if(not exists $gene2morbids{$genename}){
      $gene2morbids{$genename}  = "OMIM:$omim_id $morbid";
    }
    else{
      $gene2morbids{$genename} .= "; $morbid";
    }
  }
}
close(TAB);

print STDERR "Reading in interpro mappings...\n" unless $quiet;
die "Data file $interpro_bed_file does not exist, aborting.\n" if not -e $interpro_bed_file;
my %interpro_ids;
open(TAB, $interpro_bed_file)
  or die "Cannot open gene name BED file $interpro_bed_file for reading: $!\n";
while(<TAB>){
  chomp;
  # format should be "chr start stop interpro desc..."
  my @fields = split /\t/, $_;
  my $c = $fields[0];
  if(not exists $interpro_ids{$c}){
     $interpro_ids{$c} = [] unless exists $interpro_ids{$c};
     $interpro_ids{"chr".$c} = [] unless $c =~ /^chr/;
     $interpro_ids{$1} = [] if $c =~ /^chr(\S+)/;
  }
  my $dataref = [@fields[1..3]];
  push @{$interpro_ids{$c}}, $dataref;
  push @{$interpro_ids{"chr".$c}}, $dataref unless $c =~ /^chr/;
  push @{$interpro_ids{$1}}, $dataref if $c =~ /^chr(\S+)/;
}
close(TAB);
print STDERR "Sorting interpro matches per contig...\n" unless $quiet;
for my $chr (keys %interpro_ids){
  $interpro_ids{$chr} = [sort {$a->[0] <=> $b->[0]} @{$interpro_ids{$chr}}];
}
 
# {uc(gene_name) => space separated info}
print STDERR "Reading in tissue info...\n" unless $quiet;
my %tissues;
my %tissue_desc;
open(TISSUE, $tissue_file)
    or die "Cannot open $tissue_file for reading: $!\n";
while(<TISSUE>){
    chomp;
    my @fields = split /\t/, $_;
    my $gname = uc($fields[1]);
    $tissue_desc{$gname} = $fields[2];
    next unless $#fields == 4; 
    my @tis = split /;\s*/, $fields[4];
    pop @tis;
    if(@tis < 6){
	$tissues{$gname} = join(">", @tis);
    }
    elsif(@tis < 24){
	$tissues{$gname} = join(">", @tis[0..5]).">...";
    }
    else{
	$tissues{$gname} = join(">", @tis[0..(int($#tis/5))]).">...";
    }
}
close(TISSUE);

print STDERR "Loading pathway info...\n" unless $quiet;
my %pathways;
my %pathway_ids;
open(PATHWAY, $pathway_file)
    or die "Cannot open $pathway_file for reading: $!\n";
while(<PATHWAY>){
    chomp;
    my @fields = split /\t/, $_;
    next unless @fields >= 4;
    for my $gname (split /\s+/, $fields[1]){ 
      $gname = uc($gname);
      if(not exists $pathways{$gname}){
	  $pathway_ids{$gname} = $fields[2];
	  $pathways{$gname} = $fields[3];
      }
      else{
	  $pathway_ids{$gname} .= "; ".$fields[2];
	  $pathways{$gname} .= "; ".$fields[3];
      }
   }
}
close(PATHWAY);

print STDERR "Loading SIFT indices...\n" unless $quiet;
# Read in the database table list
my %chr_tables;
open(DBLIST, "$sift_dir/bins.list")
    or die "Cannot open $sift_dir/bins.list for reading: $!\n";
while(<DBLIST>){
    chomp;
    my @fields = split /\t/, $_;
    if(not exists $chr_tables{$fields[1]}){
	$chr_tables{$fields[1]} = {};
    }
    my $chr = $fields[1];
    $chr = "chr$chr" unless $chr =~ /^chr/;
    $chr_tables{$chr}->{"$chr\_$fields[2]_$fields[3]"} = [$fields[2],$fields[3]];
}
close(DBLIST);

#Connect to database
my $db_gene = DBI->connect("dbi:SQLite:dbname=$sift_dir/Human_Supp.sqlite","","",{RaiseError=>1, AutoCommit=>1});

my $db_chr;
my $fr_stmt; # retrieval for frameshift
my $snp_stmt; # retrieval for SNP
my $cur_chr = "";
my $cur_max_pos;

if(defined $predict_cryptic_splicings){
  if(not -e $predict_cryptic_splicings){
    die "Reference FastA file for splicing predictions, $predict_cryptic_splicings, does not exist.\n";
  }
  if(not -e $predict_cryptic_splicings.".fai" and not -w dirname($predict_cryptic_splicings)){
    die "Reference FastA file for splicing predictions, $predict_cryptic_splicings, is not indexed, and the directory is not writable.\n";
  }
  print STDERR "Loading reference FastA index...\n" unless $quiet;
  $fasta_index = Bio::DB::Sam::Fai->load($predict_cryptic_splicings);
}

print STDERR "Annotating variants...\n" unless $quiet;
# Assume contigs and positions in order
print STDERR "Processing file $hgvs_table_file...\n" unless $quiet;
open(HGVS, $hgvs_table_file)
    or die "Cannot open HGVS file $hgvs_table_file for reading: $!\n";
# Output file for annotated snps and frameshifts
my $header = <HGVS>; # header
chomp $header;
my @headers = split /\t/, $header;
my ($chr_column, $pos_column, $ref_column, $alt_column, $alt_cnt_column, $tot_cnt_column, $ftype_column, $dbid_column, $maf_src_column, $maf_column, $internal_freq_column, 
    $cdna_hgvs_column, $aa_hgvs_column, $transcript_column, $transcript_length_column, $strand_column, $zygosity_column, $splice_dist_column, $caveats_column, $phase_column,
    $srcs_column, $pvalue_column, $to_column, $genename_column, $rares_column, $rares_header);
my %req_columns = (
	"Chr" => \$chr_column,
	"DNA From" => \$pos_column,
	"DNA To" => \$to_column,
	"Gene Name" => \$genename_column,
	"Ref base" => \$ref_column,
	"Obs base" => \$alt_column,
	"Variant Reads" => \$alt_cnt_column,
	"Total Reads" => \$tot_cnt_column,
	"Feature type" => \$ftype_column,
	"Variant DB ID" => \$dbid_column,
	"Pop. freq. source" => \$maf_src_column,
	"Pop. freq." => \$maf_column,
	"Transcript HGVS" => \$cdna_hgvs_column,
	"Protein HGVS" => \$aa_hgvs_column,
        "Selected transcript" => \$transcript_column,
	"Transcript length" => \$transcript_length_column,
	"Strand" => \$strand_column,
	"Zygosity" => \$zygosity_column,
	"Closest exon junction (AA coding variants)" => \$splice_dist_column,
	"Caveats" => \$caveats_column,
	"Phase" => \$phase_column,
        "P-value" => \$pvalue_column);
&load_header_columns(\%req_columns, \@headers);
for(my $i = 0; $i <= $#headers; $i++){
  # Optional columns go here
  if($headers[$i] eq "Sources"){
    $srcs_column = $i;
  }
  elsif($headers[$i] eq "Internal pop. freq."){
    $internal_freq_column = $i;
  }
  elsif($headers[$i] =~ /^Num rare variants/){
    $rares_column = $i;
    $rares_header = $headers[$i];
  }
}
open(OUT, ">$outfile")
    or die "Cannot open $outfile for writing: $!\n";
print OUT join("\t", "Chr", "DNA From", "DNA To", "Ref base", "Obs base", "% ref read","% variant read", 
	   "Variant Reads", "Total Reads", "Feature type","Variant type", "Variant context", "Variant DB ID", "Pop. freq. source", 
	   "Pop. freq."), (defined $internal_freq_column ? "\tInternal pop. freq.\t" : "\t"), join("\t", "SIFT Score", "Polyphen2 Score", "Polyphen2 call", "GERP score",
	   "Gene Name", "Transcript HGVS",
	   "Protein HGVS", "Zygosity","Closest exon junction (AA coding variants)","Splicing alteration potential","Splicing alteration details",
	   "OMIM Morbid Map","Tissue expression", "Pathways", "Pathway refs",
	   "Gene Function", "Spanning InterPro Domains", "P-value", "Caveats", "Phase", 
           "Selected transcript", "Transcript length", "Strand"),
           (defined $rares_column ? "\t$rares_header" : ""),
           (defined $srcs_column ? "\tSources" : ""), "\n";
while(<HGVS>){
    chomp;
    my @fields = split /\t/, $_, -1;
    my $mode;
    my $protein_hgvs = $fields[$aa_hgvs_column];
    my $refSeq = $fields[$ref_column];
    my $newBase = $fields[$alt_column];
    my $splicing_potential = "NA";
    my $splicing_details = "NA";
    if(defined $predict_cryptic_splicings){
      my $distance = $fields[$splice_dist_column]; # only for coding variants
      my $site_type;
      if($distance eq "NA"){
      }
      elsif($distance){
        $site_type = $distance < 0 ? "donor" : "acceptor"; # exon context
      }
      else{
        if($fields[$cdna_hgvs_column] =~ /\d+([\-+]\d+)/){ # get from the cDNA HGVS otherwise
          $distance = $1;
          $distance =~ s/^\+//;
          $site_type = $distance > 0 ? "donor" : "acceptor"; # intron context
        }
      }
      # only do splicing predictions for novel or rare variants, since these predictions are expensive
      if($distance and $distance ne "NA" and ($fields[$maf_column] eq "NA" or $fields[$maf_column] <= 0.01)){
        #print STDERR "Running prediction for rare variant ($fields[3], MAF=$fields[14]): $fields[5], $fields[6], $refSeq, $newBase, $fields[4], $distance, $site_type\n";
        ($splicing_potential, $splicing_details) = &predict_splicing($fields[$chr_column], $fields[$pos_column], $refSeq, $newBase, $fields[$strand_column], $distance, $site_type);
      }
    }
    my $lookupBase;
    if($fields[$strand_column] eq "-"){
      if($refSeq ne "NA"){
        $refSeq = reverse($refSeq);
        $refSeq =~ tr/ACGTacgt/TGCAtgca/;
      }
      if($newBase ne "NA"){
        $newBase = reverse($newBase);
        $newBase =~ tr/ACGTacgt/TGCAtgca/;
        $newBase =~ s(TN/)(NA/);
        $newBase =~ s(/TN)(/NA);
      }
    }
    if($fields[$cdna_hgvs_column] =~ /^g\..*?(?:ins|delins|>)([ACGTN]+)$/){
      $mode = "snps";
    }
    elsif($fields[$cdna_hgvs_column] =~ /^g\..*?del([ACGTN]+)$/){
      $mode = "snps";
    }
    elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+_\d+(?:[\-+]\d+)?ins([ACGTN]+)$/ or $fields[$cdna_hgvs_column] =~ /^c\.\d+(?:[\-+]\d+)?_\d+ins([ACGTN]+)$/){
        if(not length($1)%3){
            $mode = "snps";
        }
        else{
            $mode = "frameshifts";
        }
    }
    elsif($fields[$cdna_hgvs_column] =~ /^c\.(\d+)(?:[\-+]\d+)?_(\d+)(?:[\-+]\d+)?delins([ACGTN]+)$/){
	if(not (length($3)-$2+$1-1)%3){
	    $mode = "snps";
	}
	else{
	    $mode = "frameshifts";
	}
    }
    elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+[+\-]\d+_\d+[+\-]\d+delins([ACGTN]+)$/ or
          $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?_[\-*]\d+(?:[\-+]\d+)?del([ACGTN]*)$/ or 
          $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?del([ACGTN]*)$/ or 
          $fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+del([ACGTN]*)$/ or
          $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?_[\-*]\d+(?:[\-+]\d+)?(?:ins|delins)?([ACGTN]+)$/){
	$mode = "snps";
    }
    elsif($fields[$cdna_hgvs_column] =~ /^c\.([-*]?\d+)del(\S+)/){
        my $dist = $1;
        $dist =~ s/^\*//;
        if(abs($dist) < 4){
          $mode = "snps";
        }
        else{ 
          $mode = "frameshifts";
        }
    }
    elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+_\d+[\-+]\d+ins(\S*)/ or
          $fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+_\d+[\-+]\d+del(\S*)/){
        $mode = "snps";
    }
    elsif($fields[$cdna_hgvs_column] =~ /^c\.(-?\d+)_(\d+)(?:\+\d+)?del(\S+)/ or 
          $fields[$cdna_hgvs_column] =~ /^c\.(\d+)(?:\-\d+)?_(\d+)del(\S+)/){
	if(not ($2-$1+1)%3){
	    $mode = "snps";
	}
	else{
	    $mode = "frameshifts";
	}
    }
    elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+_\d+ins([ACGTN]+)$/){
	if(not length($1)%3){
	    $mode = "snps";
	}
	else{
	    $mode = "frameshifts";
	}
    }
    # special case of shifted start codon
    elsif($fields[$cdna_hgvs_column] =~ /^c\.-1_1+ins([ACGTN]+)$/ or
          $fields[$cdna_hgvs_column] =~ /inv$/){
        $mode = "snps";
    }
    elsif($fields[$cdna_hgvs_column] =~ /[ACGTN]>([ACGTN])$/){
	# todo: handle multiple consecutive substitutions, will have to run polyphen separately	
        $lookupBase = $newBase;
	if($fields[$strand_column] eq "-"){ # revert to positive strand variant for SIFT index
	    $lookupBase =~ tr/ACGTacgt/TGCAtgca/;
        }
	$mode = "snps";
    }
    else{
	$mode = "snps";
    }
    if($fields[$chr_column] ne $cur_chr){
	#Prepared DB connection on per-chromosome basis
	$cur_chr = $fields[$chr_column];
	$cur_chr =~ s/^chr//;
	$db_chr = DBI->connect("dbi:SQLite:dbname=$sift_dir/Human_CHR$cur_chr.sqlite",
			       "", 
			       "",
			       { RaiseError => 1, AutoCommit => 1 }) unless not -e "$sift_dir/Human_CHR$cur_chr.sqlite";
	
	$cur_max_pos = -1;
    }
    if($fields[$pos_column] =~ /^\d+$/ and $cur_max_pos < $fields[$pos_column]){
	undef $fr_stmt;
	undef $snp_stmt;
        my $c = $fields[$chr_column];
        $c = "chr$c" if $c !~ /^chr/;
	for my $chr_table_key (keys %{$chr_tables{$c}}){
	    my $chr_table_range = $chr_tables{$c}->{$chr_table_key};
	    if($chr_table_range->[0] <= $fields[$pos_column] and $chr_table_range->[1] >= $fields[$pos_column]){
		$fr_stmt = $db_chr->prepare("select SCORE".
					    " from $chr_table_key where COORD1 = ?");
		$snp_stmt = $db_chr->prepare("select SCORE".
					     " from $chr_table_key where COORD1 = ? AND NT2 = ?");
		$cur_max_pos = $chr_table_range->[1];
		last;
	    }
	}
	if(not defined $fr_stmt and not $quiet){
	    warn "Got request for position not in range of SIFT ($c:$fields[$pos_column])\n";
	}
    }

    my @cols;
    if(not $fr_stmt){
       @cols = ("NA");
    }
    elsif($mode eq "frameshifts"){
	$fr_stmt->execute($fields[$pos_column]);
	@cols = $fr_stmt->fetchrow_array();
    }
    else{
	$snp_stmt->execute($fields[$pos_column]-1, $lookupBase);
	@cols = $snp_stmt->fetchrow_array();
    }
    # See if the change is in a CDS, and if it's non-synonymous 
    my ($type, $location) = ("silent", "intron");
    my $start = $fields[$pos_column];
    my $stop = $fields[$to_column];
    my $interpro = get_range_info(\%interpro_ids, $fields[$chr_column], $start, $stop); 
    my $domains = join("; ", sort keys %$interpro);

    my $sift_score = "NA";
    my $variant_key = "NA";
    my $transcript_hgvs = $fields[$cdna_hgvs_column];
    if($transcript_hgvs =~ /\.\d+[ACGTN]>/ or $transcript_hgvs =~ /\.\d+(?:_|del|ins)/ or $transcript_hgvs =~ /[+\-]\?/){
	$location = "exon";
    }
    elsif($transcript_hgvs =~ /\.\d+[\-+](\d+)/){
	if($1 < 21){
	    $type = "splice";
	}
	else{
	    $type = "intronic";
	}
	$location = "splice site";
    }
    elsif($transcript_hgvs =~ /\.\*\d+/){
	$location = "3'UTR";
    }
    elsif($transcript_hgvs =~ /\.-\d+/){
	$location = "5'UTR";
    }
    if($mode eq "frameshifts"){
	$type = "frameshift";
    }
    else{
	if(not defined $protein_hgvs){
            $type = "unknown";
        }
        elsif($protein_hgvs eq "NA"){
	    $type = "non-coding";
	}
	elsif($protein_hgvs =~ /=$/){
	    $type = "silent";
	}
	elsif($protein_hgvs =~ /\d+\*/){ # nonsense
	    $type = "nonsense";
	}
	elsif($protein_hgvs =~ /ext/){ # nonsense
	    $type = "nonstop";
	}
        elsif($protein_hgvs eq "p.0?" or $protein_hgvs eq "p.?"){
            $type = "unknown";
        }
	else{
	    $type = "missense";
	}

	$sift_score = $cols[0] || "NA";

        # Record the need to fetch VCF polyphen and gerp data later (en masse, for efficiency)
        # Use newBase instead of lookupBase for PolyPhen since the orientation is always relative to the transcript
	$variant_key = get_variant_key($fields[$chr_column], $fields[$pos_column], $fields[$ref_column], $newBase);
        #print STDERR "Variant key is $variant_key\n";
    }
    my $transcript_type = $fields[$ftype_column];
    my $transcript_length = $fields[$transcript_length_column];
    my $transcript_id = $fields[$transcript_column]; # sift score is derived from this transcript
    my $transcript_strand = $fields[$strand_column];
    my $zygosity = $fields[$zygosity_column];
    my $rsID = $fields[$dbid_column];
    my $popFreq = $fields[$maf_column];
    my $internalFreq;
    $internalFreq = $fields[$internal_freq_column] if defined $internal_freq_column;
    my $popFreqSrc = $fields[$maf_src_column];

    # Get the gene name and omim info
    my %seen;
    my @gene_names = split /; /, $fields[$genename_column]; 
    my $morbid = join("; ", grep {/\S/ and not $seen{$_}++} map({$gene2morbids{$_} or ""} @gene_names));
    my $tissue = join("; ", grep {/\S/ and not $seen{$_}++} map({$tissues{$_} or ""} @gene_names));
    my $function = join("; ", grep {/\S/ and not $seen{$_}++} map({$tissue_desc{$_} or ""} @gene_names));
    my $pathways = join("; ", grep {/\S/ and not $seen{$_}++} map({$pathways{$_} or ""} @gene_names));
    my $pathway_ids = join("; ", grep {/\S/ and not $seen{$_}++} map({$pathway_ids{$_} or ""} @gene_names));

    # It may be that the calls and freqs are multiple concatenated values, e.g. "X; Y"
    my @tot_bases = split /; /,$fields[$tot_cnt_column];
    my @var_bases = split /; /,$fields[$alt_cnt_column];
    my (@pct_nonvar, @pct_var);
    for (my $i = 0; $i <= $#tot_bases; $i++){
        push @pct_nonvar, $tot_bases[$i] =~ /^(NA|0)$/ ? $tot_bases[$i] : int(($tot_bases[$i]-$var_bases[$i])/$tot_bases[$i]*100+0.5);
        push @pct_var, $tot_bases[$i] =~ /^(NA|0)$/ ? $tot_bases[$i] : int($var_bases[$i]/$tot_bases[$i]*100+0.5);
    }

    push @lines, [
		   $fields[$chr_column], # chr
		   $fields[$pos_column], # pos
		   $fields[$to_column], 
		   $fields[$ref_column],
		   $fields[$alt_column],
		   join("; ", @pct_nonvar), # pct ref
		   join("; ", @pct_var),  # pct var
		   $fields[$alt_cnt_column], # num var
		   $fields[$tot_cnt_column], # num reads
		   $transcript_type, # protein_coding, pseudogene, etc.
		   $type,
		   $location,		   
		   $rsID,
		   $popFreqSrc,
		   (defined $internal_freq_column ? ($popFreq,$internalFreq) : ($popFreq)),
		   $sift_score,
		   $variant_key, # 16th field
		   join("; ", @gene_names),
		   $transcript_hgvs,
		   $protein_hgvs,
		   $zygosity,
                   $fields[$splice_dist_column], # distance to exon edge
                   $splicing_potential,
                   $splicing_details,
                   $morbid,
		   $tissue,
		   $pathways,
		   $pathway_ids,
		   $function,
                   $domains,
                   $fields[$pvalue_column],
                   $fields[$caveats_column], # caveats
                   (defined $rares_column ? ($fields[$rares_column],$transcript_id) : ($transcript_id)),
		   $transcript_length,
                   $transcript_strand,
		   (defined $srcs_column ? ($fields[$phase_column],$fields[$srcs_column]) : ($fields[$phase_column]))];
}

print STDERR "Adding Polyphen2 and GERP info...\n" unless $quiet;
# retrieve the polyphen and gerp info en masse for each chromosome, as this is much more efficient
my $vkey_col = 16+(defined $internal_freq_column ? 1 : 0);
for my $line_fields (@lines){
  # expand the key into the relevant MAF values
  $line_fields->[$vkey_col] = join("\t", polyphen_gerp_info($gerp_dir,$polyphen_file,$line_fields->[$vkey_col])); # fields[$vkey_col] is variant key
  print OUT join("\t", @{$line_fields}), "\n";
}
close(OUT);   
unlink $tmpfile if defined $tmpfile;

sub load_header_columns{
  my ($reqs_hash_ref, $headers_array_ref) = @_;
  my %unfulfilled;
  for my $field_name (keys %$reqs_hash_ref){
    $unfulfilled{$field_name} = 1;
  }
  for(my $i = 0; $i <= $#{$headers_array_ref}; $i++){
    for my $req_header_name (keys %$reqs_hash_ref){
      if($req_header_name eq $headers_array_ref->[$i]){
        ${$reqs_hash_ref->{$req_header_name}} = $i;
        delete $unfulfilled{$req_header_name};
        last;
      }
    }
  }
  if(keys %unfulfilled){
    die "Aborting. Could not find headers in the input file for the following required fields: ", join(", ", sort keys %unfulfilled), "\n";
  }
}