Mercurial > repos > yusuf > annotate_hgvs
changeset 0:9977d1935a07 default tip
initial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:10:47 -0600 |
parents | |
children | |
files | AnnotateVariants.xml hgvs_table_annotate tool-data/annotate_hgvs.loc |
diffstat | 3 files changed, 1059 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/AnnotateVariants.xml Wed Mar 25 13:10:47 2015 -0600 @@ -0,0 +1,29 @@ +<?xml version="1.0"?> + +<tool id="hgvs_annotation_2" name="Annotate an HGVS table"> + <requirements> + <requirement type="package">genesplicer</requirement> + <requirement type="package">human</requirement> + <requirement type="package">score3.pl</requirement> + <requirement type="package">score5.pl</requirement> + </requirements> + + <description>with deleterious AA and splicing change predictions, protein domain disruptions, etc.</description> + <version_string>hgvs_table_annotate -v</version_string> + <command interpreter="perl">hgvs_table_annotate $__tool_data_path__ -q /sift/hg19 /polyphen2/hg19.txt.gz /dbs/gerp/hg19 /dbs/TissueDistributionDBs/human.v2009-07-30.tab /dbs/pathways/KEGG.human.2012-09-25.txt /dbs/interpro_supermatch_hg19.bed /dbs/OMIM/morbidmap $input_hgvs_table $out_hgvs_annotated_table /hg19.fa</command> + <inputs> + <param format="achri_snp_table" name="input_hgvs_table" type="data" label="Variant calls table with HGVS syntax"/> + </inputs> + <outputs> + <data format="achri_annotated_snp_table" name="out_hgvs_annotated_table" type="data" label="Functionally annotated HGVS variant table"/> + </outputs> + + <tests> + </tests> + + <help> +This tool reports several functional attributes, and potential for functional disturbance, based on genes that have declared sequence variants. +These results can be used to help find the genetic cause of a clinical phenotype. + </help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgvs_table_annotate Wed Mar 25 13:10:47 2015 -0600 @@ -0,0 +1,1029 @@ +#!/usr/bin/env perl + +use Bio::DB::Sam; +use DBI; +use IPC::Open2; +use Set::IntervalTree; +use strict; +use warnings; +use vars qw($quiet %chr2variant_locs %polyphen_info %chr2polyphen_vcf_lines %chr2gerp_vcf_lines %range2genes $fasta_index $UNAFFECTED $tmpfile); +use File::Basename; +$UNAFFECTED = -299999; # sentinel for splice site predictions +my $current_directory = dirname(__FILE__); +if(@ARGV == 1 and $ARGV[0] eq "-v"){ + print "Version 1.0\n"; + exit; +} + +#configuration file stuff +my %config; +my $tool_dir = shift @ARGV; +if(not -e "$tool_dir/annotate_hgvs.loc"){ + system("cp $current_directory/tool-data/annotate_hgvs.loc $tool_dir/annotate_hgvs.loc"); +} +open CONFIG, '<', "$tool_dir/annotate_hgvs.loc"; +while(<CONFIG>){ + next if $_ =~ /^#/; + (my $key, my $value) = split(/\s+/,$_); + $config{$key} = $value; +} +close CONFIG; +my $dbs_dir = $config{"dbs_dir"}; + +$quiet = 0; +if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){ + $quiet = 1; + shift @ARGV; +} + +@ARGV == 9 or @ARGV == 10 or die "Usage: $0 -q <SIFT dir> <POLYPHEN2 file> <GERP dir> <tissuedb file> <pathway file> <interpro domains.bed> <omim morbidmap> <input hgvs table.txt> <output hgvs table.annotated.txt> [reference.fasta]\n Where if the reference fasta is provided, we predict cryptic splicing effects (using MaxEntScan and GeneSplicer)\n"; + +my $sift_dir = $dbs_dir . shift @ARGV; +my $polyphen_file = $dbs_dir . shift @ARGV; +my $gerp_dir = $dbs_dir . shift @ARGV; +my $tissue_file = $dbs_dir . shift @ARGV; +my $pathway_file = $dbs_dir. shift @ARGV; +my $interpro_bed_file = $dbs_dir . shift @ARGV; +my $morbidmap_file = $dbs_dir . shift @ARGV; +my $hgvs_table_file = shift @ARGV; +my $outfile = shift @ARGV; +my $predict_cryptic_splicings = @ARGV ? ($dbs_dir . shift @ARGV) : undef; +my $flanking_bases = 500; # assume this is ROI around a gene +my @lines; + +sub polyphen_gerp_info($$$){ + my($gerp_dir,$polyphen_file,$info_key) = @_; + + my($chr,$pos,$ref,$variant) = split /:/, $info_key; + + # is this the first call for this chromosome? If so, retrieve the VCF lines for it en masse + if(not exists $chr2polyphen_vcf_lines{$chr}){ + ($chr2polyphen_vcf_lines{$chr}, $chr2gerp_vcf_lines{$chr}) = retrieve_vcf_lines($gerp_dir,$polyphen_file,$chr); + } + + my @results; + if(not $ref or not $variant){ # can only get data for SNPs + @results = qw(NA NA NA); + } + #print STDERR "ref = $ref and variant = $variant for $info_key\n"; + elsif(length($ref) == 1 and length($variant) == 1){ + #print STDERR "Grabbing poly/gerp for $chr,$pos,$ref,$variant\n"; + @results = polyphen_info($chr,$pos,$variant); + $results[2] = gerp_info($chr,$pos); + } + + # It may be that a novel MNP variant is a bunch of known SNPs in a row. In this case, + # report the list of variants + elsif(length($ref) == length($variant) and $ref ne "NA"){ + my (@poly_scores, @poly_calls, @gerp_scores); + for(my $i = 0; $i < length($ref); $i++){ + my $refbase = substr($ref, $i, 1); + my $newbase = substr($variant, $i, 1); + if($newbase eq $refbase){ + push @poly_scores, "-"; + push @poly_calls, "-"; + push @gerp_scores, "-"; + next; + } + my @base_results = polyphen_info($chr,$pos+$i,$refbase,$newbase); + push @poly_scores, $base_results[0]; + push @poly_calls, $base_results[1]; + $base_results[2] = gerp_info($chr,$pos+$i); + push @gerp_scores, $base_results[2]; + } + $results[0] = join(",", @poly_scores); + $results[1] = join(",", @poly_calls); + $results[2] = join(",", @gerp_scores); + } + else{ + @results = qw(NA NA NA); + } + + return @results; +} + +sub get_variant_key($$$$){ + # params chr pos ref variant + my $c = $_[0]; + $c =~ s/^chr//; + my $prop_info_key = join(":", $c, @_[1..3]); + $chr2variant_locs{$c} = [] unless exists $chr2variant_locs{$c}; + push @{$chr2variant_locs{$c}}, $_[1]; + return $prop_info_key; +} + +sub retrieve_vcf_lines($$$){ + my ($gerp_dir, $polyphen_file, $chr) = @_; + + my (%polyphen_lines, %gerp_lines); + + if(not exists $chr2variant_locs{$chr}){ + return ({}, {}); # no data requested for this chromosome + } + + # build up the request + my @tabix_regions; + my @var_locs = grep /^\d+$/, @{$chr2variant_locs{$chr}}; # grep keeps point mutations only + if(not @var_locs){ + return ({}, {}); # no point mutations + } + + # sort by variant start location + for my $var_loc (sort {$a <=> $b} @var_locs){ + push @tabix_regions, $chr.":".$var_loc."-".$var_loc; + } + my $regions = join(" ", @tabix_regions); + + # retrieve the data + die "Cannot find Polyphen2 VCF file $polyphen_file\n" if not -e $polyphen_file; + open(VCF, "tabix $polyphen_file $regions |") + or die "Cannot run tabix on $polyphen_file: $!\n"; + while(<VCF>){ + if(/^\S+\t(\d+)/ and grep {$_ eq $1} @var_locs){ # take only main columns to save room, if possible + chomp; + $polyphen_lines{$1} = [] unless exists $polyphen_lines{$1}; + push @{$polyphen_lines{$1}}, $_; + } + } + close(VCF); + + my $vcf_file = "$gerp_dir/chr$chr.tab.gz"; + if(not -e $vcf_file){ + warn "Cannot find GERP VCF file $vcf_file\n" if not $quiet; + return ({}, {}); + } + open(VCF, "tabix $vcf_file $regions |") + or die "Cannot run tabix on $vcf_file: $!\n"; + while(<VCF>){ + if(/^(\S+\t(\d+)(?:\t\S+){2})/ and grep {$_ eq $2} @var_locs){ # take only main columns to save room, if possible + $gerp_lines{$2} = [] unless exists $gerp_lines{$2}; + push @{$gerp_lines{$2}}, $1; + } + } + close(VCF); + + return (\%polyphen_lines, \%gerp_lines); +} + +sub polyphen_info($$$){ + my ($chr, $pos, $variant_base) = @_; + + my $key = "$chr:$pos:$variant_base"; + if(exists $polyphen_info{$key}){ + return @{$polyphen_info{$key}}; + } + + #print STDERR "Checking for polyphen data for $chr, $pos, $variant_base\n"; + if(exists $chr2polyphen_vcf_lines{$chr}->{$pos}){ + for(@{$chr2polyphen_vcf_lines{$chr}->{$pos}}){ + #print STDERR "Checking $chr, $pos, $variant_base against $_"; + my @fields = split /\t/, $_; + if($fields[4] eq $variant_base){ + if($fields[6] eq "D"){ + $polyphen_info{$key} = [$fields[5],"deleterious"]; + last; + } + elsif($fields[6] eq "P"){ + $polyphen_info{$key} = [$fields[5],"poss. del."]; + last; + } + elsif($fields[6] eq "B"){ + $polyphen_info{$key} = [$fields[5],"benign"]; + last; + } + else{ + $polyphen_info{$key} = [$fields[5],$fields[6]]; + last; + } + } + } + if(not exists $polyphen_info{$key}){ + $polyphen_info{$key} = ["NA", "NA"]; + } + } + else{ + $polyphen_info{$key} = ["NA", "NA"]; + } + return @{$polyphen_info{$key}}; +} + +sub gerp_info($$){ + my ($chr,$pos) = @_; + + if(exists $chr2gerp_vcf_lines{$chr}->{$pos}){ + for(@{$chr2gerp_vcf_lines{$chr}->{$pos}}){ + my @fields = split /\t/, $_; + return $fields[3]; + } + } + return "NA"; +} + +sub predict_splicing{ + my ($chr, $pos, $ref, $var, $strand, $exon_edge_distance, $splice_type) = @_; + + my $disruption_level = 0; + my @disruption_details; + # The methods being used only look for up to a 30 base context, so only + # need to check for obliterated acceptors/donors if they are nearby + $exon_edge_distance-- if $exon_edge_distance > 0; # +1 is actually in the right spot, so decrement + my $edge_offset = $strand eq "-" ? -1*$exon_edge_distance : $exon_edge_distance; # flip offset sign if the call was on the negative strand (since +/- was relative to the transcript, not the genome coordinates) + my $annotated_detected = 0; + my ($genesplicer_orig, $genesplicer_orig_pos) = call_genesplicer($splice_type, $chr, $strand, $pos-$edge_offset, $ref); + if($genesplicer_orig != 0){ + $annotated_detected = 1; + } + my ($maxentscan_orig, $maxentscan_orig_pos) = call_maxentscan($splice_type, $chr, $strand, $pos-$edge_offset, $ref); + if(not $annotated_detected and $maxentscan_orig){ + $annotated_detected = 1; + } + my ($genesplicer_mod, $genesplicer_mod_pos) = call_genesplicer($splice_type, $chr, $strand, $pos-$edge_offset, $ref, $var, $exon_edge_distance); + if($genesplicer_mod != $UNAFFECTED){ # was the mod within the range where this software could detect up or downstream effects on the annotated splice site? + if($genesplicer_orig_pos != $genesplicer_mod_pos){ + $disruption_level+=0.75; + if($genesplicer_mod_pos == -1){ + push @disruption_details, "Variant obliterates GeneSplicer predicted splice site from the reference sequence ($chr:$genesplicer_orig_pos, orig score $genesplicer_orig)"; + } + elsif($genesplicer_orig_pos != -1){ + push @disruption_details, "Variant moves GeneSplicer preferred splice site from annotated location $chr:$genesplicer_orig_pos to $chr:$genesplicer_mod_pos (scores: orig=$genesplicer_orig, new=$genesplicer_mod)"; + } + } + elsif($genesplicer_orig-$genesplicer_mod > 1){ + $disruption_level+=0.5; + push @disruption_details, "Variant significantly lowers GeneSplicer score for original $splice_type splice site located at $chr:$genesplicer_mod_pos ($genesplicer_mod vs. $genesplicer_orig)"; + } + elsif($genesplicer_orig > $genesplicer_mod){ + $disruption_level+=0.25; + push @disruption_details, "Variant slightly lowers GeneSplicer score for original $splice_type splice site located at $chr:$genesplicer_mod_pos ($genesplicer_mod vs. $genesplicer_orig)"; + } + $genesplicer_orig = $genesplicer_mod; # so that splice site comparison vs. cryptic is accurate + } + my ($maxentscan_mod, $maxentscan_mod_pos) = call_maxentscan($splice_type, $chr, $strand, $pos-$edge_offset, $ref, $var, $exon_edge_distance); + if($maxentscan_mod != $UNAFFECTED){ + if($maxentscan_mod_pos != $maxentscan_orig_pos){ + $disruption_level+=1; + if($maxentscan_mod_pos == -1){ + push @disruption_details, "Variant obliterates MaxEntScan predicted splice site from the reference sequence ($chr:$maxentscan_orig_pos, orig score $maxentscan_orig)"; + } + elsif($maxentscan_orig_pos != -1){ + push @disruption_details, "Variant moves MaxEntScan preferred splice site from annotated location $chr:$maxentscan_orig_pos to $chr:$maxentscan_mod_pos (scores: orig=$maxentscan_orig, new=$maxentscan_mod)"; + } + } + elsif($maxentscan_orig-$maxentscan_mod > 1){ + $disruption_level+=0.5; + push @disruption_details, "Variant significantly lowers MaxEntScan score for original $splice_type splice site located at $chr:$maxentscan_mod_pos ($maxentscan_mod vs. $maxentscan_orig)"; + } + elsif($maxentscan_orig>$maxentscan_mod){ + $disruption_level+=0.25; + push @disruption_details, "Variant slightly lowers MaxEntScan score for original $splice_type splice site located at $chr:$maxentscan_mod_pos ($maxentscan_mod vs. $maxentscan_orig)"; + } + $maxentscan_orig = $maxentscan_mod; + } + + # Regardless of location, see if there is an increased chance of creating a cryptic splicing site + my ($genesplicer_control, $genesplicer_control_pos) = call_genesplicer($splice_type, $chr, $strand, $pos, $ref); + my ($genesplicer_cryptic, $genesplicer_cryptic_pos) = call_genesplicer($splice_type, $chr, $strand, $pos, $ref, $var); + if(defined $genesplicer_orig and $genesplicer_control < $genesplicer_orig and $genesplicer_orig < $genesplicer_cryptic){ + $disruption_level+=0.75; + push @disruption_details, "Variant causes GeneSplicer score for a potential cryptic $splice_type splice site at $chr:$genesplicer_cryptic_pos to become higher than the annotated splice site $chr:$genesplicer_orig_pos (orig=$genesplicer_orig vs. var=$genesplicer_cryptic)"; + } + if($genesplicer_control - $genesplicer_cryptic < -1){ + $disruption_level+=0.5; + push @disruption_details, "Variant raises GeneSplicer score for a potential cryptic $splice_type splice site (orig=$genesplicer_control vs. var=$genesplicer_cryptic)"; + } + my ($maxentscan_control, $maxentscan_control_pos) = call_maxentscan($splice_type, $chr, $strand, $pos, $ref); + my ($maxentscan_cryptic, $maxentscan_cryptic_pos) = call_maxentscan($splice_type, $chr, $strand, $pos, $ref, $var); + if(defined $maxentscan_orig and $maxentscan_control < $maxentscan_orig and $maxentscan_orig < $maxentscan_cryptic){ + $disruption_level++; + push @disruption_details, "Variant causes MaxEntScan score for a potential cryptic $splice_type splice site at $chr:$maxentscan_cryptic_pos to become higher than the annotated splice site $chr:$maxentscan_orig_pos (orig=$maxentscan_orig vs. var=$maxentscan_cryptic)"; + } + if($maxentscan_control - $maxentscan_cryptic < -1 and $maxentscan_cryptic > 0){ + $disruption_level+=0.5; + push @disruption_details, "Variant raises MaxEntScan score for a potential cryptic $splice_type splice site (orig=$maxentscan_control vs. var=$maxentscan_cryptic)"; + } + + # If neither method predicted the annotated donor site, and the alternate site isn't predicted either, note that so the user doesn't think the mod is definite okay + if(not @disruption_details and $maxentscan_orig < 0 and $genesplicer_orig < 0){ + $disruption_level+=0.5; + push @disruption_details, "Neither the annotated splice site nor any potential cryptic site introduced by the variant are predicted by either GeneSplicer or MaxEntScan, therefore the alt splicing potential for this variant should be evaluated manually"; + } + + return ($disruption_level, join("; ", @disruption_details)); +} + +sub call_genesplicer{ + my($splice_type, $chr, $strand, $pos, $ref, $var, $var_offset) = @_; + + if($splice_type eq "acceptor"){ + $pos += $strand eq "-" ? 2: -2; + } + my $num_flanking = 110; # seems to croak if less than 160bp provided + my $indel_size = defined $var ? length($var) - length($ref) : 0; # - is del, + is ins + my $site_window = defined $var_offset ? 2 : ($indel_size > 0 ? 29+$indel_size : 29); # exact position if looking for exisitng site effect, otherwise 10 or 10 + the insert size + my $cmd = "genesplicer"; + my $start = $pos - $num_flanking; + $start = 1 if $start < 1; + my $stop = $pos + $num_flanking + ($indel_size < 0 ? -1*$indel_size:0); # add extra bases if a del so we still have 2*$num_flanking after the edit + $stop++; # end coordinate is exclusive + return ($UNAFFECTED, -1) if defined $var_offset and ($pos+$var_offset < $start or $pos+$var_offset > $stop); # variant is outside of range we can detect affect on original splice site + my $seq = $fasta_index->fetch("$chr:$start-$stop"); + if(defined $var){ + if(defined $var_offset){ # substitution away from the site of interest + substr($seq, $num_flanking+$var_offset, length($ref)) = $var; + } + else{ # substitution at the site of interest + substr($seq, $num_flanking, length($ref)) = $var; + } + } + if($strand eq "-"){ + $seq = reverse($seq); + $seq =~ tr/ACGTacgt/TGCAtgca/; + } + $tmpfile = "$$.fasta"; + open(TMP, ">$tmpfile") or die "Could not open $tmpfile for writing: $!\n"; + print TMP ">$chr $strand $pos $ref",($var?" $var":""), ($var_offset?" $var_offset":""),"\n$seq\n"; + substr($seq, $num_flanking, 2) = lc(substr($seq, $num_flanking, 2)); + #print STDERR "$chr $strand $pos $ref $var $var_offset: $seq\n"; + close(TMP); + #print STDERR "$cmd $tmpfile /export/achri_data/programs/GeneSplicer/human\n"; + #<STDIN>; + open(GS, "$cmd $tmpfile human 2> /dev/null|") + or die "Could not run $cmd: $!\n"; + my $highest_score = 0; + my $highest_position = -1; + while(<GS>){ + # End5 End3 Score "confidence" splice_site_type + # E.g. : + # 202 203 2.994010 Medium donor + chomp; + my @F = split /\s+/, $_; + next if $F[1] < $F[0]; # wrong strand + if($splice_type eq $F[4]){ + if(abs($F[0]-$num_flanking-1) <= $site_window){ # right type and approximate location + #print STDERR "*$_\n"; + if($F[2] > $highest_score){ + $highest_score = $F[2]; + # last bit accounts for indels in the position offset + $highest_position = $pos + ($strand eq "-" ? -1: 1)*(-$num_flanking + $F[0] - 1) + (($F[0] <= $num_flanking && $strand eq "+" or $F[0] >= $num_flanking && $strand eq "-") ? -$indel_size : 0); + } + } + } + #else{print STDERR $_,"\n";} + } + close(GS); + return ($highest_score, $highest_position); +} + +sub call_maxentscan{ + my($splice_type, $chr, $strand, $pos, $ref, $var, $var_offset) = @_; + + my $indel_size = defined $var ? length($var) - length($ref) : 0; # - is del, + is ins + my $cmd; + my ($num_flanking5, $num_flanking3); + my ($region5, $region3); + my ($start, $stop); + my $highest_score = -9999; + my $highest_position = -1; + # note that donor and acceptor are very different beasts for MaxEntScan + if($splice_type eq "acceptor"){ + if($strand eq "-"){ + $pos += 2; + } + else{ + $pos -= 2; + } + $cmd = "score3.pl"; + $num_flanking5 = 18; + $num_flanking3 = 4; + } + else{ # donor + $cmd = "score5.pl"; + $num_flanking5 = 3; + $num_flanking3 = 5; + } + # when var_offset is defined, we are looking only at the original splice site so no need to look in the general region + $region5 = defined $var_offset ? 0 : $num_flanking5; # when var_offset is defined, we are looking only at the original splice site so no need to look in the general region + $region3 = defined $var_offset ? 0 : $num_flanking3; + if($strand eq "-"){ + $start = $pos - $num_flanking3 - $region3; + $stop = $pos + $num_flanking5 + $region5 + ($indel_size < 0 ? -1*$indel_size:0); + #print STDERR "Fetching $chr:$start-$stop ($pos - $num_flanking3 - $region3)\n"; + } + else{ + $start = $pos - $num_flanking5 - $region5; + $stop = $pos + $num_flanking3 + $region3 + ($indel_size < 0 ? -1*$indel_size:0); # add extra bases if a del so we still have 23 or 9 bases as the case may be + #print STDERR "Fetching $chr:$start-$stop ($pos - $num_flanking5 - $region5)\n"; + } + if(defined $var_offset and ( + $strand eq "+" and ($pos+$var_offset < $start or $pos+$var_offset > $stop) or + $strand eq "-" and ($pos-$var_offset < $start or $pos-$var_offset > $stop))){ # variant is outside of range we can detect affect on original splice site + return ($UNAFFECTED, -1) + } + + my $seq = $fasta_index->fetch("$chr:$start-$stop"); + my @subseqs; + if(defined $var){ + if(defined $var_offset){ # substitution away from the site of interest + #print STDERR "Subbing in string of length ", length($seq), "near splice site $ref -> $var at ", ($strand eq "-" ? "$region3+$num_flanking3-$var_offset":"$region5+$num_flanking5+$var_offset"), "\n"; + substr($seq, ($strand eq "-" ? $region3+$num_flanking3-$var_offset:$region5+$num_flanking5+$var_offset), length($ref)) = $var; + } + else{ # substitution at the site of interest + substr($seq, ($strand eq "-" ? $region3+$num_flanking3:$region5+$num_flanking5), length($ref)) = $var; + } + } + if($strand eq "-"){ + $seq = reverse($seq); + $seq =~ tr/ACGTacgt/TGCAtgca/; + } + # get all of the 23 or 9 base pair subsequences, depending on whether it's donors or acceptors we are looking for + for(my $i = 0; $i+$num_flanking5+$num_flanking3+1 <= length($seq); $i++){ + push @subseqs, substr($seq, $i, $num_flanking5+$num_flanking3+1); + } + my $pid = open2(\*CHILD_OUT, \*CHILD_IN, "$cmd -"); + $pid or die "Could not run $cmd: $!\n"; + print CHILD_IN join("\n",@subseqs); + close(CHILD_IN); + while(<CHILD_OUT>){ + chomp; + # out is just "seq[tab]score" + my @F = split /\s+/, $_; + if($F[1] > $highest_score){ + #print STDERR ">"; + $highest_score = $F[1]; + # since we printed the subseqs in order, $. (containing the line number of the prediction output) gives us the offset in the original seq + $highest_position = $start + ($strand eq "-" ? length($seq) - $num_flanking5 - $. : $num_flanking5 + $. -1)+(($. <= $num_flanking5+$region5 && $strand eq "+" or $. >= $num_flanking5+$region5 && $strand eq "-") ? -$indel_size : 0); + } + #print STDERR $_,"\n"; + } + close(CHILD_OUT); + waitpid($pid, 0); + + return ($highest_score, $highest_position); +} + +sub get_range_info($$$$){ + my ($chr2ranges, $chr, $start, $stop) = @_; + + if(not exists $chr2ranges->{$chr}){ + return {}; + } + my $range_key = "$chr:$start:$stop"; + + # Use a binary search to find the leftmost range of interest + my $left_index = 0; + my $right_index = $#{$chr2ranges->{$chr}}; + my $cursor = int($right_index/2); + while($left_index != $right_index and $left_index != $cursor and $right_index != $cursor){ + # need to go left + if($chr2ranges->{$chr}->[$cursor]->[0] >= $start){ + $right_index = $cursor; + $cursor = int(($left_index+$cursor)/2); + } + # need to go to the right + elsif($chr2ranges->{$chr}->[$cursor]->[1] <= $stop){ + $left_index = $cursor; + $cursor = int(($right_index+$cursor)/2); + } + else{ + $right_index = $cursor; + $cursor--; # in the range, go left until not in the range any more + } + } + + my %names; + for (; $cursor <= $#{$chr2ranges->{$chr}}; $cursor++){ + my $range_ref = $chr2ranges->{$chr}->[$cursor]; + if($range_ref->[0] > $stop){ + last; + } + elsif($range_ref->[0] <= $stop and $range_ref->[1] >= $start){ + $names{$range_ref->[2]}++; + } + } + + #print STDERR "Names for range $chr:$start-$stop are ", join("/", keys %names), "\n"; + return \%names; +} + +if(not -d $sift_dir){ + die "The specified SIFT directory $sift_dir does not exist\n"; +} + +print STDERR "Reading in OMIM morbid map...\n" unless $quiet; +die "Data file $morbidmap_file does not exist, aborting.\n" if not -e $morbidmap_file; +my %gene2morbids; +open(TAB, $morbidmap_file) + or die "Cannot open OMIM morbid map file $morbidmap_file for reading: $!\n"; +while(<TAB>){ + my @fields = split /\|/, $_; + #print STDERR "got ", ($#fields+1), " fields in $_\n"; + #my @fields = split /\|/, $_; + my $morbid = $fields[0]; + $morbid =~ s/\s*\(\d+\)$//; # get rid of trailing parenthetical number + my $omim_id = $fields[2]; + for my $genename (split /, /, $fields[1]){ + if(not exists $gene2morbids{$genename}){ + $gene2morbids{$genename} = "OMIM:$omim_id $morbid"; + } + else{ + $gene2morbids{$genename} .= "; $morbid"; + } + } +} +close(TAB); + +print STDERR "Reading in interpro mappings...\n" unless $quiet; +die "Data file $interpro_bed_file does not exist, aborting.\n" if not -e $interpro_bed_file; +my %interpro_ids; +open(TAB, $interpro_bed_file) + or die "Cannot open gene name BED file $interpro_bed_file for reading: $!\n"; +while(<TAB>){ + chomp; + # format should be "chr start stop interpro desc..." + my @fields = split /\t/, $_; + my $c = $fields[0]; + if(not exists $interpro_ids{$c}){ + $interpro_ids{$c} = [] unless exists $interpro_ids{$c}; + $interpro_ids{"chr".$c} = [] unless $c =~ /^chr/; + $interpro_ids{$1} = [] if $c =~ /^chr(\S+)/; + } + my $dataref = [@fields[1..3]]; + push @{$interpro_ids{$c}}, $dataref; + push @{$interpro_ids{"chr".$c}}, $dataref unless $c =~ /^chr/; + push @{$interpro_ids{$1}}, $dataref if $c =~ /^chr(\S+)/; +} +close(TAB); +print STDERR "Sorting interpro matches per contig...\n" unless $quiet; +for my $chr (keys %interpro_ids){ + $interpro_ids{$chr} = [sort {$a->[0] <=> $b->[0]} @{$interpro_ids{$chr}}]; +} + +# {uc(gene_name) => space separated info} +print STDERR "Reading in tissue info...\n" unless $quiet; +my %tissues; +my %tissue_desc; +open(TISSUE, $tissue_file) + or die "Cannot open $tissue_file for reading: $!\n"; +while(<TISSUE>){ + chomp; + my @fields = split /\t/, $_; + my $gname = uc($fields[1]); + $tissue_desc{$gname} = $fields[2]; + next unless $#fields == 4; + my @tis = split /;\s*/, $fields[4]; + pop @tis; + if(@tis < 6){ + $tissues{$gname} = join(">", @tis); + } + elsif(@tis < 24){ + $tissues{$gname} = join(">", @tis[0..5]).">..."; + } + else{ + $tissues{$gname} = join(">", @tis[0..(int($#tis/5))]).">..."; + } +} +close(TISSUE); + +print STDERR "Loading pathway info...\n" unless $quiet; +my %pathways; +my %pathway_ids; +open(PATHWAY, $pathway_file) + or die "Cannot open $pathway_file for reading: $!\n"; +while(<PATHWAY>){ + chomp; + my @fields = split /\t/, $_; + next unless @fields >= 4; + for my $gname (split /\s+/, $fields[1]){ + $gname = uc($gname); + if(not exists $pathways{$gname}){ + $pathway_ids{$gname} = $fields[2]; + $pathways{$gname} = $fields[3]; + } + else{ + $pathway_ids{$gname} .= "; ".$fields[2]; + $pathways{$gname} .= "; ".$fields[3]; + } + } +} +close(PATHWAY); + +print STDERR "Loading SIFT indices...\n" unless $quiet; +# Read in the database table list +my %chr_tables; +open(DBLIST, "$sift_dir/bins.list") + or die "Cannot open $sift_dir/bins.list for reading: $!\n"; +while(<DBLIST>){ + chomp; + my @fields = split /\t/, $_; + if(not exists $chr_tables{$fields[1]}){ + $chr_tables{$fields[1]} = {}; + } + my $chr = $fields[1]; + $chr = "chr$chr" unless $chr =~ /^chr/; + $chr_tables{$chr}->{"$chr\_$fields[2]_$fields[3]"} = [$fields[2],$fields[3]]; +} +close(DBLIST); + +#Connect to database +my $db_gene = DBI->connect("dbi:SQLite:dbname=$sift_dir/Human_Supp.sqlite","","",{RaiseError=>1, AutoCommit=>1}); + +my $db_chr; +my $fr_stmt; # retrieval for frameshift +my $snp_stmt; # retrieval for SNP +my $cur_chr = ""; +my $cur_max_pos; + +if(defined $predict_cryptic_splicings){ + if(not -e $predict_cryptic_splicings){ + die "Reference FastA file for splicing predictions, $predict_cryptic_splicings, does not exist.\n"; + } + if(not -e $predict_cryptic_splicings.".fai" and not -w dirname($predict_cryptic_splicings)){ + die "Reference FastA file for splicing predictions, $predict_cryptic_splicings, is not indexed, and the directory is not writable.\n"; + } + print STDERR "Loading reference FastA index...\n" unless $quiet; + $fasta_index = Bio::DB::Sam::Fai->load($predict_cryptic_splicings); +} + +print STDERR "Annotating variants...\n" unless $quiet; +# Assume contigs and positions in order +print STDERR "Processing file $hgvs_table_file...\n" unless $quiet; +open(HGVS, $hgvs_table_file) + or die "Cannot open HGVS file $hgvs_table_file for reading: $!\n"; +# Output file for annotated snps and frameshifts +my $header = <HGVS>; # header +chomp $header; +my @headers = split /\t/, $header; +my ($chr_column, $pos_column, $ref_column, $alt_column, $alt_cnt_column, $tot_cnt_column, $ftype_column, $dbid_column, $maf_src_column, $maf_column, $internal_freq_column, + $cdna_hgvs_column, $aa_hgvs_column, $transcript_column, $transcript_length_column, $strand_column, $zygosity_column, $splice_dist_column, $caveats_column, $phase_column, + $srcs_column, $pvalue_column, $to_column, $genename_column, $rares_column, $rares_header); +my %req_columns = ( + "Chr" => \$chr_column, + "DNA From" => \$pos_column, + "DNA To" => \$to_column, + "Gene Name" => \$genename_column, + "Ref base" => \$ref_column, + "Obs base" => \$alt_column, + "Variant Reads" => \$alt_cnt_column, + "Total Reads" => \$tot_cnt_column, + "Feature type" => \$ftype_column, + "Variant DB ID" => \$dbid_column, + "Pop. freq. source" => \$maf_src_column, + "Pop. freq." => \$maf_column, + "Transcript HGVS" => \$cdna_hgvs_column, + "Protein HGVS" => \$aa_hgvs_column, + "Selected transcript" => \$transcript_column, + "Transcript length" => \$transcript_length_column, + "Strand" => \$strand_column, + "Zygosity" => \$zygosity_column, + "Closest exon junction (AA coding variants)" => \$splice_dist_column, + "Caveats" => \$caveats_column, + "Phase" => \$phase_column, + "P-value" => \$pvalue_column); +&load_header_columns(\%req_columns, \@headers); +for(my $i = 0; $i <= $#headers; $i++){ + # Optional columns go here + if($headers[$i] eq "Sources"){ + $srcs_column = $i; + } + elsif($headers[$i] eq "Internal pop. freq."){ + $internal_freq_column = $i; + } + elsif($headers[$i] =~ /^Num rare variants/){ + $rares_column = $i; + $rares_header = $headers[$i]; + } +} +open(OUT, ">$outfile") + or die "Cannot open $outfile for writing: $!\n"; +print OUT join("\t", "Chr", "DNA From", "DNA To", "Ref base", "Obs base", "% ref read","% variant read", + "Variant Reads", "Total Reads", "Feature type","Variant type", "Variant context", "Variant DB ID", "Pop. freq. source", + "Pop. freq."), (defined $internal_freq_column ? "\tInternal pop. freq.\t" : "\t"), join("\t", "SIFT Score", "Polyphen2 Score", "Polyphen2 call", "GERP score", + "Gene Name", "Transcript HGVS", + "Protein HGVS", "Zygosity","Closest exon junction (AA coding variants)","Splicing alteration potential","Splicing alteration details", + "OMIM Morbid Map","Tissue expression", "Pathways", "Pathway refs", + "Gene Function", "Spanning InterPro Domains", "P-value", "Caveats", "Phase", + "Selected transcript", "Transcript length", "Strand"), + (defined $rares_column ? "\t$rares_header" : ""), + (defined $srcs_column ? "\tSources" : ""), "\n"; +while(<HGVS>){ + chomp; + my @fields = split /\t/, $_, -1; + my $mode; + my $protein_hgvs = $fields[$aa_hgvs_column]; + my $refSeq = $fields[$ref_column]; + my $newBase = $fields[$alt_column]; + my $splicing_potential = "NA"; + my $splicing_details = "NA"; + if(defined $predict_cryptic_splicings){ + my $distance = $fields[$splice_dist_column]; # only for coding variants + my $site_type; + if($distance eq "NA"){ + } + elsif($distance){ + $site_type = $distance < 0 ? "donor" : "acceptor"; # exon context + } + else{ + if($fields[$cdna_hgvs_column] =~ /\d+([\-+]\d+)/){ # get from the cDNA HGVS otherwise + $distance = $1; + $distance =~ s/^\+//; + $site_type = $distance > 0 ? "donor" : "acceptor"; # intron context + } + } + # only do splicing predictions for novel or rare variants, since these predictions are expensive + if($distance and $distance ne "NA" and ($fields[$maf_column] eq "NA" or $fields[$maf_column] <= 0.01)){ + #print STDERR "Running prediction for rare variant ($fields[3], MAF=$fields[14]): $fields[5], $fields[6], $refSeq, $newBase, $fields[4], $distance, $site_type\n"; + ($splicing_potential, $splicing_details) = &predict_splicing($fields[$chr_column], $fields[$pos_column], $refSeq, $newBase, $fields[$strand_column], $distance, $site_type); + } + } + my $lookupBase; + if($fields[$strand_column] eq "-"){ + if($refSeq ne "NA"){ + $refSeq = reverse($refSeq); + $refSeq =~ tr/ACGTacgt/TGCAtgca/; + } + if($newBase ne "NA"){ + $newBase = reverse($newBase); + $newBase =~ tr/ACGTacgt/TGCAtgca/; + $newBase =~ s(TN/)(NA/); + $newBase =~ s(/TN)(/NA); + } + } + if($fields[$cdna_hgvs_column] =~ /^g\..*?(?:ins|delins|>)([ACGTN]+)$/){ + $mode = "snps"; + } + elsif($fields[$cdna_hgvs_column] =~ /^g\..*?del([ACGTN]+)$/){ + $mode = "snps"; + } + elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+_\d+(?:[\-+]\d+)?ins([ACGTN]+)$/ or $fields[$cdna_hgvs_column] =~ /^c\.\d+(?:[\-+]\d+)?_\d+ins([ACGTN]+)$/){ + if(not length($1)%3){ + $mode = "snps"; + } + else{ + $mode = "frameshifts"; + } + } + elsif($fields[$cdna_hgvs_column] =~ /^c\.(\d+)(?:[\-+]\d+)?_(\d+)(?:[\-+]\d+)?delins([ACGTN]+)$/){ + if(not (length($3)-$2+$1-1)%3){ + $mode = "snps"; + } + else{ + $mode = "frameshifts"; + } + } + elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+[+\-]\d+_\d+[+\-]\d+delins([ACGTN]+)$/ or + $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?_[\-*]\d+(?:[\-+]\d+)?del([ACGTN]*)$/ or + $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?del([ACGTN]*)$/ or + $fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+del([ACGTN]*)$/ or + $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?_[\-*]\d+(?:[\-+]\d+)?(?:ins|delins)?([ACGTN]+)$/){ + $mode = "snps"; + } + elsif($fields[$cdna_hgvs_column] =~ /^c\.([-*]?\d+)del(\S+)/){ + my $dist = $1; + $dist =~ s/^\*//; + if(abs($dist) < 4){ + $mode = "snps"; + } + else{ + $mode = "frameshifts"; + } + } + elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+_\d+[\-+]\d+ins(\S*)/ or + $fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+_\d+[\-+]\d+del(\S*)/){ + $mode = "snps"; + } + elsif($fields[$cdna_hgvs_column] =~ /^c\.(-?\d+)_(\d+)(?:\+\d+)?del(\S+)/ or + $fields[$cdna_hgvs_column] =~ /^c\.(\d+)(?:\-\d+)?_(\d+)del(\S+)/){ + if(not ($2-$1+1)%3){ + $mode = "snps"; + } + else{ + $mode = "frameshifts"; + } + } + elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+_\d+ins([ACGTN]+)$/){ + if(not length($1)%3){ + $mode = "snps"; + } + else{ + $mode = "frameshifts"; + } + } + # special case of shifted start codon + elsif($fields[$cdna_hgvs_column] =~ /^c\.-1_1+ins([ACGTN]+)$/ or + $fields[$cdna_hgvs_column] =~ /inv$/){ + $mode = "snps"; + } + elsif($fields[$cdna_hgvs_column] =~ /[ACGTN]>([ACGTN])$/){ + # todo: handle multiple consecutive substitutions, will have to run polyphen separately + $lookupBase = $newBase; + if($fields[$strand_column] eq "-"){ # revert to positive strand variant for SIFT index + $lookupBase =~ tr/ACGTacgt/TGCAtgca/; + } + $mode = "snps"; + } + else{ + $mode = "snps"; + } + if($fields[$chr_column] ne $cur_chr){ + #Prepared DB connection on per-chromosome basis + $cur_chr = $fields[$chr_column]; + $cur_chr =~ s/^chr//; + $db_chr = DBI->connect("dbi:SQLite:dbname=$sift_dir/Human_CHR$cur_chr.sqlite", + "", + "", + { RaiseError => 1, AutoCommit => 1 }) unless not -e "$sift_dir/Human_CHR$cur_chr.sqlite"; + + $cur_max_pos = -1; + } + if($fields[$pos_column] =~ /^\d+$/ and $cur_max_pos < $fields[$pos_column]){ + undef $fr_stmt; + undef $snp_stmt; + my $c = $fields[$chr_column]; + $c = "chr$c" if $c !~ /^chr/; + for my $chr_table_key (keys %{$chr_tables{$c}}){ + my $chr_table_range = $chr_tables{$c}->{$chr_table_key}; + if($chr_table_range->[0] <= $fields[$pos_column] and $chr_table_range->[1] >= $fields[$pos_column]){ + $fr_stmt = $db_chr->prepare("select SCORE". + " from $chr_table_key where COORD1 = ?"); + $snp_stmt = $db_chr->prepare("select SCORE". + " from $chr_table_key where COORD1 = ? AND NT2 = ?"); + $cur_max_pos = $chr_table_range->[1]; + last; + } + } + if(not defined $fr_stmt and not $quiet){ + warn "Got request for position not in range of SIFT ($c:$fields[$pos_column])\n"; + } + } + + my @cols; + if(not $fr_stmt){ + @cols = ("NA"); + } + elsif($mode eq "frameshifts"){ + $fr_stmt->execute($fields[$pos_column]); + @cols = $fr_stmt->fetchrow_array(); + } + else{ + $snp_stmt->execute($fields[$pos_column]-1, $lookupBase); + @cols = $snp_stmt->fetchrow_array(); + } + # See if the change is in a CDS, and if it's non-synonymous + my ($type, $location) = ("silent", "intron"); + my $start = $fields[$pos_column]; + my $stop = $fields[$to_column]; + my $interpro = get_range_info(\%interpro_ids, $fields[$chr_column], $start, $stop); + my $domains = join("; ", sort keys %$interpro); + + my $sift_score = "NA"; + my $variant_key = "NA"; + my $transcript_hgvs = $fields[$cdna_hgvs_column]; + if($transcript_hgvs =~ /\.\d+[ACGTN]>/ or $transcript_hgvs =~ /\.\d+(?:_|del|ins)/ or $transcript_hgvs =~ /[+\-]\?/){ + $location = "exon"; + } + elsif($transcript_hgvs =~ /\.\d+[\-+](\d+)/){ + if($1 < 21){ + $type = "splice"; + } + else{ + $type = "intronic"; + } + $location = "splice site"; + } + elsif($transcript_hgvs =~ /\.\*\d+/){ + $location = "3'UTR"; + } + elsif($transcript_hgvs =~ /\.-\d+/){ + $location = "5'UTR"; + } + if($mode eq "frameshifts"){ + $type = "frameshift"; + } + else{ + if(not defined $protein_hgvs){ + $type = "unknown"; + } + elsif($protein_hgvs eq "NA"){ + $type = "non-coding"; + } + elsif($protein_hgvs =~ /=$/){ + $type = "silent"; + } + elsif($protein_hgvs =~ /\d+\*/){ # nonsense + $type = "nonsense"; + } + elsif($protein_hgvs =~ /ext/){ # nonsense + $type = "nonstop"; + } + elsif($protein_hgvs eq "p.0?" or $protein_hgvs eq "p.?"){ + $type = "unknown"; + } + else{ + $type = "missense"; + } + + $sift_score = $cols[0] || "NA"; + + # Record the need to fetch VCF polyphen and gerp data later (en masse, for efficiency) + # Use newBase instead of lookupBase for PolyPhen since the orientation is always relative to the transcript + $variant_key = get_variant_key($fields[$chr_column], $fields[$pos_column], $fields[$ref_column], $newBase); + #print STDERR "Variant key is $variant_key\n"; + } + my $transcript_type = $fields[$ftype_column]; + my $transcript_length = $fields[$transcript_length_column]; + my $transcript_id = $fields[$transcript_column]; # sift score is derived from this transcript + my $transcript_strand = $fields[$strand_column]; + my $zygosity = $fields[$zygosity_column]; + my $rsID = $fields[$dbid_column]; + my $popFreq = $fields[$maf_column]; + my $internalFreq; + $internalFreq = $fields[$internal_freq_column] if defined $internal_freq_column; + my $popFreqSrc = $fields[$maf_src_column]; + + # Get the gene name and omim info + my %seen; + my @gene_names = split /; /, $fields[$genename_column]; + my $morbid = join("; ", grep {/\S/ and not $seen{$_}++} map({$gene2morbids{$_} or ""} @gene_names)); + my $tissue = join("; ", grep {/\S/ and not $seen{$_}++} map({$tissues{$_} or ""} @gene_names)); + my $function = join("; ", grep {/\S/ and not $seen{$_}++} map({$tissue_desc{$_} or ""} @gene_names)); + my $pathways = join("; ", grep {/\S/ and not $seen{$_}++} map({$pathways{$_} or ""} @gene_names)); + my $pathway_ids = join("; ", grep {/\S/ and not $seen{$_}++} map({$pathway_ids{$_} or ""} @gene_names)); + + # It may be that the calls and freqs are multiple concatenated values, e.g. "X; Y" + my @tot_bases = split /; /,$fields[$tot_cnt_column]; + my @var_bases = split /; /,$fields[$alt_cnt_column]; + my (@pct_nonvar, @pct_var); + for (my $i = 0; $i <= $#tot_bases; $i++){ + push @pct_nonvar, $tot_bases[$i] =~ /^(NA|0)$/ ? $tot_bases[$i] : int(($tot_bases[$i]-$var_bases[$i])/$tot_bases[$i]*100+0.5); + push @pct_var, $tot_bases[$i] =~ /^(NA|0)$/ ? $tot_bases[$i] : int($var_bases[$i]/$tot_bases[$i]*100+0.5); + } + + push @lines, [ + $fields[$chr_column], # chr + $fields[$pos_column], # pos + $fields[$to_column], + $fields[$ref_column], + $fields[$alt_column], + join("; ", @pct_nonvar), # pct ref + join("; ", @pct_var), # pct var + $fields[$alt_cnt_column], # num var + $fields[$tot_cnt_column], # num reads + $transcript_type, # protein_coding, pseudogene, etc. + $type, + $location, + $rsID, + $popFreqSrc, + (defined $internal_freq_column ? ($popFreq,$internalFreq) : ($popFreq)), + $sift_score, + $variant_key, # 16th field + join("; ", @gene_names), + $transcript_hgvs, + $protein_hgvs, + $zygosity, + $fields[$splice_dist_column], # distance to exon edge + $splicing_potential, + $splicing_details, + $morbid, + $tissue, + $pathways, + $pathway_ids, + $function, + $domains, + $fields[$pvalue_column], + $fields[$caveats_column], # caveats + (defined $rares_column ? ($fields[$rares_column],$transcript_id) : ($transcript_id)), + $transcript_length, + $transcript_strand, + (defined $srcs_column ? ($fields[$phase_column],$fields[$srcs_column]) : ($fields[$phase_column]))]; +} + +print STDERR "Adding Polyphen2 and GERP info...\n" unless $quiet; +# retrieve the polyphen and gerp info en masse for each chromosome, as this is much more efficient +my $vkey_col = 16+(defined $internal_freq_column ? 1 : 0); +for my $line_fields (@lines){ + # expand the key into the relevant MAF values + $line_fields->[$vkey_col] = join("\t", polyphen_gerp_info($gerp_dir,$polyphen_file,$line_fields->[$vkey_col])); # fields[$vkey_col] is variant key + print OUT join("\t", @{$line_fields}), "\n"; +} +close(OUT); +unlink $tmpfile if defined $tmpfile; + +sub load_header_columns{ + my ($reqs_hash_ref, $headers_array_ref) = @_; + my %unfulfilled; + for my $field_name (keys %$reqs_hash_ref){ + $unfulfilled{$field_name} = 1; + } + for(my $i = 0; $i <= $#{$headers_array_ref}; $i++){ + for my $req_header_name (keys %$reqs_hash_ref){ + if($req_header_name eq $headers_array_ref->[$i]){ + ${$reqs_hash_ref->{$req_header_name}} = $i; + delete $unfulfilled{$req_header_name}; + last; + } + } + } + if(keys %unfulfilled){ + die "Aborting. Could not find headers in the input file for the following required fields: ", join(", ", sort keys %unfulfilled), "\n"; + } +}