| 0 | 1 #!/usr/bin/env perl | 
|  | 2 | 
|  | 3 use Bio::DB::Sam; | 
|  | 4 use DBI; | 
|  | 5 use IPC::Open2; | 
|  | 6 use Set::IntervalTree; | 
|  | 7 use strict; | 
|  | 8 use warnings; | 
|  | 9 use vars qw($quiet %chr2variant_locs %polyphen_info %chr2polyphen_vcf_lines %chr2gerp_vcf_lines %range2genes $fasta_index $UNAFFECTED $tmpfile); | 
|  | 10 use File::Basename; | 
|  | 11 $UNAFFECTED = -299999; # sentinel for splice site predictions | 
|  | 12 my $current_directory = dirname(__FILE__); | 
|  | 13 if(@ARGV == 1 and $ARGV[0] eq "-v"){ | 
|  | 14   print "Version 1.0\n"; | 
|  | 15   exit; | 
|  | 16 } | 
|  | 17 | 
|  | 18 #configuration file stuff | 
|  | 19 my %config; | 
|  | 20 my $tool_dir = shift @ARGV; | 
|  | 21 if(not -e "$tool_dir/annotate_hgvs.loc"){ | 
|  | 22   system("cp $current_directory/tool-data/annotate_hgvs.loc $tool_dir/annotate_hgvs.loc"); | 
|  | 23 } | 
|  | 24 open CONFIG, '<', "$tool_dir/annotate_hgvs.loc"; | 
|  | 25 while(<CONFIG>){ | 
|  | 26   next if $_ =~ /^#/; | 
|  | 27   (my $key, my $value) = split(/\s+/,$_); | 
|  | 28   $config{$key} = $value; | 
|  | 29 } | 
|  | 30 close CONFIG; | 
|  | 31 my $dbs_dir = $config{"dbs_dir"}; | 
|  | 32 | 
|  | 33 $quiet = 0; | 
|  | 34 if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){ | 
|  | 35   $quiet = 1; | 
|  | 36   shift @ARGV; | 
|  | 37 } | 
|  | 38 | 
|  | 39 @ARGV == 9 or @ARGV == 10 or die "Usage: $0 -q <SIFT dir> <POLYPHEN2 file> <GERP dir> <tissuedb file> <pathway file> <interpro domains.bed> <omim morbidmap> <input hgvs table.txt> <output hgvs table.annotated.txt> [reference.fasta]\n Where if the reference fasta is provided, we predict cryptic splicing effects (using MaxEntScan and GeneSplicer)\n"; | 
|  | 40 | 
|  | 41 my $sift_dir = $dbs_dir . shift @ARGV; | 
|  | 42 my $polyphen_file = $dbs_dir . shift @ARGV; | 
|  | 43 my $gerp_dir = $dbs_dir . shift @ARGV; | 
|  | 44 my $tissue_file = $dbs_dir . shift @ARGV; | 
|  | 45 my $pathway_file = $dbs_dir. shift @ARGV; | 
|  | 46 my $interpro_bed_file = $dbs_dir . shift @ARGV; | 
|  | 47 my $morbidmap_file = $dbs_dir . shift @ARGV; | 
|  | 48 my $hgvs_table_file = shift @ARGV; | 
|  | 49 my $outfile = shift @ARGV; | 
|  | 50 my $predict_cryptic_splicings = @ARGV ?  ($dbs_dir . shift @ARGV) : undef; | 
|  | 51 my $flanking_bases = 500; # assume this is ROI around a gene | 
|  | 52 my @lines; | 
|  | 53 | 
|  | 54 sub polyphen_gerp_info($$$){ | 
|  | 55     my($gerp_dir,$polyphen_file,$info_key) = @_; | 
|  | 56 | 
|  | 57     my($chr,$pos,$ref,$variant) = split /:/, $info_key; | 
|  | 58 | 
|  | 59     # is this the first call for this chromosome? If so, retrieve the VCF lines for it en masse | 
|  | 60     if(not exists $chr2polyphen_vcf_lines{$chr}){ | 
|  | 61       ($chr2polyphen_vcf_lines{$chr}, $chr2gerp_vcf_lines{$chr}) = retrieve_vcf_lines($gerp_dir,$polyphen_file,$chr); | 
|  | 62     } | 
|  | 63 | 
|  | 64     my @results; | 
|  | 65     if(not $ref or not $variant){ # can only get data for SNPs | 
|  | 66         @results = qw(NA NA NA); | 
|  | 67     } | 
|  | 68     #print STDERR "ref = $ref and variant = $variant for  $info_key\n"; | 
|  | 69     elsif(length($ref) == 1 and length($variant) == 1){ | 
|  | 70       #print STDERR "Grabbing poly/gerp for $chr,$pos,$ref,$variant\n"; | 
|  | 71       @results = polyphen_info($chr,$pos,$variant); | 
|  | 72       $results[2] = gerp_info($chr,$pos); | 
|  | 73     } | 
|  | 74 | 
|  | 75     # It may be that a novel MNP variant is a bunch of known SNPs in a row.  In this case, | 
|  | 76     # report the list of variants | 
|  | 77     elsif(length($ref) == length($variant) and $ref ne "NA"){ | 
|  | 78         my (@poly_scores, @poly_calls, @gerp_scores); | 
|  | 79         for(my $i = 0; $i < length($ref); $i++){ | 
|  | 80             my $refbase = substr($ref, $i, 1); | 
|  | 81             my $newbase = substr($variant, $i, 1); | 
|  | 82             if($newbase eq $refbase){ | 
|  | 83               push @poly_scores, "-"; | 
|  | 84               push @poly_calls, "-"; | 
|  | 85               push @gerp_scores, "-"; | 
|  | 86               next; | 
|  | 87             } | 
|  | 88             my @base_results = polyphen_info($chr,$pos+$i,$refbase,$newbase); | 
|  | 89             push @poly_scores, $base_results[0]; | 
|  | 90             push @poly_calls, $base_results[1]; | 
|  | 91             $base_results[2] = gerp_info($chr,$pos+$i); | 
|  | 92             push @gerp_scores, $base_results[2]; | 
|  | 93         } | 
|  | 94         $results[0] = join(",", @poly_scores); | 
|  | 95         $results[1] = join(",", @poly_calls); | 
|  | 96         $results[2] = join(",", @gerp_scores); | 
|  | 97     } | 
|  | 98     else{ | 
|  | 99         @results = qw(NA NA NA); | 
|  | 100     } | 
|  | 101 | 
|  | 102     return @results; | 
|  | 103 } | 
|  | 104 | 
|  | 105 sub get_variant_key($$$$){ | 
|  | 106   # params chr pos ref variant | 
|  | 107   my $c = $_[0]; | 
|  | 108   $c =~ s/^chr//; | 
|  | 109   my $prop_info_key = join(":", $c, @_[1..3]); | 
|  | 110   $chr2variant_locs{$c} = [] unless exists $chr2variant_locs{$c}; | 
|  | 111   push @{$chr2variant_locs{$c}}, $_[1]; | 
|  | 112   return $prop_info_key; | 
|  | 113 } | 
|  | 114 | 
|  | 115 sub retrieve_vcf_lines($$$){ | 
|  | 116   my ($gerp_dir, $polyphen_file, $chr) = @_; | 
|  | 117 | 
|  | 118   my (%polyphen_lines, %gerp_lines); | 
|  | 119 | 
|  | 120   if(not exists $chr2variant_locs{$chr}){ | 
|  | 121     return ({}, {}); # no data requested for this chromosome | 
|  | 122   } | 
|  | 123 | 
|  | 124   # build up the request | 
|  | 125   my @tabix_regions; | 
|  | 126   my @var_locs = grep /^\d+$/, @{$chr2variant_locs{$chr}}; # grep keeps point mutations only | 
|  | 127   if(not @var_locs){ | 
|  | 128     return ({}, {}); # no point mutations | 
|  | 129   } | 
|  | 130 | 
|  | 131   # sort by variant start location | 
|  | 132   for my $var_loc (sort {$a <=> $b} @var_locs){ | 
|  | 133     push @tabix_regions, $chr.":".$var_loc."-".$var_loc; | 
|  | 134   } | 
|  | 135   my $regions = join(" ", @tabix_regions); | 
|  | 136 | 
|  | 137   # retrieve the data | 
|  | 138   die "Cannot find Polyphen2 VCF file $polyphen_file\n" if not -e $polyphen_file; | 
|  | 139   open(VCF, "tabix $polyphen_file $regions |") | 
|  | 140     or die "Cannot run tabix on $polyphen_file: $!\n"; | 
|  | 141   while(<VCF>){ | 
|  | 142     if(/^\S+\t(\d+)/ and grep {$_ eq $1} @var_locs){ # take only main columns to save room, if possible | 
|  | 143       chomp; | 
|  | 144       $polyphen_lines{$1} = [] unless exists $polyphen_lines{$1}; | 
|  | 145       push @{$polyphen_lines{$1}}, $_; | 
|  | 146     } | 
|  | 147   } | 
|  | 148   close(VCF); | 
|  | 149 | 
|  | 150   my $vcf_file = "$gerp_dir/chr$chr.tab.gz"; | 
|  | 151   if(not -e $vcf_file){ | 
|  | 152     warn "Cannot find GERP VCF file $vcf_file\n" if not $quiet; | 
|  | 153     return ({}, {}); | 
|  | 154   } | 
|  | 155   open(VCF, "tabix $vcf_file $regions |") | 
|  | 156     or die "Cannot run tabix on $vcf_file: $!\n"; | 
|  | 157   while(<VCF>){ | 
|  | 158     if(/^(\S+\t(\d+)(?:\t\S+){2})/ and grep {$_ eq $2} @var_locs){ # take only main columns to save room, if possible | 
|  | 159       $gerp_lines{$2} = [] unless exists $gerp_lines{$2}; | 
|  | 160       push @{$gerp_lines{$2}}, $1; | 
|  | 161     } | 
|  | 162   } | 
|  | 163   close(VCF); | 
|  | 164 | 
|  | 165   return (\%polyphen_lines, \%gerp_lines); | 
|  | 166 } | 
|  | 167 | 
|  | 168 sub polyphen_info($$$){ | 
|  | 169   my ($chr, $pos, $variant_base) = @_; | 
|  | 170 | 
|  | 171   my $key = "$chr:$pos:$variant_base"; | 
|  | 172   if(exists $polyphen_info{$key}){ | 
|  | 173       return @{$polyphen_info{$key}}; | 
|  | 174   } | 
|  | 175 | 
|  | 176   #print STDERR "Checking for polyphen data for $chr, $pos, $variant_base\n"; | 
|  | 177   if(exists $chr2polyphen_vcf_lines{$chr}->{$pos}){ | 
|  | 178     for(@{$chr2polyphen_vcf_lines{$chr}->{$pos}}){ | 
|  | 179       #print STDERR "Checking $chr, $pos, $variant_base against $_"; | 
|  | 180       my @fields = split /\t/, $_; | 
|  | 181       if($fields[4] eq $variant_base){ | 
|  | 182 	  if($fields[6] eq "D"){ | 
|  | 183 	      $polyphen_info{$key} = [$fields[5],"deleterious"]; | 
|  | 184               last; | 
|  | 185 	  } | 
|  | 186 	  elsif($fields[6] eq "P"){ | 
|  | 187 	      $polyphen_info{$key} = [$fields[5],"poss. del."]; | 
|  | 188               last; | 
|  | 189 	  } | 
|  | 190 	  elsif($fields[6] eq "B"){ | 
|  | 191 	      $polyphen_info{$key} = [$fields[5],"benign"]; | 
|  | 192               last; | 
|  | 193 	  } | 
|  | 194 	  else{ | 
|  | 195 	      $polyphen_info{$key} = [$fields[5],$fields[6]]; | 
|  | 196               last; | 
|  | 197 	  } | 
|  | 198       } | 
|  | 199     } | 
|  | 200     if(not exists $polyphen_info{$key}){ | 
|  | 201       $polyphen_info{$key} = ["NA", "NA"]; | 
|  | 202     } | 
|  | 203   } | 
|  | 204   else{ | 
|  | 205     $polyphen_info{$key} = ["NA", "NA"]; | 
|  | 206   } | 
|  | 207   return @{$polyphen_info{$key}}; | 
|  | 208 } | 
|  | 209 | 
|  | 210 sub gerp_info($$){ | 
|  | 211     my ($chr,$pos) = @_; | 
|  | 212 | 
|  | 213     if(exists $chr2gerp_vcf_lines{$chr}->{$pos}){ | 
|  | 214       for(@{$chr2gerp_vcf_lines{$chr}->{$pos}}){ | 
|  | 215         my @fields = split /\t/, $_; | 
|  | 216         return $fields[3]; | 
|  | 217       } | 
|  | 218     } | 
|  | 219     return "NA"; | 
|  | 220 } | 
|  | 221 | 
|  | 222 sub predict_splicing{ | 
|  | 223   my ($chr, $pos, $ref, $var, $strand, $exon_edge_distance, $splice_type) = @_; | 
|  | 224 | 
|  | 225   my $disruption_level = 0; | 
|  | 226   my @disruption_details; | 
|  | 227   # The methods being used only look for up to a 30 base context, so only | 
|  | 228   # need to check for obliterated acceptors/donors if they are nearby | 
|  | 229   $exon_edge_distance-- if $exon_edge_distance > 0; # +1 is actually in the right spot, so decrement | 
|  | 230   my $edge_offset = $strand eq "-" ? -1*$exon_edge_distance : $exon_edge_distance; # flip offset sign if the call was on the negative strand (since +/- was relative to the transcript, not the genome coordinates) | 
|  | 231   my $annotated_detected = 0; | 
|  | 232   my ($genesplicer_orig, $genesplicer_orig_pos) = call_genesplicer($splice_type, $chr, $strand, $pos-$edge_offset, $ref); | 
|  | 233   if($genesplicer_orig != 0){ | 
|  | 234     $annotated_detected = 1; | 
|  | 235   } | 
|  | 236   my ($maxentscan_orig, $maxentscan_orig_pos) = call_maxentscan($splice_type, $chr, $strand, $pos-$edge_offset, $ref); | 
|  | 237   if(not $annotated_detected and $maxentscan_orig){ | 
|  | 238     $annotated_detected = 1; | 
|  | 239   } | 
|  | 240   my ($genesplicer_mod, $genesplicer_mod_pos) = call_genesplicer($splice_type, $chr, $strand, $pos-$edge_offset, $ref, $var, $exon_edge_distance); | 
|  | 241   if($genesplicer_mod != $UNAFFECTED){ # was the mod within the range where this software could detect up or downstream effects on the annotated splice site? | 
|  | 242     if($genesplicer_orig_pos != $genesplicer_mod_pos){ | 
|  | 243       $disruption_level+=0.75; | 
|  | 244       if($genesplicer_mod_pos == -1){ | 
|  | 245         push @disruption_details, "Variant obliterates GeneSplicer predicted splice site from the reference sequence ($chr:$genesplicer_orig_pos, orig score $genesplicer_orig)"; | 
|  | 246       } | 
|  | 247       elsif($genesplicer_orig_pos != -1){ | 
|  | 248         push @disruption_details, "Variant moves GeneSplicer preferred splice site from annotated location $chr:$genesplicer_orig_pos to $chr:$genesplicer_mod_pos (scores: orig=$genesplicer_orig, new=$genesplicer_mod)"; | 
|  | 249       } | 
|  | 250     } | 
|  | 251     elsif($genesplicer_orig-$genesplicer_mod > 1){ | 
|  | 252       $disruption_level+=0.5; | 
|  | 253       push @disruption_details, "Variant significantly lowers GeneSplicer score for original $splice_type splice site located at $chr:$genesplicer_mod_pos ($genesplicer_mod vs. $genesplicer_orig)"; | 
|  | 254     } | 
|  | 255     elsif($genesplicer_orig > $genesplicer_mod){ | 
|  | 256       $disruption_level+=0.25; | 
|  | 257       push @disruption_details, "Variant slightly lowers GeneSplicer score for original $splice_type splice site located at $chr:$genesplicer_mod_pos ($genesplicer_mod vs. $genesplicer_orig)"; | 
|  | 258     } | 
|  | 259     $genesplicer_orig = $genesplicer_mod; # so that splice site comparison vs. cryptic is accurate | 
|  | 260   } | 
|  | 261   my ($maxentscan_mod, $maxentscan_mod_pos) = call_maxentscan($splice_type, $chr, $strand, $pos-$edge_offset, $ref, $var, $exon_edge_distance); | 
|  | 262   if($maxentscan_mod != $UNAFFECTED){ | 
|  | 263     if($maxentscan_mod_pos != $maxentscan_orig_pos){ | 
|  | 264       $disruption_level+=1; | 
|  | 265       if($maxentscan_mod_pos == -1){ | 
|  | 266         push @disruption_details, "Variant obliterates MaxEntScan predicted splice site from the reference sequence ($chr:$maxentscan_orig_pos, orig score $maxentscan_orig)"; | 
|  | 267       } | 
|  | 268       elsif($maxentscan_orig_pos != -1){ | 
|  | 269         push @disruption_details, "Variant moves MaxEntScan preferred splice site from annotated location $chr:$maxentscan_orig_pos to $chr:$maxentscan_mod_pos (scores: orig=$maxentscan_orig, new=$maxentscan_mod)"; | 
|  | 270       } | 
|  | 271     } | 
|  | 272     elsif($maxentscan_orig-$maxentscan_mod > 1){ | 
|  | 273       $disruption_level+=0.5; | 
|  | 274       push @disruption_details, "Variant significantly lowers MaxEntScan score for original $splice_type splice site located at $chr:$maxentscan_mod_pos ($maxentscan_mod vs. $maxentscan_orig)"; | 
|  | 275     } | 
|  | 276     elsif($maxentscan_orig>$maxentscan_mod){ | 
|  | 277       $disruption_level+=0.25; | 
|  | 278       push @disruption_details, "Variant slightly lowers MaxEntScan score for original $splice_type splice site located at $chr:$maxentscan_mod_pos ($maxentscan_mod vs. $maxentscan_orig)"; | 
|  | 279     } | 
|  | 280     $maxentscan_orig = $maxentscan_mod; | 
|  | 281   } | 
|  | 282 | 
|  | 283   # Regardless of location, see if there is an increased chance of creating a cryptic splicing site | 
|  | 284   my ($genesplicer_control, $genesplicer_control_pos) = call_genesplicer($splice_type, $chr, $strand, $pos, $ref); | 
|  | 285   my ($genesplicer_cryptic, $genesplicer_cryptic_pos)  = call_genesplicer($splice_type, $chr, $strand, $pos, $ref, $var); | 
|  | 286   if(defined $genesplicer_orig and $genesplicer_control < $genesplicer_orig and $genesplicer_orig < $genesplicer_cryptic){ | 
|  | 287     $disruption_level+=0.75; | 
|  | 288     push @disruption_details, "Variant causes GeneSplicer score for a potential cryptic $splice_type splice site at $chr:$genesplicer_cryptic_pos to become higher than the annotated splice site $chr:$genesplicer_orig_pos (orig=$genesplicer_orig vs. var=$genesplicer_cryptic)"; | 
|  | 289   } | 
|  | 290   if($genesplicer_control - $genesplicer_cryptic < -1){ | 
|  | 291     $disruption_level+=0.5; | 
|  | 292     push @disruption_details, "Variant raises GeneSplicer score for a potential cryptic $splice_type splice site (orig=$genesplicer_control vs. var=$genesplicer_cryptic)"; | 
|  | 293   } | 
|  | 294   my ($maxentscan_control, $maxentscan_control_pos) = call_maxentscan($splice_type, $chr, $strand, $pos, $ref); | 
|  | 295   my ($maxentscan_cryptic, $maxentscan_cryptic_pos) = call_maxentscan($splice_type, $chr, $strand, $pos, $ref, $var); | 
|  | 296   if(defined $maxentscan_orig and $maxentscan_control < $maxentscan_orig and $maxentscan_orig < $maxentscan_cryptic){ | 
|  | 297     $disruption_level++; | 
|  | 298     push @disruption_details, "Variant causes MaxEntScan score for a potential cryptic $splice_type splice site at $chr:$maxentscan_cryptic_pos to become higher than the annotated splice site $chr:$maxentscan_orig_pos (orig=$maxentscan_orig vs. var=$maxentscan_cryptic)"; | 
|  | 299   } | 
|  | 300   if($maxentscan_control - $maxentscan_cryptic < -1 and $maxentscan_cryptic > 0){ | 
|  | 301     $disruption_level+=0.5; | 
|  | 302     push @disruption_details, "Variant raises MaxEntScan score for a potential cryptic $splice_type splice site (orig=$maxentscan_control vs. var=$maxentscan_cryptic)"; | 
|  | 303   } | 
|  | 304 | 
|  | 305   # If neither method predicted the annotated donor site, and the alternate site isn't predicted either, note that so the user doesn't think the mod is definite okay | 
|  | 306   if(not @disruption_details and $maxentscan_orig < 0 and $genesplicer_orig < 0){ | 
|  | 307     $disruption_level+=0.5; | 
|  | 308     push @disruption_details, "Neither the annotated splice site nor any potential cryptic site introduced by the variant are predicted by either GeneSplicer or MaxEntScan, therefore the alt splicing potential for this variant should be evaluated manually"; | 
|  | 309   } | 
|  | 310 | 
|  | 311   return ($disruption_level, join("; ", @disruption_details)); | 
|  | 312 } | 
|  | 313 | 
|  | 314 sub call_genesplicer{ | 
|  | 315   my($splice_type, $chr, $strand, $pos, $ref, $var, $var_offset) = @_; | 
|  | 316 | 
|  | 317   if($splice_type eq "acceptor"){ | 
|  | 318     $pos += $strand eq  "-" ? 2: -2; | 
|  | 319   } | 
|  | 320   my $num_flanking = 110; # seems to croak if less than 160bp provided | 
|  | 321   my $indel_size = defined $var ? length($var) - length($ref) : 0; # - is del, + is ins | 
|  | 322   my $site_window = defined $var_offset ? 2 : ($indel_size > 0 ? 29+$indel_size : 29); # exact position if looking for exisitng site effect, otherwise 10 or 10 + the insert size | 
|  | 323   my $cmd = "genesplicer"; | 
|  | 324   my $start = $pos - $num_flanking; | 
|  | 325   $start = 1 if $start < 1; | 
|  | 326   my $stop = $pos + $num_flanking + ($indel_size < 0 ? -1*$indel_size:0); # add extra bases if a del so we still have 2*$num_flanking after the edit | 
|  | 327   $stop++; # end coordinate is exclusive | 
|  | 328   return ($UNAFFECTED, -1) if defined $var_offset and ($pos+$var_offset < $start or $pos+$var_offset > $stop); # variant is outside of range we can detect affect on original splice site | 
|  | 329   my $seq = $fasta_index->fetch("$chr:$start-$stop"); | 
|  | 330   if(defined $var){ | 
|  | 331     if(defined $var_offset){ # substitution away from the site of interest | 
|  | 332       substr($seq, $num_flanking+$var_offset, length($ref)) = $var; | 
|  | 333     } | 
|  | 334     else{ # substitution at the site of interest | 
|  | 335       substr($seq, $num_flanking, length($ref)) = $var; | 
|  | 336     } | 
|  | 337   } | 
|  | 338   if($strand eq "-"){ | 
|  | 339     $seq = reverse($seq); | 
|  | 340     $seq =~ tr/ACGTacgt/TGCAtgca/; | 
|  | 341   } | 
|  | 342   $tmpfile = "$$.fasta"; | 
|  | 343   open(TMP, ">$tmpfile") or die "Could not open $tmpfile for writing: $!\n"; | 
|  | 344   print TMP ">$chr $strand $pos $ref",($var?" $var":""), ($var_offset?" $var_offset":""),"\n$seq\n"; | 
|  | 345   substr($seq, $num_flanking, 2) = lc(substr($seq, $num_flanking, 2)); | 
|  | 346   #print STDERR "$chr $strand $pos $ref $var $var_offset: $seq\n"; | 
|  | 347   close(TMP); | 
|  | 348   #print STDERR "$cmd $tmpfile /export/achri_data/programs/GeneSplicer/human\n"; | 
|  | 349   #<STDIN>; | 
|  | 350   open(GS, "$cmd $tmpfile human 2> /dev/null|") | 
|  | 351     or die "Could not run $cmd: $!\n"; | 
|  | 352   my $highest_score = 0; | 
|  | 353   my $highest_position = -1; | 
|  | 354   while(<GS>){ | 
|  | 355     # End5 End3 Score "confidence" splice_site_type | 
|  | 356     # E.g. : | 
|  | 357     # 202 203 2.994010 Medium donor | 
|  | 358     chomp; | 
|  | 359     my @F = split /\s+/, $_; | 
|  | 360     next if $F[1] < $F[0]; # wrong strand | 
|  | 361     if($splice_type eq $F[4]){ | 
|  | 362       if(abs($F[0]-$num_flanking-1) <= $site_window){ # right type and approximate location | 
|  | 363         #print STDERR "*$_\n"; | 
|  | 364         if($F[2] > $highest_score){ | 
|  | 365           $highest_score = $F[2]; | 
|  | 366           # last bit accounts for indels in the position offset | 
|  | 367           $highest_position = $pos + ($strand eq "-" ? -1: 1)*(-$num_flanking + $F[0] - 1) + (($F[0] <= $num_flanking && $strand eq "+" or $F[0] >= $num_flanking && $strand eq "-") ? -$indel_size : 0); | 
|  | 368         } | 
|  | 369       } | 
|  | 370     } | 
|  | 371     #else{print STDERR $_,"\n";} | 
|  | 372   } | 
|  | 373   close(GS); | 
|  | 374   return ($highest_score, $highest_position); | 
|  | 375 } | 
|  | 376 | 
|  | 377 sub call_maxentscan{ | 
|  | 378   my($splice_type, $chr, $strand, $pos, $ref, $var, $var_offset) = @_; | 
|  | 379 | 
|  | 380   my $indel_size = defined $var ? length($var) - length($ref) : 0; # - is del, + is ins | 
|  | 381   my $cmd; | 
|  | 382   my ($num_flanking5, $num_flanking3); | 
|  | 383   my ($region5, $region3); | 
|  | 384   my ($start, $stop); | 
|  | 385   my $highest_score = -9999; | 
|  | 386   my $highest_position = -1; | 
|  | 387   # note that donor and acceptor are very different beasts for MaxEntScan | 
|  | 388   if($splice_type eq "acceptor"){ | 
|  | 389     if($strand eq "-"){ | 
|  | 390       $pos += 2; | 
|  | 391     } | 
|  | 392     else{ | 
|  | 393       $pos -= 2; | 
|  | 394     } | 
|  | 395     $cmd = "score3.pl"; | 
|  | 396     $num_flanking5 = 18; | 
|  | 397     $num_flanking3 = 4; | 
|  | 398   } | 
|  | 399   else{ # donor | 
|  | 400     $cmd = "score5.pl"; | 
|  | 401     $num_flanking5 = 3; | 
|  | 402     $num_flanking3 = 5; | 
|  | 403   } | 
|  | 404   # when var_offset is defined, we are looking only at the original splice site so no need to look in the general region | 
|  | 405   $region5 = defined $var_offset ? 0 : $num_flanking5; # when var_offset is defined, we are looking only at the original splice site so no need to look in the general region | 
|  | 406   $region3 = defined $var_offset ? 0 : $num_flanking3; | 
|  | 407   if($strand eq "-"){ | 
|  | 408     $start = $pos - $num_flanking3 - $region3; | 
|  | 409     $stop = $pos + $num_flanking5 + $region5 + ($indel_size < 0 ? -1*$indel_size:0); | 
|  | 410     #print STDERR "Fetching $chr:$start-$stop ($pos - $num_flanking3 - $region3)\n"; | 
|  | 411   } | 
|  | 412   else{ | 
|  | 413     $start = $pos - $num_flanking5 - $region5; | 
|  | 414     $stop = $pos + $num_flanking3 + $region3 + ($indel_size < 0 ? -1*$indel_size:0); # add extra bases if a del so we still have 23 or 9 bases as the case may be | 
|  | 415     #print STDERR "Fetching $chr:$start-$stop ($pos - $num_flanking5 - $region5)\n"; | 
|  | 416   } | 
|  | 417   if(defined $var_offset and ( | 
|  | 418      $strand eq "+" and ($pos+$var_offset < $start or $pos+$var_offset > $stop) or | 
|  | 419      $strand eq "-" and ($pos-$var_offset < $start or $pos-$var_offset > $stop))){ # variant is outside of range we can detect affect on original splice site | 
|  | 420     return ($UNAFFECTED, -1) | 
|  | 421   } | 
|  | 422 | 
|  | 423   my $seq = $fasta_index->fetch("$chr:$start-$stop"); | 
|  | 424   my @subseqs; | 
|  | 425   if(defined $var){ | 
|  | 426       if(defined $var_offset){ # substitution away from the site of interest | 
|  | 427         #print STDERR "Subbing in string of length ", length($seq), "near splice site $ref -> $var at ", ($strand eq "-" ? "$region3+$num_flanking3-$var_offset":"$region5+$num_flanking5+$var_offset"), "\n"; | 
|  | 428         substr($seq, ($strand eq "-" ? $region3+$num_flanking3-$var_offset:$region5+$num_flanking5+$var_offset), length($ref)) = $var; | 
|  | 429       } | 
|  | 430       else{ # substitution at the site of interest | 
|  | 431         substr($seq, ($strand eq "-" ? $region3+$num_flanking3:$region5+$num_flanking5), length($ref)) = $var; | 
|  | 432       } | 
|  | 433   } | 
|  | 434   if($strand eq "-"){ | 
|  | 435       $seq = reverse($seq); | 
|  | 436       $seq =~ tr/ACGTacgt/TGCAtgca/; | 
|  | 437   } | 
|  | 438   # get all of the 23 or 9 base pair subsequences, depending on whether it's donors or acceptors we are looking for | 
|  | 439   for(my $i = 0; $i+$num_flanking5+$num_flanking3+1 <= length($seq); $i++){ | 
|  | 440       push @subseqs, substr($seq, $i, $num_flanking5+$num_flanking3+1); | 
|  | 441   } | 
|  | 442   my $pid = open2(\*CHILD_OUT, \*CHILD_IN, "$cmd -"); | 
|  | 443   $pid or die "Could not run $cmd: $!\n"; | 
|  | 444   print CHILD_IN join("\n",@subseqs); | 
|  | 445   close(CHILD_IN); | 
|  | 446   while(<CHILD_OUT>){ | 
|  | 447       chomp; | 
|  | 448       # out is just "seq[tab]score" | 
|  | 449       my @F = split /\s+/, $_; | 
|  | 450       if($F[1] > $highest_score){ | 
|  | 451         #print STDERR ">"; | 
|  | 452         $highest_score = $F[1]; | 
|  | 453         # since we printed the subseqs in order, $. (containing the line number of the prediction output) gives us the offset in the original seq | 
|  | 454         $highest_position = $start + ($strand eq "-" ? length($seq) - $num_flanking5 - $. : $num_flanking5 + $. -1)+(($. <= $num_flanking5+$region5 && $strand eq "+" or $. >= $num_flanking5+$region5 && $strand eq "-") ? -$indel_size : 0); | 
|  | 455       } | 
|  | 456       #print STDERR $_,"\n"; | 
|  | 457   } | 
|  | 458   close(CHILD_OUT); | 
|  | 459   waitpid($pid, 0); | 
|  | 460 | 
|  | 461   return ($highest_score, $highest_position); | 
|  | 462 } | 
|  | 463 | 
|  | 464 sub get_range_info($$$$){ | 
|  | 465    my ($chr2ranges, $chr, $start, $stop) = @_; | 
|  | 466 | 
|  | 467    if(not exists $chr2ranges->{$chr}){ | 
|  | 468      return {}; | 
|  | 469    } | 
|  | 470    my $range_key = "$chr:$start:$stop"; | 
|  | 471 | 
|  | 472    # Use a binary search to find the leftmost range of interest | 
|  | 473    my $left_index = 0; | 
|  | 474    my $right_index = $#{$chr2ranges->{$chr}}; | 
|  | 475    my $cursor = int($right_index/2); | 
|  | 476    while($left_index != $right_index and $left_index != $cursor and $right_index != $cursor){ | 
|  | 477      # need to go left | 
|  | 478      if($chr2ranges->{$chr}->[$cursor]->[0] >= $start){ | 
|  | 479        $right_index = $cursor; | 
|  | 480        $cursor = int(($left_index+$cursor)/2); | 
|  | 481      } | 
|  | 482      # need to go to the right | 
|  | 483      elsif($chr2ranges->{$chr}->[$cursor]->[1] <= $stop){ | 
|  | 484        $left_index = $cursor; | 
|  | 485        $cursor = int(($right_index+$cursor)/2); | 
|  | 486      } | 
|  | 487      else{ | 
|  | 488        $right_index = $cursor; | 
|  | 489        $cursor--; # in the range, go left until not in the range any more | 
|  | 490      } | 
|  | 491    } | 
|  | 492 | 
|  | 493    my %names; | 
|  | 494    for (; $cursor <= $#{$chr2ranges->{$chr}}; $cursor++){ | 
|  | 495       my $range_ref = $chr2ranges->{$chr}->[$cursor]; | 
|  | 496       if($range_ref->[0] > $stop){ | 
|  | 497         last; | 
|  | 498       } | 
|  | 499       elsif($range_ref->[0] <= $stop and $range_ref->[1] >= $start){ | 
|  | 500         $names{$range_ref->[2]}++; | 
|  | 501       } | 
|  | 502    } | 
|  | 503 | 
|  | 504    #print STDERR "Names for range $chr:$start-$stop are ", join("/", keys %names), "\n"; | 
|  | 505    return \%names; | 
|  | 506 } | 
|  | 507 | 
|  | 508 if(not -d $sift_dir){ | 
|  | 509     die "The specified SIFT directory $sift_dir does not exist\n"; | 
|  | 510 } | 
|  | 511 | 
|  | 512 print STDERR "Reading in OMIM morbid map...\n" unless $quiet; | 
|  | 513 die "Data file $morbidmap_file does not exist, aborting.\n" if not -e $morbidmap_file; | 
|  | 514 my %gene2morbids; | 
|  | 515 open(TAB, $morbidmap_file) | 
|  | 516   or die "Cannot open OMIM morbid map file $morbidmap_file for reading: $!\n"; | 
|  | 517 while(<TAB>){ | 
|  | 518   my @fields = split /\|/, $_; | 
|  | 519   #print STDERR "got ", ($#fields+1), " fields in $_\n"; | 
|  | 520   #my @fields = split /\|/, $_; | 
|  | 521   my $morbid = $fields[0]; | 
|  | 522   $morbid =~ s/\s*\(\d+\)$//; # get rid of trailing parenthetical number | 
|  | 523   my $omim_id = $fields[2]; | 
|  | 524   for my $genename (split /, /, $fields[1]){ | 
|  | 525     if(not exists $gene2morbids{$genename}){ | 
|  | 526       $gene2morbids{$genename}  = "OMIM:$omim_id $morbid"; | 
|  | 527     } | 
|  | 528     else{ | 
|  | 529       $gene2morbids{$genename} .= "; $morbid"; | 
|  | 530     } | 
|  | 531   } | 
|  | 532 } | 
|  | 533 close(TAB); | 
|  | 534 | 
|  | 535 print STDERR "Reading in interpro mappings...\n" unless $quiet; | 
|  | 536 die "Data file $interpro_bed_file does not exist, aborting.\n" if not -e $interpro_bed_file; | 
|  | 537 my %interpro_ids; | 
|  | 538 open(TAB, $interpro_bed_file) | 
|  | 539   or die "Cannot open gene name BED file $interpro_bed_file for reading: $!\n"; | 
|  | 540 while(<TAB>){ | 
|  | 541   chomp; | 
|  | 542   # format should be "chr start stop interpro desc..." | 
|  | 543   my @fields = split /\t/, $_; | 
|  | 544   my $c = $fields[0]; | 
|  | 545   if(not exists $interpro_ids{$c}){ | 
|  | 546      $interpro_ids{$c} = [] unless exists $interpro_ids{$c}; | 
|  | 547      $interpro_ids{"chr".$c} = [] unless $c =~ /^chr/; | 
|  | 548      $interpro_ids{$1} = [] if $c =~ /^chr(\S+)/; | 
|  | 549   } | 
|  | 550   my $dataref = [@fields[1..3]]; | 
|  | 551   push @{$interpro_ids{$c}}, $dataref; | 
|  | 552   push @{$interpro_ids{"chr".$c}}, $dataref unless $c =~ /^chr/; | 
|  | 553   push @{$interpro_ids{$1}}, $dataref if $c =~ /^chr(\S+)/; | 
|  | 554 } | 
|  | 555 close(TAB); | 
|  | 556 print STDERR "Sorting interpro matches per contig...\n" unless $quiet; | 
|  | 557 for my $chr (keys %interpro_ids){ | 
|  | 558   $interpro_ids{$chr} = [sort {$a->[0] <=> $b->[0]} @{$interpro_ids{$chr}}]; | 
|  | 559 } | 
|  | 560 | 
|  | 561 # {uc(gene_name) => space separated info} | 
|  | 562 print STDERR "Reading in tissue info...\n" unless $quiet; | 
|  | 563 my %tissues; | 
|  | 564 my %tissue_desc; | 
|  | 565 open(TISSUE, $tissue_file) | 
|  | 566     or die "Cannot open $tissue_file for reading: $!\n"; | 
|  | 567 while(<TISSUE>){ | 
|  | 568     chomp; | 
|  | 569     my @fields = split /\t/, $_; | 
|  | 570     my $gname = uc($fields[1]); | 
|  | 571     $tissue_desc{$gname} = $fields[2]; | 
|  | 572     next unless $#fields == 4; | 
|  | 573     my @tis = split /;\s*/, $fields[4]; | 
|  | 574     pop @tis; | 
|  | 575     if(@tis < 6){ | 
|  | 576 	$tissues{$gname} = join(">", @tis); | 
|  | 577     } | 
|  | 578     elsif(@tis < 24){ | 
|  | 579 	$tissues{$gname} = join(">", @tis[0..5]).">..."; | 
|  | 580     } | 
|  | 581     else{ | 
|  | 582 	$tissues{$gname} = join(">", @tis[0..(int($#tis/5))]).">..."; | 
|  | 583     } | 
|  | 584 } | 
|  | 585 close(TISSUE); | 
|  | 586 | 
|  | 587 print STDERR "Loading pathway info...\n" unless $quiet; | 
|  | 588 my %pathways; | 
|  | 589 my %pathway_ids; | 
|  | 590 open(PATHWAY, $pathway_file) | 
|  | 591     or die "Cannot open $pathway_file for reading: $!\n"; | 
|  | 592 while(<PATHWAY>){ | 
|  | 593     chomp; | 
|  | 594     my @fields = split /\t/, $_; | 
|  | 595     next unless @fields >= 4; | 
|  | 596     for my $gname (split /\s+/, $fields[1]){ | 
|  | 597       $gname = uc($gname); | 
|  | 598       if(not exists $pathways{$gname}){ | 
|  | 599 	  $pathway_ids{$gname} = $fields[2]; | 
|  | 600 	  $pathways{$gname} = $fields[3]; | 
|  | 601       } | 
|  | 602       else{ | 
|  | 603 	  $pathway_ids{$gname} .= "; ".$fields[2]; | 
|  | 604 	  $pathways{$gname} .= "; ".$fields[3]; | 
|  | 605       } | 
|  | 606    } | 
|  | 607 } | 
|  | 608 close(PATHWAY); | 
|  | 609 | 
|  | 610 print STDERR "Loading SIFT indices...\n" unless $quiet; | 
|  | 611 # Read in the database table list | 
|  | 612 my %chr_tables; | 
|  | 613 open(DBLIST, "$sift_dir/bins.list") | 
|  | 614     or die "Cannot open $sift_dir/bins.list for reading: $!\n"; | 
|  | 615 while(<DBLIST>){ | 
|  | 616     chomp; | 
|  | 617     my @fields = split /\t/, $_; | 
|  | 618     if(not exists $chr_tables{$fields[1]}){ | 
|  | 619 	$chr_tables{$fields[1]} = {}; | 
|  | 620     } | 
|  | 621     my $chr = $fields[1]; | 
|  | 622     $chr = "chr$chr" unless $chr =~ /^chr/; | 
|  | 623     $chr_tables{$chr}->{"$chr\_$fields[2]_$fields[3]"} = [$fields[2],$fields[3]]; | 
|  | 624 } | 
|  | 625 close(DBLIST); | 
|  | 626 | 
|  | 627 #Connect to database | 
|  | 628 my $db_gene = DBI->connect("dbi:SQLite:dbname=$sift_dir/Human_Supp.sqlite","","",{RaiseError=>1, AutoCommit=>1}); | 
|  | 629 | 
|  | 630 my $db_chr; | 
|  | 631 my $fr_stmt; # retrieval for frameshift | 
|  | 632 my $snp_stmt; # retrieval for SNP | 
|  | 633 my $cur_chr = ""; | 
|  | 634 my $cur_max_pos; | 
|  | 635 | 
|  | 636 if(defined $predict_cryptic_splicings){ | 
|  | 637   if(not -e $predict_cryptic_splicings){ | 
|  | 638     die "Reference FastA file for splicing predictions, $predict_cryptic_splicings, does not exist.\n"; | 
|  | 639   } | 
|  | 640   if(not -e $predict_cryptic_splicings.".fai" and not -w dirname($predict_cryptic_splicings)){ | 
|  | 641     die "Reference FastA file for splicing predictions, $predict_cryptic_splicings, is not indexed, and the directory is not writable.\n"; | 
|  | 642   } | 
|  | 643   print STDERR "Loading reference FastA index...\n" unless $quiet; | 
|  | 644   $fasta_index = Bio::DB::Sam::Fai->load($predict_cryptic_splicings); | 
|  | 645 } | 
|  | 646 | 
|  | 647 print STDERR "Annotating variants...\n" unless $quiet; | 
|  | 648 # Assume contigs and positions in order | 
|  | 649 print STDERR "Processing file $hgvs_table_file...\n" unless $quiet; | 
|  | 650 open(HGVS, $hgvs_table_file) | 
|  | 651     or die "Cannot open HGVS file $hgvs_table_file for reading: $!\n"; | 
|  | 652 # Output file for annotated snps and frameshifts | 
|  | 653 my $header = <HGVS>; # header | 
|  | 654 chomp $header; | 
|  | 655 my @headers = split /\t/, $header; | 
|  | 656 my ($chr_column, $pos_column, $ref_column, $alt_column, $alt_cnt_column, $tot_cnt_column, $ftype_column, $dbid_column, $maf_src_column, $maf_column, $internal_freq_column, | 
|  | 657     $cdna_hgvs_column, $aa_hgvs_column, $transcript_column, $transcript_length_column, $strand_column, $zygosity_column, $splice_dist_column, $caveats_column, $phase_column, | 
|  | 658     $srcs_column, $pvalue_column, $to_column, $genename_column, $rares_column, $rares_header); | 
|  | 659 my %req_columns = ( | 
|  | 660 	"Chr" => \$chr_column, | 
|  | 661 	"DNA From" => \$pos_column, | 
|  | 662 	"DNA To" => \$to_column, | 
|  | 663 	"Gene Name" => \$genename_column, | 
|  | 664 	"Ref base" => \$ref_column, | 
|  | 665 	"Obs base" => \$alt_column, | 
|  | 666 	"Variant Reads" => \$alt_cnt_column, | 
|  | 667 	"Total Reads" => \$tot_cnt_column, | 
|  | 668 	"Feature type" => \$ftype_column, | 
|  | 669 	"Variant DB ID" => \$dbid_column, | 
|  | 670 	"Pop. freq. source" => \$maf_src_column, | 
|  | 671 	"Pop. freq." => \$maf_column, | 
|  | 672 	"Transcript HGVS" => \$cdna_hgvs_column, | 
|  | 673 	"Protein HGVS" => \$aa_hgvs_column, | 
|  | 674         "Selected transcript" => \$transcript_column, | 
|  | 675 	"Transcript length" => \$transcript_length_column, | 
|  | 676 	"Strand" => \$strand_column, | 
|  | 677 	"Zygosity" => \$zygosity_column, | 
|  | 678 	"Closest exon junction (AA coding variants)" => \$splice_dist_column, | 
|  | 679 	"Caveats" => \$caveats_column, | 
|  | 680 	"Phase" => \$phase_column, | 
|  | 681         "P-value" => \$pvalue_column); | 
|  | 682 &load_header_columns(\%req_columns, \@headers); | 
|  | 683 for(my $i = 0; $i <= $#headers; $i++){ | 
|  | 684   # Optional columns go here | 
|  | 685   if($headers[$i] eq "Sources"){ | 
|  | 686     $srcs_column = $i; | 
|  | 687   } | 
|  | 688   elsif($headers[$i] eq "Internal pop. freq."){ | 
|  | 689     $internal_freq_column = $i; | 
|  | 690   } | 
|  | 691   elsif($headers[$i] =~ /^Num rare variants/){ | 
|  | 692     $rares_column = $i; | 
|  | 693     $rares_header = $headers[$i]; | 
|  | 694   } | 
|  | 695 } | 
|  | 696 open(OUT, ">$outfile") | 
|  | 697     or die "Cannot open $outfile for writing: $!\n"; | 
|  | 698 print OUT join("\t", "Chr", "DNA From", "DNA To", "Ref base", "Obs base", "% ref read","% variant read", | 
|  | 699 	   "Variant Reads", "Total Reads", "Feature type","Variant type", "Variant context", "Variant DB ID", "Pop. freq. source", | 
|  | 700 	   "Pop. freq."), (defined $internal_freq_column ? "\tInternal pop. freq.\t" : "\t"), join("\t", "SIFT Score", "Polyphen2 Score", "Polyphen2 call", "GERP score", | 
|  | 701 	   "Gene Name", "Transcript HGVS", | 
|  | 702 	   "Protein HGVS", "Zygosity","Closest exon junction (AA coding variants)","Splicing alteration potential","Splicing alteration details", | 
|  | 703 	   "OMIM Morbid Map","Tissue expression", "Pathways", "Pathway refs", | 
|  | 704 	   "Gene Function", "Spanning InterPro Domains", "P-value", "Caveats", "Phase", | 
|  | 705            "Selected transcript", "Transcript length", "Strand"), | 
|  | 706            (defined $rares_column ? "\t$rares_header" : ""), | 
|  | 707            (defined $srcs_column ? "\tSources" : ""), "\n"; | 
|  | 708 while(<HGVS>){ | 
|  | 709     chomp; | 
|  | 710     my @fields = split /\t/, $_, -1; | 
|  | 711     my $mode; | 
|  | 712     my $protein_hgvs = $fields[$aa_hgvs_column]; | 
|  | 713     my $refSeq = $fields[$ref_column]; | 
|  | 714     my $newBase = $fields[$alt_column]; | 
|  | 715     my $splicing_potential = "NA"; | 
|  | 716     my $splicing_details = "NA"; | 
|  | 717     if(defined $predict_cryptic_splicings){ | 
|  | 718       my $distance = $fields[$splice_dist_column]; # only for coding variants | 
|  | 719       my $site_type; | 
|  | 720       if($distance eq "NA"){ | 
|  | 721       } | 
|  | 722       elsif($distance){ | 
|  | 723         $site_type = $distance < 0 ? "donor" : "acceptor"; # exon context | 
|  | 724       } | 
|  | 725       else{ | 
|  | 726         if($fields[$cdna_hgvs_column] =~ /\d+([\-+]\d+)/){ # get from the cDNA HGVS otherwise | 
|  | 727           $distance = $1; | 
|  | 728           $distance =~ s/^\+//; | 
|  | 729           $site_type = $distance > 0 ? "donor" : "acceptor"; # intron context | 
|  | 730         } | 
|  | 731       } | 
|  | 732       # only do splicing predictions for novel or rare variants, since these predictions are expensive | 
|  | 733       if($distance and $distance ne "NA" and ($fields[$maf_column] eq "NA" or $fields[$maf_column] <= 0.01)){ | 
|  | 734         #print STDERR "Running prediction for rare variant ($fields[3], MAF=$fields[14]): $fields[5], $fields[6], $refSeq, $newBase, $fields[4], $distance, $site_type\n"; | 
|  | 735         ($splicing_potential, $splicing_details) = &predict_splicing($fields[$chr_column], $fields[$pos_column], $refSeq, $newBase, $fields[$strand_column], $distance, $site_type); | 
|  | 736       } | 
|  | 737     } | 
|  | 738     my $lookupBase; | 
|  | 739     if($fields[$strand_column] eq "-"){ | 
|  | 740       if($refSeq ne "NA"){ | 
|  | 741         $refSeq = reverse($refSeq); | 
|  | 742         $refSeq =~ tr/ACGTacgt/TGCAtgca/; | 
|  | 743       } | 
|  | 744       if($newBase ne "NA"){ | 
|  | 745         $newBase = reverse($newBase); | 
|  | 746         $newBase =~ tr/ACGTacgt/TGCAtgca/; | 
|  | 747         $newBase =~ s(TN/)(NA/); | 
|  | 748         $newBase =~ s(/TN)(/NA); | 
|  | 749       } | 
|  | 750     } | 
|  | 751     if($fields[$cdna_hgvs_column] =~ /^g\..*?(?:ins|delins|>)([ACGTN]+)$/){ | 
|  | 752       $mode = "snps"; | 
|  | 753     } | 
|  | 754     elsif($fields[$cdna_hgvs_column] =~ /^g\..*?del([ACGTN]+)$/){ | 
|  | 755       $mode = "snps"; | 
|  | 756     } | 
|  | 757     elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+_\d+(?:[\-+]\d+)?ins([ACGTN]+)$/ or $fields[$cdna_hgvs_column] =~ /^c\.\d+(?:[\-+]\d+)?_\d+ins([ACGTN]+)$/){ | 
|  | 758         if(not length($1)%3){ | 
|  | 759             $mode = "snps"; | 
|  | 760         } | 
|  | 761         else{ | 
|  | 762             $mode = "frameshifts"; | 
|  | 763         } | 
|  | 764     } | 
|  | 765     elsif($fields[$cdna_hgvs_column] =~ /^c\.(\d+)(?:[\-+]\d+)?_(\d+)(?:[\-+]\d+)?delins([ACGTN]+)$/){ | 
|  | 766 	if(not (length($3)-$2+$1-1)%3){ | 
|  | 767 	    $mode = "snps"; | 
|  | 768 	} | 
|  | 769 	else{ | 
|  | 770 	    $mode = "frameshifts"; | 
|  | 771 	} | 
|  | 772     } | 
|  | 773     elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+[+\-]\d+_\d+[+\-]\d+delins([ACGTN]+)$/ or | 
|  | 774           $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?_[\-*]\d+(?:[\-+]\d+)?del([ACGTN]*)$/ or | 
|  | 775           $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?del([ACGTN]*)$/ or | 
|  | 776           $fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+del([ACGTN]*)$/ or | 
|  | 777           $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?_[\-*]\d+(?:[\-+]\d+)?(?:ins|delins)?([ACGTN]+)$/){ | 
|  | 778 	$mode = "snps"; | 
|  | 779     } | 
|  | 780     elsif($fields[$cdna_hgvs_column] =~ /^c\.([-*]?\d+)del(\S+)/){ | 
|  | 781         my $dist = $1; | 
|  | 782         $dist =~ s/^\*//; | 
|  | 783         if(abs($dist) < 4){ | 
|  | 784           $mode = "snps"; | 
|  | 785         } | 
|  | 786         else{ | 
|  | 787           $mode = "frameshifts"; | 
|  | 788         } | 
|  | 789     } | 
|  | 790     elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+_\d+[\-+]\d+ins(\S*)/ or | 
|  | 791           $fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+_\d+[\-+]\d+del(\S*)/){ | 
|  | 792         $mode = "snps"; | 
|  | 793     } | 
|  | 794     elsif($fields[$cdna_hgvs_column] =~ /^c\.(-?\d+)_(\d+)(?:\+\d+)?del(\S+)/ or | 
|  | 795           $fields[$cdna_hgvs_column] =~ /^c\.(\d+)(?:\-\d+)?_(\d+)del(\S+)/){ | 
|  | 796 	if(not ($2-$1+1)%3){ | 
|  | 797 	    $mode = "snps"; | 
|  | 798 	} | 
|  | 799 	else{ | 
|  | 800 	    $mode = "frameshifts"; | 
|  | 801 	} | 
|  | 802     } | 
|  | 803     elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+_\d+ins([ACGTN]+)$/){ | 
|  | 804 	if(not length($1)%3){ | 
|  | 805 	    $mode = "snps"; | 
|  | 806 	} | 
|  | 807 	else{ | 
|  | 808 	    $mode = "frameshifts"; | 
|  | 809 	} | 
|  | 810     } | 
|  | 811     # special case of shifted start codon | 
|  | 812     elsif($fields[$cdna_hgvs_column] =~ /^c\.-1_1+ins([ACGTN]+)$/ or | 
|  | 813           $fields[$cdna_hgvs_column] =~ /inv$/){ | 
|  | 814         $mode = "snps"; | 
|  | 815     } | 
|  | 816     elsif($fields[$cdna_hgvs_column] =~ /[ACGTN]>([ACGTN])$/){ | 
|  | 817 	# todo: handle multiple consecutive substitutions, will have to run polyphen separately | 
|  | 818         $lookupBase = $newBase; | 
|  | 819 	if($fields[$strand_column] eq "-"){ # revert to positive strand variant for SIFT index | 
|  | 820 	    $lookupBase =~ tr/ACGTacgt/TGCAtgca/; | 
|  | 821         } | 
|  | 822 	$mode = "snps"; | 
|  | 823     } | 
|  | 824     else{ | 
|  | 825 	$mode = "snps"; | 
|  | 826     } | 
|  | 827     if($fields[$chr_column] ne $cur_chr){ | 
|  | 828 	#Prepared DB connection on per-chromosome basis | 
|  | 829 	$cur_chr = $fields[$chr_column]; | 
|  | 830 	$cur_chr =~ s/^chr//; | 
|  | 831 	$db_chr = DBI->connect("dbi:SQLite:dbname=$sift_dir/Human_CHR$cur_chr.sqlite", | 
|  | 832 			       "", | 
|  | 833 			       "", | 
|  | 834 			       { RaiseError => 1, AutoCommit => 1 }) unless not -e "$sift_dir/Human_CHR$cur_chr.sqlite"; | 
|  | 835 | 
|  | 836 	$cur_max_pos = -1; | 
|  | 837     } | 
|  | 838     if($fields[$pos_column] =~ /^\d+$/ and $cur_max_pos < $fields[$pos_column]){ | 
|  | 839 	undef $fr_stmt; | 
|  | 840 	undef $snp_stmt; | 
|  | 841         my $c = $fields[$chr_column]; | 
|  | 842         $c = "chr$c" if $c !~ /^chr/; | 
|  | 843 	for my $chr_table_key (keys %{$chr_tables{$c}}){ | 
|  | 844 	    my $chr_table_range = $chr_tables{$c}->{$chr_table_key}; | 
|  | 845 	    if($chr_table_range->[0] <= $fields[$pos_column] and $chr_table_range->[1] >= $fields[$pos_column]){ | 
|  | 846 		$fr_stmt = $db_chr->prepare("select SCORE". | 
|  | 847 					    " from $chr_table_key where COORD1 = ?"); | 
|  | 848 		$snp_stmt = $db_chr->prepare("select SCORE". | 
|  | 849 					     " from $chr_table_key where COORD1 = ? AND NT2 = ?"); | 
|  | 850 		$cur_max_pos = $chr_table_range->[1]; | 
|  | 851 		last; | 
|  | 852 	    } | 
|  | 853 	} | 
|  | 854 	if(not defined $fr_stmt and not $quiet){ | 
|  | 855 	    warn "Got request for position not in range of SIFT ($c:$fields[$pos_column])\n"; | 
|  | 856 	} | 
|  | 857     } | 
|  | 858 | 
|  | 859     my @cols; | 
|  | 860     if(not $fr_stmt){ | 
|  | 861        @cols = ("NA"); | 
|  | 862     } | 
|  | 863     elsif($mode eq "frameshifts"){ | 
|  | 864 	$fr_stmt->execute($fields[$pos_column]); | 
|  | 865 	@cols = $fr_stmt->fetchrow_array(); | 
|  | 866     } | 
|  | 867     else{ | 
|  | 868 	$snp_stmt->execute($fields[$pos_column]-1, $lookupBase); | 
|  | 869 	@cols = $snp_stmt->fetchrow_array(); | 
|  | 870     } | 
|  | 871     # See if the change is in a CDS, and if it's non-synonymous | 
|  | 872     my ($type, $location) = ("silent", "intron"); | 
|  | 873     my $start = $fields[$pos_column]; | 
|  | 874     my $stop = $fields[$to_column]; | 
|  | 875     my $interpro = get_range_info(\%interpro_ids, $fields[$chr_column], $start, $stop); | 
|  | 876     my $domains = join("; ", sort keys %$interpro); | 
|  | 877 | 
|  | 878     my $sift_score = "NA"; | 
|  | 879     my $variant_key = "NA"; | 
|  | 880     my $transcript_hgvs = $fields[$cdna_hgvs_column]; | 
|  | 881     if($transcript_hgvs =~ /\.\d+[ACGTN]>/ or $transcript_hgvs =~ /\.\d+(?:_|del|ins)/ or $transcript_hgvs =~ /[+\-]\?/){ | 
|  | 882 	$location = "exon"; | 
|  | 883     } | 
|  | 884     elsif($transcript_hgvs =~ /\.\d+[\-+](\d+)/){ | 
|  | 885 	if($1 < 21){ | 
|  | 886 	    $type = "splice"; | 
|  | 887 	} | 
|  | 888 	else{ | 
|  | 889 	    $type = "intronic"; | 
|  | 890 	} | 
|  | 891 	$location = "splice site"; | 
|  | 892     } | 
|  | 893     elsif($transcript_hgvs =~ /\.\*\d+/){ | 
|  | 894 	$location = "3'UTR"; | 
|  | 895     } | 
|  | 896     elsif($transcript_hgvs =~ /\.-\d+/){ | 
|  | 897 	$location = "5'UTR"; | 
|  | 898     } | 
|  | 899     if($mode eq "frameshifts"){ | 
|  | 900 	$type = "frameshift"; | 
|  | 901     } | 
|  | 902     else{ | 
|  | 903 	if(not defined $protein_hgvs){ | 
|  | 904             $type = "unknown"; | 
|  | 905         } | 
|  | 906         elsif($protein_hgvs eq "NA"){ | 
|  | 907 	    $type = "non-coding"; | 
|  | 908 	} | 
|  | 909 	elsif($protein_hgvs =~ /=$/){ | 
|  | 910 	    $type = "silent"; | 
|  | 911 	} | 
|  | 912 	elsif($protein_hgvs =~ /\d+\*/){ # nonsense | 
|  | 913 	    $type = "nonsense"; | 
|  | 914 	} | 
|  | 915 	elsif($protein_hgvs =~ /ext/){ # nonsense | 
|  | 916 	    $type = "nonstop"; | 
|  | 917 	} | 
|  | 918         elsif($protein_hgvs eq "p.0?" or $protein_hgvs eq "p.?"){ | 
|  | 919             $type = "unknown"; | 
|  | 920         } | 
|  | 921 	else{ | 
|  | 922 	    $type = "missense"; | 
|  | 923 	} | 
|  | 924 | 
|  | 925 	$sift_score = $cols[0] || "NA"; | 
|  | 926 | 
|  | 927         # Record the need to fetch VCF polyphen and gerp data later (en masse, for efficiency) | 
|  | 928         # Use newBase instead of lookupBase for PolyPhen since the orientation is always relative to the transcript | 
|  | 929 	$variant_key = get_variant_key($fields[$chr_column], $fields[$pos_column], $fields[$ref_column], $newBase); | 
|  | 930         #print STDERR "Variant key is $variant_key\n"; | 
|  | 931     } | 
|  | 932     my $transcript_type = $fields[$ftype_column]; | 
|  | 933     my $transcript_length = $fields[$transcript_length_column]; | 
|  | 934     my $transcript_id = $fields[$transcript_column]; # sift score is derived from this transcript | 
|  | 935     my $transcript_strand = $fields[$strand_column]; | 
|  | 936     my $zygosity = $fields[$zygosity_column]; | 
|  | 937     my $rsID = $fields[$dbid_column]; | 
|  | 938     my $popFreq = $fields[$maf_column]; | 
|  | 939     my $internalFreq; | 
|  | 940     $internalFreq = $fields[$internal_freq_column] if defined $internal_freq_column; | 
|  | 941     my $popFreqSrc = $fields[$maf_src_column]; | 
|  | 942 | 
|  | 943     # Get the gene name and omim info | 
|  | 944     my %seen; | 
|  | 945     my @gene_names = split /; /, $fields[$genename_column]; | 
|  | 946     my $morbid = join("; ", grep {/\S/ and not $seen{$_}++} map({$gene2morbids{$_} or ""} @gene_names)); | 
|  | 947     my $tissue = join("; ", grep {/\S/ and not $seen{$_}++} map({$tissues{$_} or ""} @gene_names)); | 
|  | 948     my $function = join("; ", grep {/\S/ and not $seen{$_}++} map({$tissue_desc{$_} or ""} @gene_names)); | 
|  | 949     my $pathways = join("; ", grep {/\S/ and not $seen{$_}++} map({$pathways{$_} or ""} @gene_names)); | 
|  | 950     my $pathway_ids = join("; ", grep {/\S/ and not $seen{$_}++} map({$pathway_ids{$_} or ""} @gene_names)); | 
|  | 951 | 
|  | 952     # It may be that the calls and freqs are multiple concatenated values, e.g. "X; Y" | 
|  | 953     my @tot_bases = split /; /,$fields[$tot_cnt_column]; | 
|  | 954     my @var_bases = split /; /,$fields[$alt_cnt_column]; | 
|  | 955     my (@pct_nonvar, @pct_var); | 
|  | 956     for (my $i = 0; $i <= $#tot_bases; $i++){ | 
|  | 957         push @pct_nonvar, $tot_bases[$i] =~ /^(NA|0)$/ ? $tot_bases[$i] : int(($tot_bases[$i]-$var_bases[$i])/$tot_bases[$i]*100+0.5); | 
|  | 958         push @pct_var, $tot_bases[$i] =~ /^(NA|0)$/ ? $tot_bases[$i] : int($var_bases[$i]/$tot_bases[$i]*100+0.5); | 
|  | 959     } | 
|  | 960 | 
|  | 961     push @lines, [ | 
|  | 962 		   $fields[$chr_column], # chr | 
|  | 963 		   $fields[$pos_column], # pos | 
|  | 964 		   $fields[$to_column], | 
|  | 965 		   $fields[$ref_column], | 
|  | 966 		   $fields[$alt_column], | 
|  | 967 		   join("; ", @pct_nonvar), # pct ref | 
|  | 968 		   join("; ", @pct_var),  # pct var | 
|  | 969 		   $fields[$alt_cnt_column], # num var | 
|  | 970 		   $fields[$tot_cnt_column], # num reads | 
|  | 971 		   $transcript_type, # protein_coding, pseudogene, etc. | 
|  | 972 		   $type, | 
|  | 973 		   $location, | 
|  | 974 		   $rsID, | 
|  | 975 		   $popFreqSrc, | 
|  | 976 		   (defined $internal_freq_column ? ($popFreq,$internalFreq) : ($popFreq)), | 
|  | 977 		   $sift_score, | 
|  | 978 		   $variant_key, # 16th field | 
|  | 979 		   join("; ", @gene_names), | 
|  | 980 		   $transcript_hgvs, | 
|  | 981 		   $protein_hgvs, | 
|  | 982 		   $zygosity, | 
|  | 983                    $fields[$splice_dist_column], # distance to exon edge | 
|  | 984                    $splicing_potential, | 
|  | 985                    $splicing_details, | 
|  | 986                    $morbid, | 
|  | 987 		   $tissue, | 
|  | 988 		   $pathways, | 
|  | 989 		   $pathway_ids, | 
|  | 990 		   $function, | 
|  | 991                    $domains, | 
|  | 992                    $fields[$pvalue_column], | 
|  | 993                    $fields[$caveats_column], # caveats | 
|  | 994                    (defined $rares_column ? ($fields[$rares_column],$transcript_id) : ($transcript_id)), | 
|  | 995 		   $transcript_length, | 
|  | 996                    $transcript_strand, | 
|  | 997 		   (defined $srcs_column ? ($fields[$phase_column],$fields[$srcs_column]) : ($fields[$phase_column]))]; | 
|  | 998 } | 
|  | 999 | 
|  | 1000 print STDERR "Adding Polyphen2 and GERP info...\n" unless $quiet; | 
|  | 1001 # retrieve the polyphen and gerp info en masse for each chromosome, as this is much more efficient | 
|  | 1002 my $vkey_col = 16+(defined $internal_freq_column ? 1 : 0); | 
|  | 1003 for my $line_fields (@lines){ | 
|  | 1004   # expand the key into the relevant MAF values | 
|  | 1005   $line_fields->[$vkey_col] = join("\t", polyphen_gerp_info($gerp_dir,$polyphen_file,$line_fields->[$vkey_col])); # fields[$vkey_col] is variant key | 
|  | 1006   print OUT join("\t", @{$line_fields}), "\n"; | 
|  | 1007 } | 
|  | 1008 close(OUT); | 
|  | 1009 unlink $tmpfile if defined $tmpfile; | 
|  | 1010 | 
|  | 1011 sub load_header_columns{ | 
|  | 1012   my ($reqs_hash_ref, $headers_array_ref) = @_; | 
|  | 1013   my %unfulfilled; | 
|  | 1014   for my $field_name (keys %$reqs_hash_ref){ | 
|  | 1015     $unfulfilled{$field_name} = 1; | 
|  | 1016   } | 
|  | 1017   for(my $i = 0; $i <= $#{$headers_array_ref}; $i++){ | 
|  | 1018     for my $req_header_name (keys %$reqs_hash_ref){ | 
|  | 1019       if($req_header_name eq $headers_array_ref->[$i]){ | 
|  | 1020         ${$reqs_hash_ref->{$req_header_name}} = $i; | 
|  | 1021         delete $unfulfilled{$req_header_name}; | 
|  | 1022         last; | 
|  | 1023       } | 
|  | 1024     } | 
|  | 1025   } | 
|  | 1026   if(keys %unfulfilled){ | 
|  | 1027     die "Aborting. Could not find headers in the input file for the following required fields: ", join(", ", sort keys %unfulfilled), "\n"; | 
|  | 1028   } | 
|  | 1029 } |