0
+ − 1 #!/usr/bin/env perl
+ − 2
+ − 3 use Bio::DB::Sam;
+ − 4 use DBI;
+ − 5 use IPC::Open2;
+ − 6 use Set::IntervalTree;
+ − 7 use File::Basename;
+ − 8 use strict;
+ − 9 use warnings;
+ − 10 use vars qw($quiet %chr2variant_locs %polyphen_info %chr2polyphen_vcf_lines %chr2gerp_vcf_lines %range2genes $fasta_index $UNAFFECTED $tmpfile);
+ − 11
+ − 12 $UNAFFECTED = -299999; # sentinel for splice site predictions
+ − 13
+ − 14 if(@ARGV == 1 and $ARGV[0] eq "-v"){
+ − 15 print "Version 1.0\n";
+ − 16 exit;
+ − 17 }
+ − 18
+ − 19 # configuration file stuff
+ − 20 my %config;
+ − 21 my $dirname = dirname(__FILE__);
+ − 22 my $tool_dir = shift @ARGV;
+ − 23 if(not -e "$tool_dir/hgvs_annotate.loc"){
+ − 24 system("cp $dirname/tool-data/hgvs_annotate.loc $tool_dir/hgvs_annotate.loc");
+ − 25 }
+ − 26 open CONFIG, '<', "$tool_dir/hgvs_annotate.loc";
+ − 27 while(<CONFIG>){
+ − 28 next if $_ =~ /^#/;
+ − 29 (my $key,my $value) = split(/\s+/, $_);
+ − 30 $config{$key} = $value;
+ − 31 }
+ − 32 my $dbs_dir = $config{"dbs_dir"};
+ − 33 close CONFIG;
+ − 34
+ − 35 $quiet = 0;
+ − 36 if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){
+ − 37 $quiet = 1;
+ − 38 shift @ARGV;
+ − 39 }
+ − 40
+ − 41 @ARGV == 9 or @ARGV == 10 or die "Usage: $0 -q <SIFT dir> <POLYPHEN2 file> <GERP dir> <tissuedb file> <pathway file> <interpro domains.bed> <omim morbidmap> <input hgvs table.txt> <output hgvs table.annotated.txt> [reference.fasta]\n Where if the reference fasta is provided, we predict cryptic splicing effects (using MaxEntScan and GeneSplicer)\n";
+ − 42
+ − 43 my $sift_dir = $dbs_dir . shift @ARGV;
+ − 44 my $polyphen_file = $dbs_dir . shift @ARGV;
+ − 45 my $gerp_dir = $dbs_dir . shift @ARGV;
+ − 46 my $tissue_file = $dbs_dir . shift @ARGV;
+ − 47 my $pathway_file = $dbs_dir . shift @ARGV;
+ − 48 my $interpro_bed_file = shift @ARGV;
+ − 49 my $morbidmap_file = shift @ARGV;
+ − 50 my $hgvs_table_file = shift @ARGV;
+ − 51 my $outfile = shift @ARGV;
+ − 52 my $predict_cryptic_splicings = @ARGV ? shift @ARGV : undef;
+ − 53 my $flanking_bases = 500; # assume this is ROI around a gene
+ − 54 my @lines;
+ − 55
+ − 56 sub polyphen_gerp_info($$$){
+ − 57 my($gerp_dir,$polyphen_file,$info_key) = @_;
+ − 58
+ − 59 my($chr,$pos,$ref,$variant) = split /:/, $info_key;
+ − 60
+ − 61 # is this the first call for this chromosome? If so, retrieve the VCF lines for it en masse
+ − 62 if(not exists $chr2polyphen_vcf_lines{$chr}){
+ − 63 ($chr2polyphen_vcf_lines{$chr}, $chr2gerp_vcf_lines{$chr}) = retrieve_vcf_lines($gerp_dir,$polyphen_file,$chr);
+ − 64 }
+ − 65
+ − 66 my @results;
+ − 67 if(not $ref or not $variant){ # can only get data for SNPs
+ − 68 @results = qw(NA NA NA);
+ − 69 }
+ − 70 #print STDERR "ref = $ref and variant = $variant for $info_key\n";
+ − 71 elsif(length($ref) == 1 and length($variant) == 1){
+ − 72 #print STDERR "Grabbing poly/gerp for $chr,$pos,$ref,$variant\n";
+ − 73 @results = polyphen_info($chr,$pos,$variant);
+ − 74 $results[2] = gerp_info($chr,$pos);
+ − 75 }
+ − 76
+ − 77 # It may be that a novel MNP variant is a bunch of known SNPs in a row. In this case,
+ − 78 # report the list of variants
+ − 79 elsif(length($ref) == length($variant) and $ref ne "NA"){
+ − 80 my (@poly_scores, @poly_calls, @gerp_scores);
+ − 81 for(my $i = 0; $i < length($ref); $i++){
+ − 82 my $refbase = substr($ref, $i, 1);
+ − 83 my $newbase = substr($variant, $i, 1);
+ − 84 if($newbase eq $refbase){
+ − 85 push @poly_scores, "-";
+ − 86 push @poly_calls, "-";
+ − 87 push @gerp_scores, "-";
+ − 88 next;
+ − 89 }
+ − 90 my @base_results = polyphen_info($chr,$pos+$i,$refbase,$newbase);
+ − 91 push @poly_scores, $base_results[0];
+ − 92 push @poly_calls, $base_results[1];
+ − 93 $base_results[2] = gerp_info($chr,$pos+$i);
+ − 94 push @gerp_scores, $base_results[2];
+ − 95 }
+ − 96 $results[0] = join(",", @poly_scores);
+ − 97 $results[1] = join(",", @poly_calls);
+ − 98 $results[2] = join(",", @gerp_scores);
+ − 99 }
+ − 100 else{
+ − 101 @results = qw(NA NA NA);
+ − 102 }
+ − 103
+ − 104 return @results;
+ − 105 }
+ − 106
+ − 107 sub get_variant_key($$$$){
+ − 108 # params chr pos ref variant
+ − 109 my $c = $_[0];
+ − 110 $c =~ s/^chr//;
+ − 111 my $prop_info_key = join(":", $c, @_[1..3]);
+ − 112 $chr2variant_locs{$c} = [] unless exists $chr2variant_locs{$c};
+ − 113 push @{$chr2variant_locs{$c}}, $_[1];
+ − 114 return $prop_info_key;
+ − 115 }
+ − 116
+ − 117 sub retrieve_vcf_lines($$$){
+ − 118 my ($gerp_dir, $polyphen_file, $chr) = @_;
+ − 119
+ − 120 my (%polyphen_lines, %gerp_lines);
+ − 121
+ − 122 if(not exists $chr2variant_locs{$chr}){
+ − 123 return ({}, {}); # no data requested for this chromosome
+ − 124 }
+ − 125
+ − 126 # build up the request
+ − 127 my @tabix_regions;
+ − 128 my @var_locs = grep /^\d+$/, @{$chr2variant_locs{$chr}}; # grep keeps point mutations only
+ − 129 if(not @var_locs){
+ − 130 return ({}, {}); # no point mutations
+ − 131 }
+ − 132
+ − 133 # sort by variant start location
+ − 134 for my $var_loc (sort {$a <=> $b} @var_locs){
+ − 135 push @tabix_regions, $chr.":".$var_loc."-".$var_loc;
+ − 136 }
+ − 137 my $regions = join(" ", @tabix_regions);
+ − 138
+ − 139 # retrieve the data
+ − 140 die "Cannot find Polyphen2 VCF file $polyphen_file\n" if not -e $polyphen_file;
+ − 141 open(VCF, "tabix $polyphen_file $regions |")
+ − 142 or die "Cannot run tabix on $polyphen_file: $!\n";
+ − 143 while(<VCF>){
+ − 144 if(/^\S+\t(\d+)/ and grep {$_ eq $1} @var_locs){ # take only main columns to save room, if possible
+ − 145 chomp;
+ − 146 $polyphen_lines{$1} = [] unless exists $polyphen_lines{$1};
+ − 147 push @{$polyphen_lines{$1}}, $_;
+ − 148 }
+ − 149 }
+ − 150 close(VCF);
+ − 151
+ − 152 my $vcf_file = "$gerp_dir/chr$chr.tab.gz";
+ − 153 if(not -e $vcf_file){
+ − 154 warn "Cannot find GERP VCF file $vcf_file\n" if not $quiet;
+ − 155 return ({}, {});
+ − 156 }
+ − 157 open(VCF, "tabix $vcf_file $regions |")
+ − 158 or die "Cannot run tabix on $vcf_file: $!\n";
+ − 159 while(<VCF>){
+ − 160 if(/^(\S+\t(\d+)(?:\t\S+){2})/ and grep {$_ eq $2} @var_locs){ # take only main columns to save room, if possible
+ − 161 $gerp_lines{$2} = [] unless exists $gerp_lines{$2};
+ − 162 push @{$gerp_lines{$2}}, $1;
+ − 163 }
+ − 164 }
+ − 165 close(VCF);
+ − 166
+ − 167 return (\%polyphen_lines, \%gerp_lines);
+ − 168 }
+ − 169
+ − 170 sub polyphen_info($$$){
+ − 171 my ($chr, $pos, $variant_base) = @_;
+ − 172
+ − 173 my $key = "$chr:$pos:$variant_base";
+ − 174 if(exists $polyphen_info{$key}){
+ − 175 return @{$polyphen_info{$key}};
+ − 176 }
+ − 177
+ − 178 #print STDERR "Checking for polyphen data for $chr, $pos, $variant_base\n";
+ − 179 if(exists $chr2polyphen_vcf_lines{$chr}->{$pos}){
+ − 180 for(@{$chr2polyphen_vcf_lines{$chr}->{$pos}}){
+ − 181 #print STDERR "Checking $chr, $pos, $variant_base against $_";
+ − 182 my @fields = split /\t/, $_;
+ − 183 if($fields[4] eq $variant_base){
+ − 184 if($fields[6] eq "D"){
+ − 185 $polyphen_info{$key} = [$fields[5],"deleterious"];
+ − 186 last;
+ − 187 }
+ − 188 elsif($fields[6] eq "P"){
+ − 189 $polyphen_info{$key} = [$fields[5],"poss. del."];
+ − 190 last;
+ − 191 }
+ − 192 elsif($fields[6] eq "B"){
+ − 193 $polyphen_info{$key} = [$fields[5],"benign"];
+ − 194 last;
+ − 195 }
+ − 196 else{
+ − 197 $polyphen_info{$key} = [$fields[5],$fields[6]];
+ − 198 last;
+ − 199 }
+ − 200 }
+ − 201 }
+ − 202 if(not exists $polyphen_info{$key}){
+ − 203 $polyphen_info{$key} = ["NA", "NA"];
+ − 204 }
+ − 205 }
+ − 206 else{
+ − 207 $polyphen_info{$key} = ["NA", "NA"];
+ − 208 }
+ − 209 return @{$polyphen_info{$key}};
+ − 210 }
+ − 211
+ − 212 sub gerp_info($$){
+ − 213 my ($chr,$pos) = @_;
+ − 214
+ − 215 if(exists $chr2gerp_vcf_lines{$chr}->{$pos}){
+ − 216 for(@{$chr2gerp_vcf_lines{$chr}->{$pos}}){
+ − 217 my @fields = split /\t/, $_;
+ − 218 return $fields[3];
+ − 219 }
+ − 220 }
+ − 221 return "NA";
+ − 222 }
+ − 223
+ − 224 sub predict_splicing{
+ − 225 my ($chr, $pos, $ref, $var, $strand, $exon_edge_distance, $splice_type) = @_;
+ − 226
+ − 227 my $disruption_level = 0;
+ − 228 my @disruption_details;
+ − 229 # The methods being used only look for up to a 30 base context, so only
+ − 230 # need to check for obliterated acceptors/donors if they are nearby
+ − 231 $exon_edge_distance-- if $exon_edge_distance > 0; # +1 is actually in the right spot, so decrement
+ − 232 my $edge_offset = $strand eq "-" ? -1*$exon_edge_distance : $exon_edge_distance; # flip offset sign if the call was on the negative strand (since +/- was relative to the transcript, not the genome coordinates)
+ − 233 my $annotated_detected = 0;
+ − 234 my ($genesplicer_orig, $genesplicer_orig_pos) = call_genesplicer($splice_type, $chr, $strand, $pos-$edge_offset, $ref);
+ − 235 if($genesplicer_orig != 0){
+ − 236 $annotated_detected = 1;
+ − 237 }
+ − 238 my ($maxentscan_orig, $maxentscan_orig_pos) = call_maxentscan($splice_type, $chr, $strand, $pos-$edge_offset, $ref);
+ − 239 if(not $annotated_detected and $maxentscan_orig){
+ − 240 $annotated_detected = 1;
+ − 241 }
+ − 242 my ($genesplicer_mod, $genesplicer_mod_pos) = call_genesplicer($splice_type, $chr, $strand, $pos-$edge_offset, $ref, $var, $exon_edge_distance);
+ − 243 if($genesplicer_mod != $UNAFFECTED){ # was the mod within the range where this software could detect up or downstream effects on the annotated splice site?
+ − 244 if($genesplicer_orig_pos != $genesplicer_mod_pos){
+ − 245 $disruption_level+=0.75;
+ − 246 if($genesplicer_mod_pos == -1){
+ − 247 push @disruption_details, "Variant obliterates GeneSplicer predicted splice site from the reference sequence ($chr:$genesplicer_orig_pos, orig score $genesplicer_orig)";
+ − 248 }
+ − 249 elsif($genesplicer_orig_pos != -1){
+ − 250 push @disruption_details, "Variant moves GeneSplicer preferred splice site from annotated location $chr:$genesplicer_orig_pos to $chr:$genesplicer_mod_pos (scores: orig=$genesplicer_orig, new=$genesplicer_mod)";
+ − 251 }
+ − 252 }
+ − 253 elsif($genesplicer_orig-$genesplicer_mod > 1){
+ − 254 $disruption_level+=0.5;
+ − 255 push @disruption_details, "Variant significantly lowers GeneSplicer score for original $splice_type splice site located at $chr:$genesplicer_mod_pos ($genesplicer_mod vs. $genesplicer_orig)";
+ − 256 }
+ − 257 elsif($genesplicer_orig > $genesplicer_mod){
+ − 258 $disruption_level+=0.25;
+ − 259 push @disruption_details, "Variant slightly lowers GeneSplicer score for original $splice_type splice site located at $chr:$genesplicer_mod_pos ($genesplicer_mod vs. $genesplicer_orig)";
+ − 260 }
+ − 261 $genesplicer_orig = $genesplicer_mod; # so that splice site comparison vs. cryptic is accurate
+ − 262 }
+ − 263 my ($maxentscan_mod, $maxentscan_mod_pos) = call_maxentscan($splice_type, $chr, $strand, $pos-$edge_offset, $ref, $var, $exon_edge_distance);
+ − 264 if($maxentscan_mod != $UNAFFECTED){
+ − 265 if($maxentscan_mod_pos != $maxentscan_orig_pos){
+ − 266 $disruption_level+=1;
+ − 267 if($maxentscan_mod_pos == -1){
+ − 268 push @disruption_details, "Variant obliterates MaxEntScan predicted splice site from the reference sequence ($chr:$maxentscan_orig_pos, orig score $maxentscan_orig)";
+ − 269 }
+ − 270 elsif($maxentscan_orig_pos != -1){
+ − 271 push @disruption_details, "Variant moves MaxEntScan preferred splice site from annotated location $chr:$maxentscan_orig_pos to $chr:$maxentscan_mod_pos (scores: orig=$maxentscan_orig, new=$maxentscan_mod)";
+ − 272 }
+ − 273 }
+ − 274 elsif($maxentscan_orig-$maxentscan_mod > 1){
+ − 275 $disruption_level+=0.5;
+ − 276 push @disruption_details, "Variant significantly lowers MaxEntScan score for original $splice_type splice site located at $chr:$maxentscan_mod_pos ($maxentscan_mod vs. $maxentscan_orig)";
+ − 277 }
+ − 278 elsif($maxentscan_orig>$maxentscan_mod){
+ − 279 $disruption_level+=0.25;
+ − 280 push @disruption_details, "Variant slightly lowers MaxEntScan score for original $splice_type splice site located at $chr:$maxentscan_mod_pos ($maxentscan_mod vs. $maxentscan_orig)";
+ − 281 }
+ − 282 $maxentscan_orig = $maxentscan_mod;
+ − 283 }
+ − 284
+ − 285 # Regardless of location, see if there is an increased chance of creating a cryptic splicing site
+ − 286 my ($genesplicer_control, $genesplicer_control_pos) = call_genesplicer($splice_type, $chr, $strand, $pos, $ref);
+ − 287 my ($genesplicer_cryptic, $genesplicer_cryptic_pos) = call_genesplicer($splice_type, $chr, $strand, $pos, $ref, $var);
+ − 288 if(defined $genesplicer_orig and $genesplicer_control < $genesplicer_orig and $genesplicer_orig < $genesplicer_cryptic){
+ − 289 $disruption_level+=0.75;
+ − 290 push @disruption_details, "Variant causes GeneSplicer score for a potential cryptic $splice_type splice site at $chr:$genesplicer_cryptic_pos to become higher than the annotated splice site $chr:$genesplicer_orig_pos (orig=$genesplicer_orig vs. var=$genesplicer_cryptic)";
+ − 291 }
+ − 292 if($genesplicer_control - $genesplicer_cryptic < -1){
+ − 293 $disruption_level+=0.5;
+ − 294 push @disruption_details, "Variant raises GeneSplicer score for a potential cryptic $splice_type splice site (orig=$genesplicer_control vs. var=$genesplicer_cryptic)";
+ − 295 }
+ − 296 my ($maxentscan_control, $maxentscan_control_pos) = call_maxentscan($splice_type, $chr, $strand, $pos, $ref);
+ − 297 my ($maxentscan_cryptic, $maxentscan_cryptic_pos) = call_maxentscan($splice_type, $chr, $strand, $pos, $ref, $var);
+ − 298 if(defined $maxentscan_orig and $maxentscan_control < $maxentscan_orig and $maxentscan_orig < $maxentscan_cryptic){
+ − 299 $disruption_level++;
+ − 300 push @disruption_details, "Variant causes MaxEntScan score for a potential cryptic $splice_type splice site at $chr:$maxentscan_cryptic_pos to become higher than the annotated splice site $chr:$maxentscan_orig_pos (orig=$maxentscan_orig vs. var=$maxentscan_cryptic)";
+ − 301 }
+ − 302 if($maxentscan_control - $maxentscan_cryptic < -1 and $maxentscan_cryptic > 0){
+ − 303 $disruption_level+=0.5;
+ − 304 push @disruption_details, "Variant raises MaxEntScan score for a potential cryptic $splice_type splice site (orig=$maxentscan_control vs. var=$maxentscan_cryptic)";
+ − 305 }
+ − 306
+ − 307 # If neither method predicted the annotated donor site, and the alternate site isn't predicted either, note that so the user doesn't think the mod is definite okay
+ − 308 if(not @disruption_details and $maxentscan_orig < 0 and $genesplicer_orig < 0){
+ − 309 $disruption_level+=0.5;
+ − 310 push @disruption_details, "Neither the annotated splice site nor any potential cryptic site introduced by the variant are predicted by either GeneSplicer or MaxEntScan, therefore the alt splicing potential for this variant should be evaluated manually";
+ − 311 }
+ − 312
+ − 313 return ($disruption_level, join("; ", @disruption_details));
+ − 314 }
+ − 315
+ − 316 sub call_genesplicer{
+ − 317 my($splice_type, $chr, $strand, $pos, $ref, $var, $var_offset) = @_;
+ − 318
+ − 319 if($splice_type eq "acceptor"){
+ − 320 $pos += $strand eq "-" ? 2: -2;
+ − 321 }
+ − 322 my $num_flanking = 110; # seems to croak if less than 160bp provided
+ − 323 my $indel_size = defined $var ? length($var) - length($ref) : 0; # - is del, + is ins
+ − 324 my $site_window = defined $var_offset ? 2 : ($indel_size > 0 ? 29+$indel_size : 29); # exact position if looking for exisitng site effect, otherwise 10 or 10 + the insert size
+ − 325 my $cmd = "genesplicer";
+ − 326 my $start = $pos - $num_flanking;
+ − 327 $start = 1 if $start < 1;
+ − 328 my $stop = $pos + $num_flanking + ($indel_size < 0 ? -1*$indel_size:0); # add extra bases if a del so we still have 2*$num_flanking after the edit
+ − 329 $stop++; # end coordinate is exclusive
+ − 330 return ($UNAFFECTED, -1) if defined $var_offset and ($pos+$var_offset < $start or $pos+$var_offset > $stop); # variant is outside of range we can detect affect on original splice site
+ − 331 my $seq = $fasta_index->fetch("$chr:$start-$stop");
+ − 332 if(defined $var){
+ − 333 if(defined $var_offset){ # substitution away from the site of interest
+ − 334 substr($seq, $num_flanking+$var_offset, length($ref)) = $var;
+ − 335 }
+ − 336 else{ # substitution at the site of interest
+ − 337 substr($seq, $num_flanking, length($ref)) = $var;
+ − 338 }
+ − 339 }
+ − 340 if($strand eq "-"){
+ − 341 $seq = reverse($seq);
+ − 342 $seq =~ tr/ACGTacgt/TGCAtgca/;
+ − 343 }
+ − 344 $tmpfile = "$$.fasta";
+ − 345 open(TMP, ">$tmpfile") or die "Could not open $tmpfile for writing: $!\n";
+ − 346 print TMP ">$chr $strand $pos $ref",($var?" $var":""), ($var_offset?" $var_offset":""),"\n$seq\n";
+ − 347 substr($seq, $num_flanking, 2) = lc(substr($seq, $num_flanking, 2));
+ − 348 #print STDERR "$chr $strand $pos $ref $var $var_offset: $seq\n";
+ − 349 close(TMP);
+ − 350 #print STDERR "$cmd $tmpfile /export/achri_data/programs/GeneSplicer/human\n";
+ − 351 #<STDIN>;
+ − 352 open(GS, "$cmd $tmpfile human 2> /dev/null|")
+ − 353 or die "Could not run $cmd: $!\n";
+ − 354 my $highest_score = 0;
+ − 355 my $highest_position = -1;
+ − 356 while(<GS>){
+ − 357 # End5 End3 Score "confidence" splice_site_type
+ − 358 # E.g. :
+ − 359 # 202 203 2.994010 Medium donor
+ − 360 chomp;
+ − 361 my @F = split /\s+/, $_;
+ − 362 next if $F[1] < $F[0]; # wrong strand
+ − 363 if($splice_type eq $F[4]){
+ − 364 if(abs($F[0]-$num_flanking-1) <= $site_window){ # right type and approximate location
+ − 365 #print STDERR "*$_\n";
+ − 366 if($F[2] > $highest_score){
+ − 367 $highest_score = $F[2];
+ − 368 # last bit accounts for indels in the position offset
+ − 369 $highest_position = $pos + ($strand eq "-" ? -1: 1)*(-$num_flanking + $F[0] - 1) + (($F[0] <= $num_flanking && $strand eq "+" or $F[0] >= $num_flanking && $strand eq "-") ? -$indel_size : 0);
+ − 370 }
+ − 371 }
+ − 372 }
+ − 373 #else{print STDERR $_,"\n";}
+ − 374 }
+ − 375 close(GS);
+ − 376 return ($highest_score, $highest_position);
+ − 377 }
+ − 378
+ − 379 sub call_maxentscan{
+ − 380 my($splice_type, $chr, $strand, $pos, $ref, $var, $var_offset) = @_;
+ − 381
+ − 382 my $indel_size = defined $var ? length($var) - length($ref) : 0; # - is del, + is ins
+ − 383 my $cmd;
+ − 384 my ($num_flanking5, $num_flanking3);
+ − 385 my ($region5, $region3);
+ − 386 my ($start, $stop);
+ − 387 my $highest_score = -9999;
+ − 388 my $highest_position = -1;
+ − 389 # note that donor and acceptor are very different beasts for MaxEntScan
+ − 390 if($splice_type eq "acceptor"){
+ − 391 if($strand eq "-"){
+ − 392 $pos += 2;
+ − 393 }
+ − 394 else{
+ − 395 $pos -= 2;
+ − 396 }
+ − 397 $cmd = "score3.pl";
+ − 398 $num_flanking5 = 18;
+ − 399 $num_flanking3 = 4;
+ − 400 }
+ − 401 else{ # donor
+ − 402 $cmd = "score5.pl";
+ − 403 $num_flanking5 = 3;
+ − 404 $num_flanking3 = 5;
+ − 405 }
+ − 406 # when var_offset is defined, we are looking only at the original splice site so no need to look in the general region
+ − 407 $region5 = defined $var_offset ? 0 : $num_flanking5; # when var_offset is defined, we are looking only at the original splice site so no need to look in the general region
+ − 408 $region3 = defined $var_offset ? 0 : $num_flanking3;
+ − 409 if($strand eq "-"){
+ − 410 $start = $pos - $num_flanking3 - $region3;
+ − 411 $stop = $pos + $num_flanking5 + $region5 + ($indel_size < 0 ? -1*$indel_size:0);
+ − 412 #print STDERR "Fetching $chr:$start-$stop ($pos - $num_flanking3 - $region3)\n";
+ − 413 }
+ − 414 else{
+ − 415 $start = $pos - $num_flanking5 - $region5;
+ − 416 $stop = $pos + $num_flanking3 + $region3 + ($indel_size < 0 ? -1*$indel_size:0); # add extra bases if a del so we still have 23 or 9 bases as the case may be
+ − 417 #print STDERR "Fetching $chr:$start-$stop ($pos - $num_flanking5 - $region5)\n";
+ − 418 }
+ − 419 if(defined $var_offset and (
+ − 420 $strand eq "+" and ($pos+$var_offset < $start or $pos+$var_offset > $stop) or
+ − 421 $strand eq "-" and ($pos-$var_offset < $start or $pos-$var_offset > $stop))){ # variant is outside of range we can detect affect on original splice site
+ − 422 return ($UNAFFECTED, -1)
+ − 423 }
+ − 424
+ − 425 my $seq = $fasta_index->fetch("$chr:$start-$stop");
+ − 426 my @subseqs;
+ − 427 if(defined $var){
+ − 428 if(defined $var_offset){ # substitution away from the site of interest
+ − 429 #print STDERR "Subbing in string of length ", length($seq), "near splice site $ref -> $var at ", ($strand eq "-" ? "$region3+$num_flanking3-$var_offset":"$region5+$num_flanking5+$var_offset"), "\n";
+ − 430 substr($seq, ($strand eq "-" ? $region3+$num_flanking3-$var_offset:$region5+$num_flanking5+$var_offset), length($ref)) = $var;
+ − 431 }
+ − 432 else{ # substitution at the site of interest
+ − 433 substr($seq, ($strand eq "-" ? $region3+$num_flanking3:$region5+$num_flanking5), length($ref)) = $var;
+ − 434 }
+ − 435 }
+ − 436 if($strand eq "-"){
+ − 437 $seq = reverse($seq);
+ − 438 $seq =~ tr/ACGTacgt/TGCAtgca/;
+ − 439 }
+ − 440 # get all of the 23 or 9 base pair subsequences, depending on whether it's donors or acceptors we are looking for
+ − 441 for(my $i = 0; $i+$num_flanking5+$num_flanking3+1 <= length($seq); $i++){
+ − 442 push @subseqs, substr($seq, $i, $num_flanking5+$num_flanking3+1);
+ − 443 }
+ − 444 my $pid = open2(\*CHILD_OUT, \*CHILD_IN, "$cmd -");
+ − 445 $pid or die "Could not run $cmd: $!\n";
+ − 446 print CHILD_IN join("\n",@subseqs);
+ − 447 close(CHILD_IN);
+ − 448 while(<CHILD_OUT>){
+ − 449 chomp;
+ − 450 # out is just "seq[tab]score"
+ − 451 my @F = split /\s+/, $_;
+ − 452 if($F[1] > $highest_score){
+ − 453 #print STDERR ">";
+ − 454 $highest_score = $F[1];
+ − 455 # since we printed the subseqs in order, $. (containing the line number of the prediction output) gives us the offset in the original seq
+ − 456 $highest_position = $start + ($strand eq "-" ? length($seq) - $num_flanking5 - $. : $num_flanking5 + $. -1)+(($. <= $num_flanking5+$region5 && $strand eq "+" or $. >= $num_flanking5+$region5 && $strand eq "-") ? -$indel_size : 0);
+ − 457 }
+ − 458 #print STDERR $_,"\n";
+ − 459 }
+ − 460 close(CHILD_OUT);
+ − 461 waitpid($pid, 0);
+ − 462
+ − 463 return ($highest_score, $highest_position);
+ − 464 }
+ − 465
+ − 466 sub get_range_info($$$$){
+ − 467 my ($chr2ranges, $chr, $start, $stop) = @_;
+ − 468
+ − 469 if(not exists $chr2ranges->{$chr}){
+ − 470 return {};
+ − 471 }
+ − 472 my $range_key = "$chr:$start:$stop";
+ − 473
+ − 474 # Use a binary search to find the leftmost range of interest
+ − 475 my $left_index = 0;
+ − 476 my $right_index = $#{$chr2ranges->{$chr}};
+ − 477 my $cursor = int($right_index/2);
+ − 478 while($left_index != $right_index and $left_index != $cursor and $right_index != $cursor){
+ − 479 # need to go left
+ − 480 if($chr2ranges->{$chr}->[$cursor]->[0] >= $start){
+ − 481 $right_index = $cursor;
+ − 482 $cursor = int(($left_index+$cursor)/2);
+ − 483 }
+ − 484 # need to go to the right
+ − 485 elsif($chr2ranges->{$chr}->[$cursor]->[1] <= $stop){
+ − 486 $left_index = $cursor;
+ − 487 $cursor = int(($right_index+$cursor)/2);
+ − 488 }
+ − 489 else{
+ − 490 $right_index = $cursor;
+ − 491 $cursor--; # in the range, go left until not in the range any more
+ − 492 }
+ − 493 }
+ − 494
+ − 495 my %names;
+ − 496 for (; $cursor <= $#{$chr2ranges->{$chr}}; $cursor++){
+ − 497 my $range_ref = $chr2ranges->{$chr}->[$cursor];
+ − 498 if($range_ref->[0] > $stop){
+ − 499 last;
+ − 500 }
+ − 501 elsif($range_ref->[0] <= $stop and $range_ref->[1] >= $start){
+ − 502 $names{$range_ref->[2]}++;
+ − 503 }
+ − 504 }
+ − 505
+ − 506 #print STDERR "Names for range $chr:$start-$stop are ", join("/", keys %names), "\n";
+ − 507 return \%names;
+ − 508 }
+ − 509
+ − 510 if(not -d $sift_dir){
+ − 511 die "The specified SIFT directory $sift_dir does not exist\n";
+ − 512 }
+ − 513
+ − 514 print STDERR "Reading in OMIM morbid map...\n" unless $quiet;
+ − 515 die "Data file $morbidmap_file does not exist, aborting.\n" if not -e $morbidmap_file;
+ − 516 my %gene2morbids;
+ − 517 open(TAB, $morbidmap_file)
+ − 518 or die "Cannot open OMIM morbid map file $morbidmap_file for reading: $!\n";
+ − 519 while(<TAB>){
+ − 520 my @fields = split /\|/, $_;
+ − 521 #print STDERR "got ", ($#fields+1), " fields in $_\n";
+ − 522 #my @fields = split /\|/, $_;
+ − 523 my $morbid = $fields[0];
+ − 524 $morbid =~ s/\s*\(\d+\)$//; # get rid of trailing parenthetical number
+ − 525 my $omim_id = $fields[2];
+ − 526 for my $genename (split /, /, $fields[1]){
+ − 527 if(not exists $gene2morbids{$genename}){
+ − 528 $gene2morbids{$genename} = "OMIM:$omim_id $morbid";
+ − 529 }
+ − 530 else{
+ − 531 $gene2morbids{$genename} .= "; $morbid";
+ − 532 }
+ − 533 }
+ − 534 }
+ − 535 close(TAB);
+ − 536
+ − 537 print STDERR "Reading in interpro mappings...\n" unless $quiet;
+ − 538 die "Data file $interpro_bed_file does not exist, aborting.\n" if not -e $interpro_bed_file;
+ − 539 my %interpro_ids;
+ − 540 open(TAB, $interpro_bed_file)
+ − 541 or die "Cannot open gene name BED file $interpro_bed_file for reading: $!\n";
+ − 542 while(<TAB>){
+ − 543 chomp;
+ − 544 # format should be "chr start stop interpro desc..."
+ − 545 my @fields = split /\t/, $_;
+ − 546 my $c = $fields[0];
+ − 547 if(not exists $interpro_ids{$c}){
+ − 548 $interpro_ids{$c} = [] unless exists $interpro_ids{$c};
+ − 549 $interpro_ids{"chr".$c} = [] unless $c =~ /^chr/;
+ − 550 $interpro_ids{$1} = [] if $c =~ /^chr(\S+)/;
+ − 551 }
+ − 552 my $dataref = [@fields[1..3]];
+ − 553 push @{$interpro_ids{$c}}, $dataref;
+ − 554 push @{$interpro_ids{"chr".$c}}, $dataref unless $c =~ /^chr/;
+ − 555 push @{$interpro_ids{$1}}, $dataref if $c =~ /^chr(\S+)/;
+ − 556 }
+ − 557 close(TAB);
+ − 558 print STDERR "Sorting interpro matches per contig...\n" unless $quiet;
+ − 559 for my $chr (keys %interpro_ids){
+ − 560 $interpro_ids{$chr} = [sort {$a->[0] <=> $b->[0]} @{$interpro_ids{$chr}}];
+ − 561 }
+ − 562
+ − 563 # {uc(gene_name) => space separated info}
+ − 564 print STDERR "Reading in tissue info...\n" unless $quiet;
+ − 565 my %tissues;
+ − 566 my %tissue_desc;
+ − 567 open(TISSUE, $tissue_file)
+ − 568 or die "Cannot open $tissue_file for reading: $!\n";
+ − 569 while(<TISSUE>){
+ − 570 chomp;
+ − 571 my @fields = split /\t/, $_;
+ − 572 my $gname = uc($fields[1]);
+ − 573 $tissue_desc{$gname} = $fields[2];
+ − 574 next unless $#fields == 4;
+ − 575 my @tis = split /;\s*/, $fields[4];
+ − 576 pop @tis;
+ − 577 if(@tis < 6){
+ − 578 $tissues{$gname} = join(">", @tis);
+ − 579 }
+ − 580 elsif(@tis < 24){
+ − 581 $tissues{$gname} = join(">", @tis[0..5]).">...";
+ − 582 }
+ − 583 else{
+ − 584 $tissues{$gname} = join(">", @tis[0..(int($#tis/5))]).">...";
+ − 585 }
+ − 586 }
+ − 587 close(TISSUE);
+ − 588
+ − 589 print STDERR "Loading pathway info...\n" unless $quiet;
+ − 590 my %pathways;
+ − 591 my %pathway_ids;
+ − 592 open(PATHWAY, $pathway_file)
+ − 593 or die "Cannot open $pathway_file for reading: $!\n";
+ − 594 while(<PATHWAY>){
+ − 595 chomp;
+ − 596 my @fields = split /\t/, $_;
+ − 597 next unless @fields >= 4;
+ − 598 for my $gname (split /\s+/, $fields[1]){
+ − 599 $gname = uc($gname);
+ − 600 if(not exists $pathways{$gname}){
+ − 601 $pathway_ids{$gname} = $fields[2];
+ − 602 $pathways{$gname} = $fields[3];
+ − 603 }
+ − 604 else{
+ − 605 $pathway_ids{$gname} .= "; ".$fields[2];
+ − 606 $pathways{$gname} .= "; ".$fields[3];
+ − 607 }
+ − 608 }
+ − 609 }
+ − 610 close(PATHWAY);
+ − 611
+ − 612 print STDERR "Loading SIFT indices...\n" unless $quiet;
+ − 613 # Read in the database table list
+ − 614 my %chr_tables;
+ − 615 open(DBLIST, "$sift_dir/bins.list")
+ − 616 or die "Cannot open $sift_dir/bins.list for reading: $!\n";
+ − 617 while(<DBLIST>){
+ − 618 chomp;
+ − 619 my @fields = split /\t/, $_;
+ − 620 if(not exists $chr_tables{$fields[1]}){
+ − 621 $chr_tables{$fields[1]} = {};
+ − 622 }
+ − 623 my $chr = $fields[1];
+ − 624 $chr = "chr$chr" unless $chr =~ /^chr/;
+ − 625 $chr_tables{$chr}->{"$chr\_$fields[2]_$fields[3]"} = [$fields[2],$fields[3]];
+ − 626 }
+ − 627 close(DBLIST);
+ − 628
+ − 629 #Connect to database
+ − 630 my $db_gene = DBI->connect("dbi:SQLite:dbname=$sift_dir/Human_Supp.sqlite","","",{RaiseError=>1, AutoCommit=>1});
+ − 631
+ − 632 my $db_chr;
+ − 633 my $fr_stmt; # retrieval for frameshift
+ − 634 my $snp_stmt; # retrieval for SNP
+ − 635 my $cur_chr = "";
+ − 636 my $cur_max_pos;
+ − 637
+ − 638 if(defined $predict_cryptic_splicings){
+ − 639 if(not -e $predict_cryptic_splicings){
+ − 640 die "Reference FastA file for splicing predictions, $predict_cryptic_splicings, does not exist.\n";
+ − 641 }
+ − 642 if(not -e $predict_cryptic_splicings.".fai" and not -w dirname($predict_cryptic_splicings)){
+ − 643 die "Reference FastA file for splicing predictions, $predict_cryptic_splicings, is not indexed, and the directory is not writable.\n";
+ − 644 }
+ − 645 print STDERR "Loading reference FastA index...\n" unless $quiet;
+ − 646 $fasta_index = Bio::DB::Sam::Fai->load($predict_cryptic_splicings);
+ − 647 }
+ − 648
+ − 649 print STDERR "Annotating variants...\n" unless $quiet;
+ − 650 # Assume contigs and positions in order
+ − 651 print STDERR "Processing file $hgvs_table_file...\n" unless $quiet;
+ − 652 open(HGVS, $hgvs_table_file)
+ − 653 or die "Cannot open HGVS file $hgvs_table_file for reading: $!\n";
+ − 654 # Output file for annotated snps and frameshifts
+ − 655 my $header = <HGVS>; # header
+ − 656 chomp $header;
+ − 657 my @headers = split /\t/, $header;
+ − 658 my ($chr_column, $pos_column, $ref_column, $alt_column, $alt_cnt_column, $tot_cnt_column, $ftype_column, $dbid_column, $maf_src_column, $maf_column, $internal_freq_column,
+ − 659 $cdna_hgvs_column, $aa_hgvs_column, $transcript_column, $transcript_length_column, $strand_column, $zygosity_column, $splice_dist_column, $caveats_column, $phase_column,
+ − 660 $srcs_column, $pvalue_column, $to_column, $genename_column, $rares_column, $rares_header);
+ − 661 my %req_columns = (
+ − 662 "Chr" => \$chr_column,
+ − 663 "DNA From" => \$pos_column,
+ − 664 "DNA To" => \$to_column,
+ − 665 "Gene Name" => \$genename_column,
+ − 666 "Ref base" => \$ref_column,
+ − 667 "Obs base" => \$alt_column,
+ − 668 "Variant Reads" => \$alt_cnt_column,
+ − 669 "Total Reads" => \$tot_cnt_column,
+ − 670 "Feature type" => \$ftype_column,
+ − 671 "Variant DB ID" => \$dbid_column,
+ − 672 "Pop. freq. source" => \$maf_src_column,
+ − 673 "Pop. freq." => \$maf_column,
+ − 674 "Transcript HGVS" => \$cdna_hgvs_column,
+ − 675 "Protein HGVS" => \$aa_hgvs_column,
+ − 676 "Selected transcript" => \$transcript_column,
+ − 677 "Transcript length" => \$transcript_length_column,
+ − 678 "Strand" => \$strand_column,
+ − 679 "Zygosity" => \$zygosity_column,
+ − 680 "Closest exon junction (AA coding variants)" => \$splice_dist_column,
+ − 681 "Caveats" => \$caveats_column,
+ − 682 "Phase" => \$phase_column,
+ − 683 "P-value" => \$pvalue_column);
+ − 684 &load_header_columns(\%req_columns, \@headers);
+ − 685 for(my $i = 0; $i <= $#headers; $i++){
+ − 686 # Optional columns go here
+ − 687 if($headers[$i] eq "Sources"){
+ − 688 $srcs_column = $i;
+ − 689 }
+ − 690 elsif($headers[$i] eq "Internal pop. freq."){
+ − 691 $internal_freq_column = $i;
+ − 692 }
+ − 693 elsif($headers[$i] =~ /^Num rare variants/){
+ − 694 $rares_column = $i;
+ − 695 $rares_header = $headers[$i];
+ − 696 }
+ − 697 }
+ − 698 open(OUT, ">$outfile")
+ − 699 or die "Cannot open $outfile for writing: $!\n";
+ − 700 print OUT join("\t", "Chr", "DNA From", "DNA To", "Ref base", "Obs base", "% ref read","% variant read",
+ − 701 "Variant Reads", "Total Reads", "Feature type","Variant type", "Variant context", "Variant DB ID", "Pop. freq. source",
+ − 702 "Pop. freq."), (defined $internal_freq_column ? "\tInternal pop. freq.\t" : "\t"), join("\t", "SIFT Score", "Polyphen2 Score", "Polyphen2 call", "GERP score",
+ − 703 "Gene Name", "Transcript HGVS",
+ − 704 "Protein HGVS", "Zygosity","Closest exon junction (AA coding variants)","Splicing alteration potential","Splicing alteration details",
+ − 705 "OMIM Morbid Map","Tissue expression", "Pathways", "Pathway refs",
+ − 706 "Gene Function", "Spanning InterPro Domains", "P-value", "Caveats", "Phase",
+ − 707 "Selected transcript", "Transcript length", "Strand"),
+ − 708 (defined $rares_column ? "\t$rares_header" : ""),
+ − 709 (defined $srcs_column ? "\tSources" : ""), "\n";
+ − 710 while(<HGVS>){
+ − 711 chomp;
+ − 712 my @fields = split /\t/, $_, -1;
+ − 713 my $mode;
+ − 714 my $protein_hgvs = $fields[$aa_hgvs_column];
+ − 715 my $refSeq = $fields[$ref_column];
+ − 716 my $newBase = $fields[$alt_column];
+ − 717 my $splicing_potential = "NA";
+ − 718 my $splicing_details = "NA";
+ − 719 if(defined $predict_cryptic_splicings){
+ − 720 my $distance = $fields[$splice_dist_column]; # only for coding variants
+ − 721 my $site_type;
+ − 722 if($distance eq "NA"){
+ − 723 }
+ − 724 elsif($distance){
+ − 725 $site_type = $distance < 0 ? "donor" : "acceptor"; # exon context
+ − 726 }
+ − 727 else{
+ − 728 if($fields[$cdna_hgvs_column] =~ /\d+([\-+]\d+)/){ # get from the cDNA HGVS otherwise
+ − 729 $distance = $1;
+ − 730 $distance =~ s/^\+//;
+ − 731 $site_type = $distance > 0 ? "donor" : "acceptor"; # intron context
+ − 732 }
+ − 733 }
+ − 734 # only do splicing predictions for novel or rare variants, since these predictions are expensive
+ − 735 if($distance and $distance ne "NA" and ($fields[$maf_column] eq "NA" or $fields[$maf_column] <= 0.01)){
+ − 736 #print STDERR "Running prediction for rare variant ($fields[3], MAF=$fields[14]): $fields[5], $fields[6], $refSeq, $newBase, $fields[4], $distance, $site_type\n";
+ − 737 ($splicing_potential, $splicing_details) = &predict_splicing($fields[$chr_column], $fields[$pos_column], $refSeq, $newBase, $fields[$strand_column], $distance, $site_type);
+ − 738 }
+ − 739 }
+ − 740 my $lookupBase;
+ − 741 if($fields[$strand_column] eq "-"){
+ − 742 if($refSeq ne "NA"){
+ − 743 $refSeq = reverse($refSeq);
+ − 744 $refSeq =~ tr/ACGTacgt/TGCAtgca/;
+ − 745 }
+ − 746 if($newBase ne "NA"){
+ − 747 $newBase = reverse($newBase);
+ − 748 $newBase =~ tr/ACGTacgt/TGCAtgca/;
+ − 749 $newBase =~ s(TN/)(NA/);
+ − 750 $newBase =~ s(/TN)(/NA);
+ − 751 }
+ − 752 }
+ − 753 if($fields[$cdna_hgvs_column] =~ /^g\..*?(?:ins|delins|>)([ACGTN]+)$/){
+ − 754 $mode = "snps";
+ − 755 }
+ − 756 elsif($fields[$cdna_hgvs_column] =~ /^g\..*?del([ACGTN]+)$/){
+ − 757 $mode = "snps";
+ − 758 }
+ − 759 elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+_\d+(?:[\-+]\d+)?ins([ACGTN]+)$/ or $fields[$cdna_hgvs_column] =~ /^c\.\d+(?:[\-+]\d+)?_\d+ins([ACGTN]+)$/){
+ − 760 if(not length($1)%3){
+ − 761 $mode = "snps";
+ − 762 }
+ − 763 else{
+ − 764 $mode = "frameshifts";
+ − 765 }
+ − 766 }
+ − 767 elsif($fields[$cdna_hgvs_column] =~ /^c\.(\d+)(?:[\-+]\d+)?_(\d+)(?:[\-+]\d+)?delins([ACGTN]+)$/){
+ − 768 if(not (length($3)-$2+$1-1)%3){
+ − 769 $mode = "snps";
+ − 770 }
+ − 771 else{
+ − 772 $mode = "frameshifts";
+ − 773 }
+ − 774 }
+ − 775 elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+[+\-]\d+_\d+[+\-]\d+delins([ACGTN]+)$/ or
+ − 776 $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?_[\-*]\d+(?:[\-+]\d+)?del([ACGTN]*)$/ or
+ − 777 $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?del([ACGTN]*)$/ or
+ − 778 $fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+del([ACGTN]*)$/ or
+ − 779 $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?_[\-*]\d+(?:[\-+]\d+)?(?:ins|delins)?([ACGTN]+)$/){
+ − 780 $mode = "snps";
+ − 781 }
+ − 782 elsif($fields[$cdna_hgvs_column] =~ /^c\.([-*]?\d+)del(\S+)/){
+ − 783 my $dist = $1;
+ − 784 $dist =~ s/^\*//;
+ − 785 if(abs($dist) < 4){
+ − 786 $mode = "snps";
+ − 787 }
+ − 788 else{
+ − 789 $mode = "frameshifts";
+ − 790 }
+ − 791 }
+ − 792 elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+_\d+[\-+]\d+ins(\S*)/ or
+ − 793 $fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+_\d+[\-+]\d+del(\S*)/){
+ − 794 $mode = "snps";
+ − 795 }
+ − 796 elsif($fields[$cdna_hgvs_column] =~ /^c\.(-?\d+)_(\d+)(?:\+\d+)?del(\S+)/ or
+ − 797 $fields[$cdna_hgvs_column] =~ /^c\.(\d+)(?:\-\d+)?_(\d+)del(\S+)/){
+ − 798 if(not ($2-$1+1)%3){
+ − 799 $mode = "snps";
+ − 800 }
+ − 801 else{
+ − 802 $mode = "frameshifts";
+ − 803 }
+ − 804 }
+ − 805 elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+_\d+ins([ACGTN]+)$/){
+ − 806 if(not length($1)%3){
+ − 807 $mode = "snps";
+ − 808 }
+ − 809 else{
+ − 810 $mode = "frameshifts";
+ − 811 }
+ − 812 }
+ − 813 # special case of shifted start codon
+ − 814 elsif($fields[$cdna_hgvs_column] =~ /^c\.-1_1+ins([ACGTN]+)$/ or
+ − 815 $fields[$cdna_hgvs_column] =~ /inv$/){
+ − 816 $mode = "snps";
+ − 817 }
+ − 818 elsif($fields[$cdna_hgvs_column] =~ /[ACGTN]>([ACGTN])$/){
+ − 819 # todo: handle multiple consecutive substitutions, will have to run polyphen separately
+ − 820 $lookupBase = $newBase;
+ − 821 if($fields[$strand_column] eq "-"){ # revert to positive strand variant for SIFT index
+ − 822 $lookupBase =~ tr/ACGTacgt/TGCAtgca/;
+ − 823 }
+ − 824 $mode = "snps";
+ − 825 }
+ − 826 else{
+ − 827 $mode = "snps";
+ − 828 }
+ − 829 if($fields[$chr_column] ne $cur_chr){
+ − 830 #Prepared DB connection on per-chromosome basis
+ − 831 $cur_chr = $fields[$chr_column];
+ − 832 $cur_chr =~ s/^chr//;
+ − 833 $db_chr = DBI->connect("dbi:SQLite:dbname=$sift_dir/Human_CHR$cur_chr.sqlite",
+ − 834 "",
+ − 835 "",
+ − 836 { RaiseError => 1, AutoCommit => 1 }) unless not -e "$sift_dir/Human_CHR$cur_chr.sqlite";
+ − 837
+ − 838 $cur_max_pos = -1;
+ − 839 }
+ − 840 if($fields[$pos_column] =~ /^\d+$/ and $cur_max_pos < $fields[$pos_column]){
+ − 841 undef $fr_stmt;
+ − 842 undef $snp_stmt;
+ − 843 my $c = $fields[$chr_column];
+ − 844 $c = "chr$c" if $c !~ /^chr/;
+ − 845 for my $chr_table_key (keys %{$chr_tables{$c}}){
+ − 846 my $chr_table_range = $chr_tables{$c}->{$chr_table_key};
+ − 847 if($chr_table_range->[0] <= $fields[$pos_column] and $chr_table_range->[1] >= $fields[$pos_column]){
+ − 848 $fr_stmt = $db_chr->prepare("select SCORE".
+ − 849 " from $chr_table_key where COORD1 = ?");
+ − 850 $snp_stmt = $db_chr->prepare("select SCORE".
+ − 851 " from $chr_table_key where COORD1 = ? AND NT2 = ?");
+ − 852 $cur_max_pos = $chr_table_range->[1];
+ − 853 last;
+ − 854 }
+ − 855 }
+ − 856 if(not defined $fr_stmt and not $quiet){
+ − 857 warn "Got request for position not in range of SIFT ($c:$fields[$pos_column])\n";
+ − 858 }
+ − 859 }
+ − 860
+ − 861 my @cols;
+ − 862 if(not $fr_stmt){
+ − 863 @cols = ("NA");
+ − 864 }
+ − 865 elsif($mode eq "frameshifts"){
+ − 866 $fr_stmt->execute($fields[$pos_column]);
+ − 867 @cols = $fr_stmt->fetchrow_array();
+ − 868 }
+ − 869 else{
+ − 870 $snp_stmt->execute($fields[$pos_column]-1, $lookupBase);
+ − 871 @cols = $snp_stmt->fetchrow_array();
+ − 872 }
+ − 873 # See if the change is in a CDS, and if it's non-synonymous
+ − 874 my ($type, $location) = ("silent", "intron");
+ − 875 my $start = $fields[$pos_column];
+ − 876 my $stop = $fields[$to_column];
+ − 877 my $interpro = get_range_info(\%interpro_ids, $fields[$chr_column], $start, $stop);
+ − 878 my $domains = join("; ", sort keys %$interpro);
+ − 879
+ − 880 my $sift_score = "NA";
+ − 881 my $variant_key = "NA";
+ − 882 my $transcript_hgvs = $fields[$cdna_hgvs_column];
+ − 883 if($transcript_hgvs =~ /\.\d+[ACGTN]>/ or $transcript_hgvs =~ /\.\d+(?:_|del|ins)/ or $transcript_hgvs =~ /[+\-]\?/){
+ − 884 $location = "exon";
+ − 885 }
+ − 886 elsif($transcript_hgvs =~ /\.\d+[\-+](\d+)/){
+ − 887 if($1 < 21){
+ − 888 $type = "splice";
+ − 889 }
+ − 890 else{
+ − 891 $type = "intronic";
+ − 892 }
+ − 893 $location = "splice site";
+ − 894 }
+ − 895 elsif($transcript_hgvs =~ /\.\*\d+/){
+ − 896 $location = "3'UTR";
+ − 897 }
+ − 898 elsif($transcript_hgvs =~ /\.-\d+/){
+ − 899 $location = "5'UTR";
+ − 900 }
+ − 901 if($mode eq "frameshifts"){
+ − 902 $type = "frameshift";
+ − 903 }
+ − 904 else{
+ − 905 if(not defined $protein_hgvs){
+ − 906 $type = "unknown";
+ − 907 }
+ − 908 elsif($protein_hgvs eq "NA"){
+ − 909 $type = "non-coding";
+ − 910 }
+ − 911 elsif($protein_hgvs =~ /=$/){
+ − 912 $type = "silent";
+ − 913 }
+ − 914 elsif($protein_hgvs =~ /\d+\*/){ # nonsense
+ − 915 $type = "nonsense";
+ − 916 }
+ − 917 elsif($protein_hgvs =~ /ext/){ # nonsense
+ − 918 $type = "nonstop";
+ − 919 }
+ − 920 elsif($protein_hgvs eq "p.0?" or $protein_hgvs eq "p.?"){
+ − 921 $type = "unknown";
+ − 922 }
+ − 923 else{
+ − 924 $type = "missense";
+ − 925 }
+ − 926
+ − 927 $sift_score = $cols[0] || "NA";
+ − 928
+ − 929 # Record the need to fetch VCF polyphen and gerp data later (en masse, for efficiency)
+ − 930 # Use newBase instead of lookupBase for PolyPhen since the orientation is always relative to the transcript
+ − 931 $variant_key = get_variant_key($fields[$chr_column], $fields[$pos_column], $fields[$ref_column], $newBase);
+ − 932 #print STDERR "Variant key is $variant_key\n";
+ − 933 }
+ − 934 my $transcript_type = $fields[$ftype_column];
+ − 935 my $transcript_length = $fields[$transcript_length_column];
+ − 936 my $transcript_id = $fields[$transcript_column]; # sift score is derived from this transcript
+ − 937 my $transcript_strand = $fields[$strand_column];
+ − 938 my $zygosity = $fields[$zygosity_column];
+ − 939 my $rsID = $fields[$dbid_column];
+ − 940 my $popFreq = $fields[$maf_column];
+ − 941 my $internalFreq;
+ − 942 $internalFreq = $fields[$internal_freq_column] if defined $internal_freq_column;
+ − 943 my $popFreqSrc = $fields[$maf_src_column];
+ − 944
+ − 945 # Get the gene name and omim info
+ − 946 my %seen;
+ − 947 my @gene_names = split /; /, $fields[$genename_column];
+ − 948 my $morbid = join("; ", grep {/\S/ and not $seen{$_}++} map({$gene2morbids{$_} or ""} @gene_names));
+ − 949 my $tissue = join("; ", grep {/\S/ and not $seen{$_}++} map({$tissues{$_} or ""} @gene_names));
+ − 950 my $function = join("; ", grep {/\S/ and not $seen{$_}++} map({$tissue_desc{$_} or ""} @gene_names));
+ − 951 my $pathways = join("; ", grep {/\S/ and not $seen{$_}++} map({$pathways{$_} or ""} @gene_names));
+ − 952 my $pathway_ids = join("; ", grep {/\S/ and not $seen{$_}++} map({$pathway_ids{$_} or ""} @gene_names));
+ − 953
+ − 954 # It may be that the calls and freqs are multiple concatenated values, e.g. "X; Y"
+ − 955 my @tot_bases = split /; /,$fields[$tot_cnt_column];
+ − 956 my @var_bases = split /; /,$fields[$alt_cnt_column];
+ − 957 my (@pct_nonvar, @pct_var);
+ − 958 for (my $i = 0; $i <= $#tot_bases; $i++){
+ − 959 push @pct_nonvar, $tot_bases[$i] =~ /^(NA|0)$/ ? $tot_bases[$i] : int(($tot_bases[$i]-$var_bases[$i])/$tot_bases[$i]*100+0.5);
+ − 960 push @pct_var, $tot_bases[$i] =~ /^(NA|0)$/ ? $tot_bases[$i] : int($var_bases[$i]/$tot_bases[$i]*100+0.5);
+ − 961 }
+ − 962
+ − 963 push @lines, [
+ − 964 $fields[$chr_column], # chr
+ − 965 $fields[$pos_column], # pos
+ − 966 $fields[$to_column],
+ − 967 $fields[$ref_column],
+ − 968 $fields[$alt_column],
+ − 969 join("; ", @pct_nonvar), # pct ref
+ − 970 join("; ", @pct_var), # pct var
+ − 971 $fields[$alt_cnt_column], # num var
+ − 972 $fields[$tot_cnt_column], # num reads
+ − 973 $transcript_type, # protein_coding, pseudogene, etc.
+ − 974 $type,
+ − 975 $location,
+ − 976 $rsID,
+ − 977 $popFreqSrc,
+ − 978 (defined $internal_freq_column ? ($popFreq,$internalFreq) : ($popFreq)),
+ − 979 $sift_score,
+ − 980 $variant_key, # 16th field
+ − 981 join("; ", @gene_names),
+ − 982 $transcript_hgvs,
+ − 983 $protein_hgvs,
+ − 984 $zygosity,
+ − 985 $fields[$splice_dist_column], # distance to exon edge
+ − 986 $splicing_potential,
+ − 987 $splicing_details,
+ − 988 $morbid,
+ − 989 $tissue,
+ − 990 $pathways,
+ − 991 $pathway_ids,
+ − 992 $function,
+ − 993 $domains,
+ − 994 $fields[$pvalue_column],
+ − 995 $fields[$caveats_column], # caveats
+ − 996 (defined $rares_column ? ($fields[$rares_column],$transcript_id) : ($transcript_id)),
+ − 997 $transcript_length,
+ − 998 $transcript_strand,
+ − 999 (defined $srcs_column ? ($fields[$phase_column],$fields[$srcs_column]) : ($fields[$phase_column]))];
+ − 1000 }
+ − 1001
+ − 1002 print STDERR "Adding Polyphen2 and GERP info...\n" unless $quiet;
+ − 1003 # retrieve the polyphen and gerp info en masse for each chromosome, as this is much more efficient
+ − 1004 my $vkey_col = 16+(defined $internal_freq_column ? 1 : 0);
+ − 1005 for my $line_fields (@lines){
+ − 1006 # expand the key into the relevant MAF values
+ − 1007 $line_fields->[$vkey_col] = join("\t", polyphen_gerp_info($gerp_dir,$polyphen_file,$line_fields->[$vkey_col])); # fields[$vkey_col] is variant key
+ − 1008 print OUT join("\t", @{$line_fields}), "\n";
+ − 1009 }
+ − 1010 close(OUT);
+ − 1011 unlink $tmpfile if defined $tmpfile;
+ − 1012
+ − 1013 sub load_header_columns{
+ − 1014 my ($reqs_hash_ref, $headers_array_ref) = @_;
+ − 1015 my %unfulfilled;
+ − 1016 for my $field_name (keys %$reqs_hash_ref){
+ − 1017 $unfulfilled{$field_name} = 1;
+ − 1018 }
+ − 1019 for(my $i = 0; $i <= $#{$headers_array_ref}; $i++){
+ − 1020 for my $req_header_name (keys %$reqs_hash_ref){
+ − 1021 if($req_header_name eq $headers_array_ref->[$i]){
+ − 1022 ${$reqs_hash_ref->{$req_header_name}} = $i;
+ − 1023 delete $unfulfilled{$req_header_name};
+ − 1024 last;
+ − 1025 }
+ − 1026 }
+ − 1027 }
+ − 1028 if(keys %unfulfilled){
+ − 1029 die "Aborting. Could not find headers in the input file for the following required fields: ", join(", ", sort keys %unfulfilled), "\n";
+ − 1030 }
+ − 1031 }