comparison hgvs_table_annotate @ 0:9977d1935a07 default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:10:47 -0600
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9977d1935a07
1 #!/usr/bin/env perl
2
3 use Bio::DB::Sam;
4 use DBI;
5 use IPC::Open2;
6 use Set::IntervalTree;
7 use strict;
8 use warnings;
9 use vars qw($quiet %chr2variant_locs %polyphen_info %chr2polyphen_vcf_lines %chr2gerp_vcf_lines %range2genes $fasta_index $UNAFFECTED $tmpfile);
10 use File::Basename;
11 $UNAFFECTED = -299999; # sentinel for splice site predictions
12 my $current_directory = dirname(__FILE__);
13 if(@ARGV == 1 and $ARGV[0] eq "-v"){
14 print "Version 1.0\n";
15 exit;
16 }
17
18 #configuration file stuff
19 my %config;
20 my $tool_dir = shift @ARGV;
21 if(not -e "$tool_dir/annotate_hgvs.loc"){
22 system("cp $current_directory/tool-data/annotate_hgvs.loc $tool_dir/annotate_hgvs.loc");
23 }
24 open CONFIG, '<', "$tool_dir/annotate_hgvs.loc";
25 while(<CONFIG>){
26 next if $_ =~ /^#/;
27 (my $key, my $value) = split(/\s+/,$_);
28 $config{$key} = $value;
29 }
30 close CONFIG;
31 my $dbs_dir = $config{"dbs_dir"};
32
33 $quiet = 0;
34 if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){
35 $quiet = 1;
36 shift @ARGV;
37 }
38
39 @ARGV == 9 or @ARGV == 10 or die "Usage: $0 -q <SIFT dir> <POLYPHEN2 file> <GERP dir> <tissuedb file> <pathway file> <interpro domains.bed> <omim morbidmap> <input hgvs table.txt> <output hgvs table.annotated.txt> [reference.fasta]\n Where if the reference fasta is provided, we predict cryptic splicing effects (using MaxEntScan and GeneSplicer)\n";
40
41 my $sift_dir = $dbs_dir . shift @ARGV;
42 my $polyphen_file = $dbs_dir . shift @ARGV;
43 my $gerp_dir = $dbs_dir . shift @ARGV;
44 my $tissue_file = $dbs_dir . shift @ARGV;
45 my $pathway_file = $dbs_dir. shift @ARGV;
46 my $interpro_bed_file = $dbs_dir . shift @ARGV;
47 my $morbidmap_file = $dbs_dir . shift @ARGV;
48 my $hgvs_table_file = shift @ARGV;
49 my $outfile = shift @ARGV;
50 my $predict_cryptic_splicings = @ARGV ? ($dbs_dir . shift @ARGV) : undef;
51 my $flanking_bases = 500; # assume this is ROI around a gene
52 my @lines;
53
54 sub polyphen_gerp_info($$$){
55 my($gerp_dir,$polyphen_file,$info_key) = @_;
56
57 my($chr,$pos,$ref,$variant) = split /:/, $info_key;
58
59 # is this the first call for this chromosome? If so, retrieve the VCF lines for it en masse
60 if(not exists $chr2polyphen_vcf_lines{$chr}){
61 ($chr2polyphen_vcf_lines{$chr}, $chr2gerp_vcf_lines{$chr}) = retrieve_vcf_lines($gerp_dir,$polyphen_file,$chr);
62 }
63
64 my @results;
65 if(not $ref or not $variant){ # can only get data for SNPs
66 @results = qw(NA NA NA);
67 }
68 #print STDERR "ref = $ref and variant = $variant for $info_key\n";
69 elsif(length($ref) == 1 and length($variant) == 1){
70 #print STDERR "Grabbing poly/gerp for $chr,$pos,$ref,$variant\n";
71 @results = polyphen_info($chr,$pos,$variant);
72 $results[2] = gerp_info($chr,$pos);
73 }
74
75 # It may be that a novel MNP variant is a bunch of known SNPs in a row. In this case,
76 # report the list of variants
77 elsif(length($ref) == length($variant) and $ref ne "NA"){
78 my (@poly_scores, @poly_calls, @gerp_scores);
79 for(my $i = 0; $i < length($ref); $i++){
80 my $refbase = substr($ref, $i, 1);
81 my $newbase = substr($variant, $i, 1);
82 if($newbase eq $refbase){
83 push @poly_scores, "-";
84 push @poly_calls, "-";
85 push @gerp_scores, "-";
86 next;
87 }
88 my @base_results = polyphen_info($chr,$pos+$i,$refbase,$newbase);
89 push @poly_scores, $base_results[0];
90 push @poly_calls, $base_results[1];
91 $base_results[2] = gerp_info($chr,$pos+$i);
92 push @gerp_scores, $base_results[2];
93 }
94 $results[0] = join(",", @poly_scores);
95 $results[1] = join(",", @poly_calls);
96 $results[2] = join(",", @gerp_scores);
97 }
98 else{
99 @results = qw(NA NA NA);
100 }
101
102 return @results;
103 }
104
105 sub get_variant_key($$$$){
106 # params chr pos ref variant
107 my $c = $_[0];
108 $c =~ s/^chr//;
109 my $prop_info_key = join(":", $c, @_[1..3]);
110 $chr2variant_locs{$c} = [] unless exists $chr2variant_locs{$c};
111 push @{$chr2variant_locs{$c}}, $_[1];
112 return $prop_info_key;
113 }
114
115 sub retrieve_vcf_lines($$$){
116 my ($gerp_dir, $polyphen_file, $chr) = @_;
117
118 my (%polyphen_lines, %gerp_lines);
119
120 if(not exists $chr2variant_locs{$chr}){
121 return ({}, {}); # no data requested for this chromosome
122 }
123
124 # build up the request
125 my @tabix_regions;
126 my @var_locs = grep /^\d+$/, @{$chr2variant_locs{$chr}}; # grep keeps point mutations only
127 if(not @var_locs){
128 return ({}, {}); # no point mutations
129 }
130
131 # sort by variant start location
132 for my $var_loc (sort {$a <=> $b} @var_locs){
133 push @tabix_regions, $chr.":".$var_loc."-".$var_loc;
134 }
135 my $regions = join(" ", @tabix_regions);
136
137 # retrieve the data
138 die "Cannot find Polyphen2 VCF file $polyphen_file\n" if not -e $polyphen_file;
139 open(VCF, "tabix $polyphen_file $regions |")
140 or die "Cannot run tabix on $polyphen_file: $!\n";
141 while(<VCF>){
142 if(/^\S+\t(\d+)/ and grep {$_ eq $1} @var_locs){ # take only main columns to save room, if possible
143 chomp;
144 $polyphen_lines{$1} = [] unless exists $polyphen_lines{$1};
145 push @{$polyphen_lines{$1}}, $_;
146 }
147 }
148 close(VCF);
149
150 my $vcf_file = "$gerp_dir/chr$chr.tab.gz";
151 if(not -e $vcf_file){
152 warn "Cannot find GERP VCF file $vcf_file\n" if not $quiet;
153 return ({}, {});
154 }
155 open(VCF, "tabix $vcf_file $regions |")
156 or die "Cannot run tabix on $vcf_file: $!\n";
157 while(<VCF>){
158 if(/^(\S+\t(\d+)(?:\t\S+){2})/ and grep {$_ eq $2} @var_locs){ # take only main columns to save room, if possible
159 $gerp_lines{$2} = [] unless exists $gerp_lines{$2};
160 push @{$gerp_lines{$2}}, $1;
161 }
162 }
163 close(VCF);
164
165 return (\%polyphen_lines, \%gerp_lines);
166 }
167
168 sub polyphen_info($$$){
169 my ($chr, $pos, $variant_base) = @_;
170
171 my $key = "$chr:$pos:$variant_base";
172 if(exists $polyphen_info{$key}){
173 return @{$polyphen_info{$key}};
174 }
175
176 #print STDERR "Checking for polyphen data for $chr, $pos, $variant_base\n";
177 if(exists $chr2polyphen_vcf_lines{$chr}->{$pos}){
178 for(@{$chr2polyphen_vcf_lines{$chr}->{$pos}}){
179 #print STDERR "Checking $chr, $pos, $variant_base against $_";
180 my @fields = split /\t/, $_;
181 if($fields[4] eq $variant_base){
182 if($fields[6] eq "D"){
183 $polyphen_info{$key} = [$fields[5],"deleterious"];
184 last;
185 }
186 elsif($fields[6] eq "P"){
187 $polyphen_info{$key} = [$fields[5],"poss. del."];
188 last;
189 }
190 elsif($fields[6] eq "B"){
191 $polyphen_info{$key} = [$fields[5],"benign"];
192 last;
193 }
194 else{
195 $polyphen_info{$key} = [$fields[5],$fields[6]];
196 last;
197 }
198 }
199 }
200 if(not exists $polyphen_info{$key}){
201 $polyphen_info{$key} = ["NA", "NA"];
202 }
203 }
204 else{
205 $polyphen_info{$key} = ["NA", "NA"];
206 }
207 return @{$polyphen_info{$key}};
208 }
209
210 sub gerp_info($$){
211 my ($chr,$pos) = @_;
212
213 if(exists $chr2gerp_vcf_lines{$chr}->{$pos}){
214 for(@{$chr2gerp_vcf_lines{$chr}->{$pos}}){
215 my @fields = split /\t/, $_;
216 return $fields[3];
217 }
218 }
219 return "NA";
220 }
221
222 sub predict_splicing{
223 my ($chr, $pos, $ref, $var, $strand, $exon_edge_distance, $splice_type) = @_;
224
225 my $disruption_level = 0;
226 my @disruption_details;
227 # The methods being used only look for up to a 30 base context, so only
228 # need to check for obliterated acceptors/donors if they are nearby
229 $exon_edge_distance-- if $exon_edge_distance > 0; # +1 is actually in the right spot, so decrement
230 my $edge_offset = $strand eq "-" ? -1*$exon_edge_distance : $exon_edge_distance; # flip offset sign if the call was on the negative strand (since +/- was relative to the transcript, not the genome coordinates)
231 my $annotated_detected = 0;
232 my ($genesplicer_orig, $genesplicer_orig_pos) = call_genesplicer($splice_type, $chr, $strand, $pos-$edge_offset, $ref);
233 if($genesplicer_orig != 0){
234 $annotated_detected = 1;
235 }
236 my ($maxentscan_orig, $maxentscan_orig_pos) = call_maxentscan($splice_type, $chr, $strand, $pos-$edge_offset, $ref);
237 if(not $annotated_detected and $maxentscan_orig){
238 $annotated_detected = 1;
239 }
240 my ($genesplicer_mod, $genesplicer_mod_pos) = call_genesplicer($splice_type, $chr, $strand, $pos-$edge_offset, $ref, $var, $exon_edge_distance);
241 if($genesplicer_mod != $UNAFFECTED){ # was the mod within the range where this software could detect up or downstream effects on the annotated splice site?
242 if($genesplicer_orig_pos != $genesplicer_mod_pos){
243 $disruption_level+=0.75;
244 if($genesplicer_mod_pos == -1){
245 push @disruption_details, "Variant obliterates GeneSplicer predicted splice site from the reference sequence ($chr:$genesplicer_orig_pos, orig score $genesplicer_orig)";
246 }
247 elsif($genesplicer_orig_pos != -1){
248 push @disruption_details, "Variant moves GeneSplicer preferred splice site from annotated location $chr:$genesplicer_orig_pos to $chr:$genesplicer_mod_pos (scores: orig=$genesplicer_orig, new=$genesplicer_mod)";
249 }
250 }
251 elsif($genesplicer_orig-$genesplicer_mod > 1){
252 $disruption_level+=0.5;
253 push @disruption_details, "Variant significantly lowers GeneSplicer score for original $splice_type splice site located at $chr:$genesplicer_mod_pos ($genesplicer_mod vs. $genesplicer_orig)";
254 }
255 elsif($genesplicer_orig > $genesplicer_mod){
256 $disruption_level+=0.25;
257 push @disruption_details, "Variant slightly lowers GeneSplicer score for original $splice_type splice site located at $chr:$genesplicer_mod_pos ($genesplicer_mod vs. $genesplicer_orig)";
258 }
259 $genesplicer_orig = $genesplicer_mod; # so that splice site comparison vs. cryptic is accurate
260 }
261 my ($maxentscan_mod, $maxentscan_mod_pos) = call_maxentscan($splice_type, $chr, $strand, $pos-$edge_offset, $ref, $var, $exon_edge_distance);
262 if($maxentscan_mod != $UNAFFECTED){
263 if($maxentscan_mod_pos != $maxentscan_orig_pos){
264 $disruption_level+=1;
265 if($maxentscan_mod_pos == -1){
266 push @disruption_details, "Variant obliterates MaxEntScan predicted splice site from the reference sequence ($chr:$maxentscan_orig_pos, orig score $maxentscan_orig)";
267 }
268 elsif($maxentscan_orig_pos != -1){
269 push @disruption_details, "Variant moves MaxEntScan preferred splice site from annotated location $chr:$maxentscan_orig_pos to $chr:$maxentscan_mod_pos (scores: orig=$maxentscan_orig, new=$maxentscan_mod)";
270 }
271 }
272 elsif($maxentscan_orig-$maxentscan_mod > 1){
273 $disruption_level+=0.5;
274 push @disruption_details, "Variant significantly lowers MaxEntScan score for original $splice_type splice site located at $chr:$maxentscan_mod_pos ($maxentscan_mod vs. $maxentscan_orig)";
275 }
276 elsif($maxentscan_orig>$maxentscan_mod){
277 $disruption_level+=0.25;
278 push @disruption_details, "Variant slightly lowers MaxEntScan score for original $splice_type splice site located at $chr:$maxentscan_mod_pos ($maxentscan_mod vs. $maxentscan_orig)";
279 }
280 $maxentscan_orig = $maxentscan_mod;
281 }
282
283 # Regardless of location, see if there is an increased chance of creating a cryptic splicing site
284 my ($genesplicer_control, $genesplicer_control_pos) = call_genesplicer($splice_type, $chr, $strand, $pos, $ref);
285 my ($genesplicer_cryptic, $genesplicer_cryptic_pos) = call_genesplicer($splice_type, $chr, $strand, $pos, $ref, $var);
286 if(defined $genesplicer_orig and $genesplicer_control < $genesplicer_orig and $genesplicer_orig < $genesplicer_cryptic){
287 $disruption_level+=0.75;
288 push @disruption_details, "Variant causes GeneSplicer score for a potential cryptic $splice_type splice site at $chr:$genesplicer_cryptic_pos to become higher than the annotated splice site $chr:$genesplicer_orig_pos (orig=$genesplicer_orig vs. var=$genesplicer_cryptic)";
289 }
290 if($genesplicer_control - $genesplicer_cryptic < -1){
291 $disruption_level+=0.5;
292 push @disruption_details, "Variant raises GeneSplicer score for a potential cryptic $splice_type splice site (orig=$genesplicer_control vs. var=$genesplicer_cryptic)";
293 }
294 my ($maxentscan_control, $maxentscan_control_pos) = call_maxentscan($splice_type, $chr, $strand, $pos, $ref);
295 my ($maxentscan_cryptic, $maxentscan_cryptic_pos) = call_maxentscan($splice_type, $chr, $strand, $pos, $ref, $var);
296 if(defined $maxentscan_orig and $maxentscan_control < $maxentscan_orig and $maxentscan_orig < $maxentscan_cryptic){
297 $disruption_level++;
298 push @disruption_details, "Variant causes MaxEntScan score for a potential cryptic $splice_type splice site at $chr:$maxentscan_cryptic_pos to become higher than the annotated splice site $chr:$maxentscan_orig_pos (orig=$maxentscan_orig vs. var=$maxentscan_cryptic)";
299 }
300 if($maxentscan_control - $maxentscan_cryptic < -1 and $maxentscan_cryptic > 0){
301 $disruption_level+=0.5;
302 push @disruption_details, "Variant raises MaxEntScan score for a potential cryptic $splice_type splice site (orig=$maxentscan_control vs. var=$maxentscan_cryptic)";
303 }
304
305 # If neither method predicted the annotated donor site, and the alternate site isn't predicted either, note that so the user doesn't think the mod is definite okay
306 if(not @disruption_details and $maxentscan_orig < 0 and $genesplicer_orig < 0){
307 $disruption_level+=0.5;
308 push @disruption_details, "Neither the annotated splice site nor any potential cryptic site introduced by the variant are predicted by either GeneSplicer or MaxEntScan, therefore the alt splicing potential for this variant should be evaluated manually";
309 }
310
311 return ($disruption_level, join("; ", @disruption_details));
312 }
313
314 sub call_genesplicer{
315 my($splice_type, $chr, $strand, $pos, $ref, $var, $var_offset) = @_;
316
317 if($splice_type eq "acceptor"){
318 $pos += $strand eq "-" ? 2: -2;
319 }
320 my $num_flanking = 110; # seems to croak if less than 160bp provided
321 my $indel_size = defined $var ? length($var) - length($ref) : 0; # - is del, + is ins
322 my $site_window = defined $var_offset ? 2 : ($indel_size > 0 ? 29+$indel_size : 29); # exact position if looking for exisitng site effect, otherwise 10 or 10 + the insert size
323 my $cmd = "genesplicer";
324 my $start = $pos - $num_flanking;
325 $start = 1 if $start < 1;
326 my $stop = $pos + $num_flanking + ($indel_size < 0 ? -1*$indel_size:0); # add extra bases if a del so we still have 2*$num_flanking after the edit
327 $stop++; # end coordinate is exclusive
328 return ($UNAFFECTED, -1) if defined $var_offset and ($pos+$var_offset < $start or $pos+$var_offset > $stop); # variant is outside of range we can detect affect on original splice site
329 my $seq = $fasta_index->fetch("$chr:$start-$stop");
330 if(defined $var){
331 if(defined $var_offset){ # substitution away from the site of interest
332 substr($seq, $num_flanking+$var_offset, length($ref)) = $var;
333 }
334 else{ # substitution at the site of interest
335 substr($seq, $num_flanking, length($ref)) = $var;
336 }
337 }
338 if($strand eq "-"){
339 $seq = reverse($seq);
340 $seq =~ tr/ACGTacgt/TGCAtgca/;
341 }
342 $tmpfile = "$$.fasta";
343 open(TMP, ">$tmpfile") or die "Could not open $tmpfile for writing: $!\n";
344 print TMP ">$chr $strand $pos $ref",($var?" $var":""), ($var_offset?" $var_offset":""),"\n$seq\n";
345 substr($seq, $num_flanking, 2) = lc(substr($seq, $num_flanking, 2));
346 #print STDERR "$chr $strand $pos $ref $var $var_offset: $seq\n";
347 close(TMP);
348 #print STDERR "$cmd $tmpfile /export/achri_data/programs/GeneSplicer/human\n";
349 #<STDIN>;
350 open(GS, "$cmd $tmpfile human 2> /dev/null|")
351 or die "Could not run $cmd: $!\n";
352 my $highest_score = 0;
353 my $highest_position = -1;
354 while(<GS>){
355 # End5 End3 Score "confidence" splice_site_type
356 # E.g. :
357 # 202 203 2.994010 Medium donor
358 chomp;
359 my @F = split /\s+/, $_;
360 next if $F[1] < $F[0]; # wrong strand
361 if($splice_type eq $F[4]){
362 if(abs($F[0]-$num_flanking-1) <= $site_window){ # right type and approximate location
363 #print STDERR "*$_\n";
364 if($F[2] > $highest_score){
365 $highest_score = $F[2];
366 # last bit accounts for indels in the position offset
367 $highest_position = $pos + ($strand eq "-" ? -1: 1)*(-$num_flanking + $F[0] - 1) + (($F[0] <= $num_flanking && $strand eq "+" or $F[0] >= $num_flanking && $strand eq "-") ? -$indel_size : 0);
368 }
369 }
370 }
371 #else{print STDERR $_,"\n";}
372 }
373 close(GS);
374 return ($highest_score, $highest_position);
375 }
376
377 sub call_maxentscan{
378 my($splice_type, $chr, $strand, $pos, $ref, $var, $var_offset) = @_;
379
380 my $indel_size = defined $var ? length($var) - length($ref) : 0; # - is del, + is ins
381 my $cmd;
382 my ($num_flanking5, $num_flanking3);
383 my ($region5, $region3);
384 my ($start, $stop);
385 my $highest_score = -9999;
386 my $highest_position = -1;
387 # note that donor and acceptor are very different beasts for MaxEntScan
388 if($splice_type eq "acceptor"){
389 if($strand eq "-"){
390 $pos += 2;
391 }
392 else{
393 $pos -= 2;
394 }
395 $cmd = "score3.pl";
396 $num_flanking5 = 18;
397 $num_flanking3 = 4;
398 }
399 else{ # donor
400 $cmd = "score5.pl";
401 $num_flanking5 = 3;
402 $num_flanking3 = 5;
403 }
404 # when var_offset is defined, we are looking only at the original splice site so no need to look in the general region
405 $region5 = defined $var_offset ? 0 : $num_flanking5; # when var_offset is defined, we are looking only at the original splice site so no need to look in the general region
406 $region3 = defined $var_offset ? 0 : $num_flanking3;
407 if($strand eq "-"){
408 $start = $pos - $num_flanking3 - $region3;
409 $stop = $pos + $num_flanking5 + $region5 + ($indel_size < 0 ? -1*$indel_size:0);
410 #print STDERR "Fetching $chr:$start-$stop ($pos - $num_flanking3 - $region3)\n";
411 }
412 else{
413 $start = $pos - $num_flanking5 - $region5;
414 $stop = $pos + $num_flanking3 + $region3 + ($indel_size < 0 ? -1*$indel_size:0); # add extra bases if a del so we still have 23 or 9 bases as the case may be
415 #print STDERR "Fetching $chr:$start-$stop ($pos - $num_flanking5 - $region5)\n";
416 }
417 if(defined $var_offset and (
418 $strand eq "+" and ($pos+$var_offset < $start or $pos+$var_offset > $stop) or
419 $strand eq "-" and ($pos-$var_offset < $start or $pos-$var_offset > $stop))){ # variant is outside of range we can detect affect on original splice site
420 return ($UNAFFECTED, -1)
421 }
422
423 my $seq = $fasta_index->fetch("$chr:$start-$stop");
424 my @subseqs;
425 if(defined $var){
426 if(defined $var_offset){ # substitution away from the site of interest
427 #print STDERR "Subbing in string of length ", length($seq), "near splice site $ref -> $var at ", ($strand eq "-" ? "$region3+$num_flanking3-$var_offset":"$region5+$num_flanking5+$var_offset"), "\n";
428 substr($seq, ($strand eq "-" ? $region3+$num_flanking3-$var_offset:$region5+$num_flanking5+$var_offset), length($ref)) = $var;
429 }
430 else{ # substitution at the site of interest
431 substr($seq, ($strand eq "-" ? $region3+$num_flanking3:$region5+$num_flanking5), length($ref)) = $var;
432 }
433 }
434 if($strand eq "-"){
435 $seq = reverse($seq);
436 $seq =~ tr/ACGTacgt/TGCAtgca/;
437 }
438 # get all of the 23 or 9 base pair subsequences, depending on whether it's donors or acceptors we are looking for
439 for(my $i = 0; $i+$num_flanking5+$num_flanking3+1 <= length($seq); $i++){
440 push @subseqs, substr($seq, $i, $num_flanking5+$num_flanking3+1);
441 }
442 my $pid = open2(\*CHILD_OUT, \*CHILD_IN, "$cmd -");
443 $pid or die "Could not run $cmd: $!\n";
444 print CHILD_IN join("\n",@subseqs);
445 close(CHILD_IN);
446 while(<CHILD_OUT>){
447 chomp;
448 # out is just "seq[tab]score"
449 my @F = split /\s+/, $_;
450 if($F[1] > $highest_score){
451 #print STDERR ">";
452 $highest_score = $F[1];
453 # since we printed the subseqs in order, $. (containing the line number of the prediction output) gives us the offset in the original seq
454 $highest_position = $start + ($strand eq "-" ? length($seq) - $num_flanking5 - $. : $num_flanking5 + $. -1)+(($. <= $num_flanking5+$region5 && $strand eq "+" or $. >= $num_flanking5+$region5 && $strand eq "-") ? -$indel_size : 0);
455 }
456 #print STDERR $_,"\n";
457 }
458 close(CHILD_OUT);
459 waitpid($pid, 0);
460
461 return ($highest_score, $highest_position);
462 }
463
464 sub get_range_info($$$$){
465 my ($chr2ranges, $chr, $start, $stop) = @_;
466
467 if(not exists $chr2ranges->{$chr}){
468 return {};
469 }
470 my $range_key = "$chr:$start:$stop";
471
472 # Use a binary search to find the leftmost range of interest
473 my $left_index = 0;
474 my $right_index = $#{$chr2ranges->{$chr}};
475 my $cursor = int($right_index/2);
476 while($left_index != $right_index and $left_index != $cursor and $right_index != $cursor){
477 # need to go left
478 if($chr2ranges->{$chr}->[$cursor]->[0] >= $start){
479 $right_index = $cursor;
480 $cursor = int(($left_index+$cursor)/2);
481 }
482 # need to go to the right
483 elsif($chr2ranges->{$chr}->[$cursor]->[1] <= $stop){
484 $left_index = $cursor;
485 $cursor = int(($right_index+$cursor)/2);
486 }
487 else{
488 $right_index = $cursor;
489 $cursor--; # in the range, go left until not in the range any more
490 }
491 }
492
493 my %names;
494 for (; $cursor <= $#{$chr2ranges->{$chr}}; $cursor++){
495 my $range_ref = $chr2ranges->{$chr}->[$cursor];
496 if($range_ref->[0] > $stop){
497 last;
498 }
499 elsif($range_ref->[0] <= $stop and $range_ref->[1] >= $start){
500 $names{$range_ref->[2]}++;
501 }
502 }
503
504 #print STDERR "Names for range $chr:$start-$stop are ", join("/", keys %names), "\n";
505 return \%names;
506 }
507
508 if(not -d $sift_dir){
509 die "The specified SIFT directory $sift_dir does not exist\n";
510 }
511
512 print STDERR "Reading in OMIM morbid map...\n" unless $quiet;
513 die "Data file $morbidmap_file does not exist, aborting.\n" if not -e $morbidmap_file;
514 my %gene2morbids;
515 open(TAB, $morbidmap_file)
516 or die "Cannot open OMIM morbid map file $morbidmap_file for reading: $!\n";
517 while(<TAB>){
518 my @fields = split /\|/, $_;
519 #print STDERR "got ", ($#fields+1), " fields in $_\n";
520 #my @fields = split /\|/, $_;
521 my $morbid = $fields[0];
522 $morbid =~ s/\s*\(\d+\)$//; # get rid of trailing parenthetical number
523 my $omim_id = $fields[2];
524 for my $genename (split /, /, $fields[1]){
525 if(not exists $gene2morbids{$genename}){
526 $gene2morbids{$genename} = "OMIM:$omim_id $morbid";
527 }
528 else{
529 $gene2morbids{$genename} .= "; $morbid";
530 }
531 }
532 }
533 close(TAB);
534
535 print STDERR "Reading in interpro mappings...\n" unless $quiet;
536 die "Data file $interpro_bed_file does not exist, aborting.\n" if not -e $interpro_bed_file;
537 my %interpro_ids;
538 open(TAB, $interpro_bed_file)
539 or die "Cannot open gene name BED file $interpro_bed_file for reading: $!\n";
540 while(<TAB>){
541 chomp;
542 # format should be "chr start stop interpro desc..."
543 my @fields = split /\t/, $_;
544 my $c = $fields[0];
545 if(not exists $interpro_ids{$c}){
546 $interpro_ids{$c} = [] unless exists $interpro_ids{$c};
547 $interpro_ids{"chr".$c} = [] unless $c =~ /^chr/;
548 $interpro_ids{$1} = [] if $c =~ /^chr(\S+)/;
549 }
550 my $dataref = [@fields[1..3]];
551 push @{$interpro_ids{$c}}, $dataref;
552 push @{$interpro_ids{"chr".$c}}, $dataref unless $c =~ /^chr/;
553 push @{$interpro_ids{$1}}, $dataref if $c =~ /^chr(\S+)/;
554 }
555 close(TAB);
556 print STDERR "Sorting interpro matches per contig...\n" unless $quiet;
557 for my $chr (keys %interpro_ids){
558 $interpro_ids{$chr} = [sort {$a->[0] <=> $b->[0]} @{$interpro_ids{$chr}}];
559 }
560
561 # {uc(gene_name) => space separated info}
562 print STDERR "Reading in tissue info...\n" unless $quiet;
563 my %tissues;
564 my %tissue_desc;
565 open(TISSUE, $tissue_file)
566 or die "Cannot open $tissue_file for reading: $!\n";
567 while(<TISSUE>){
568 chomp;
569 my @fields = split /\t/, $_;
570 my $gname = uc($fields[1]);
571 $tissue_desc{$gname} = $fields[2];
572 next unless $#fields == 4;
573 my @tis = split /;\s*/, $fields[4];
574 pop @tis;
575 if(@tis < 6){
576 $tissues{$gname} = join(">", @tis);
577 }
578 elsif(@tis < 24){
579 $tissues{$gname} = join(">", @tis[0..5]).">...";
580 }
581 else{
582 $tissues{$gname} = join(">", @tis[0..(int($#tis/5))]).">...";
583 }
584 }
585 close(TISSUE);
586
587 print STDERR "Loading pathway info...\n" unless $quiet;
588 my %pathways;
589 my %pathway_ids;
590 open(PATHWAY, $pathway_file)
591 or die "Cannot open $pathway_file for reading: $!\n";
592 while(<PATHWAY>){
593 chomp;
594 my @fields = split /\t/, $_;
595 next unless @fields >= 4;
596 for my $gname (split /\s+/, $fields[1]){
597 $gname = uc($gname);
598 if(not exists $pathways{$gname}){
599 $pathway_ids{$gname} = $fields[2];
600 $pathways{$gname} = $fields[3];
601 }
602 else{
603 $pathway_ids{$gname} .= "; ".$fields[2];
604 $pathways{$gname} .= "; ".$fields[3];
605 }
606 }
607 }
608 close(PATHWAY);
609
610 print STDERR "Loading SIFT indices...\n" unless $quiet;
611 # Read in the database table list
612 my %chr_tables;
613 open(DBLIST, "$sift_dir/bins.list")
614 or die "Cannot open $sift_dir/bins.list for reading: $!\n";
615 while(<DBLIST>){
616 chomp;
617 my @fields = split /\t/, $_;
618 if(not exists $chr_tables{$fields[1]}){
619 $chr_tables{$fields[1]} = {};
620 }
621 my $chr = $fields[1];
622 $chr = "chr$chr" unless $chr =~ /^chr/;
623 $chr_tables{$chr}->{"$chr\_$fields[2]_$fields[3]"} = [$fields[2],$fields[3]];
624 }
625 close(DBLIST);
626
627 #Connect to database
628 my $db_gene = DBI->connect("dbi:SQLite:dbname=$sift_dir/Human_Supp.sqlite","","",{RaiseError=>1, AutoCommit=>1});
629
630 my $db_chr;
631 my $fr_stmt; # retrieval for frameshift
632 my $snp_stmt; # retrieval for SNP
633 my $cur_chr = "";
634 my $cur_max_pos;
635
636 if(defined $predict_cryptic_splicings){
637 if(not -e $predict_cryptic_splicings){
638 die "Reference FastA file for splicing predictions, $predict_cryptic_splicings, does not exist.\n";
639 }
640 if(not -e $predict_cryptic_splicings.".fai" and not -w dirname($predict_cryptic_splicings)){
641 die "Reference FastA file for splicing predictions, $predict_cryptic_splicings, is not indexed, and the directory is not writable.\n";
642 }
643 print STDERR "Loading reference FastA index...\n" unless $quiet;
644 $fasta_index = Bio::DB::Sam::Fai->load($predict_cryptic_splicings);
645 }
646
647 print STDERR "Annotating variants...\n" unless $quiet;
648 # Assume contigs and positions in order
649 print STDERR "Processing file $hgvs_table_file...\n" unless $quiet;
650 open(HGVS, $hgvs_table_file)
651 or die "Cannot open HGVS file $hgvs_table_file for reading: $!\n";
652 # Output file for annotated snps and frameshifts
653 my $header = <HGVS>; # header
654 chomp $header;
655 my @headers = split /\t/, $header;
656 my ($chr_column, $pos_column, $ref_column, $alt_column, $alt_cnt_column, $tot_cnt_column, $ftype_column, $dbid_column, $maf_src_column, $maf_column, $internal_freq_column,
657 $cdna_hgvs_column, $aa_hgvs_column, $transcript_column, $transcript_length_column, $strand_column, $zygosity_column, $splice_dist_column, $caveats_column, $phase_column,
658 $srcs_column, $pvalue_column, $to_column, $genename_column, $rares_column, $rares_header);
659 my %req_columns = (
660 "Chr" => \$chr_column,
661 "DNA From" => \$pos_column,
662 "DNA To" => \$to_column,
663 "Gene Name" => \$genename_column,
664 "Ref base" => \$ref_column,
665 "Obs base" => \$alt_column,
666 "Variant Reads" => \$alt_cnt_column,
667 "Total Reads" => \$tot_cnt_column,
668 "Feature type" => \$ftype_column,
669 "Variant DB ID" => \$dbid_column,
670 "Pop. freq. source" => \$maf_src_column,
671 "Pop. freq." => \$maf_column,
672 "Transcript HGVS" => \$cdna_hgvs_column,
673 "Protein HGVS" => \$aa_hgvs_column,
674 "Selected transcript" => \$transcript_column,
675 "Transcript length" => \$transcript_length_column,
676 "Strand" => \$strand_column,
677 "Zygosity" => \$zygosity_column,
678 "Closest exon junction (AA coding variants)" => \$splice_dist_column,
679 "Caveats" => \$caveats_column,
680 "Phase" => \$phase_column,
681 "P-value" => \$pvalue_column);
682 &load_header_columns(\%req_columns, \@headers);
683 for(my $i = 0; $i <= $#headers; $i++){
684 # Optional columns go here
685 if($headers[$i] eq "Sources"){
686 $srcs_column = $i;
687 }
688 elsif($headers[$i] eq "Internal pop. freq."){
689 $internal_freq_column = $i;
690 }
691 elsif($headers[$i] =~ /^Num rare variants/){
692 $rares_column = $i;
693 $rares_header = $headers[$i];
694 }
695 }
696 open(OUT, ">$outfile")
697 or die "Cannot open $outfile for writing: $!\n";
698 print OUT join("\t", "Chr", "DNA From", "DNA To", "Ref base", "Obs base", "% ref read","% variant read",
699 "Variant Reads", "Total Reads", "Feature type","Variant type", "Variant context", "Variant DB ID", "Pop. freq. source",
700 "Pop. freq."), (defined $internal_freq_column ? "\tInternal pop. freq.\t" : "\t"), join("\t", "SIFT Score", "Polyphen2 Score", "Polyphen2 call", "GERP score",
701 "Gene Name", "Transcript HGVS",
702 "Protein HGVS", "Zygosity","Closest exon junction (AA coding variants)","Splicing alteration potential","Splicing alteration details",
703 "OMIM Morbid Map","Tissue expression", "Pathways", "Pathway refs",
704 "Gene Function", "Spanning InterPro Domains", "P-value", "Caveats", "Phase",
705 "Selected transcript", "Transcript length", "Strand"),
706 (defined $rares_column ? "\t$rares_header" : ""),
707 (defined $srcs_column ? "\tSources" : ""), "\n";
708 while(<HGVS>){
709 chomp;
710 my @fields = split /\t/, $_, -1;
711 my $mode;
712 my $protein_hgvs = $fields[$aa_hgvs_column];
713 my $refSeq = $fields[$ref_column];
714 my $newBase = $fields[$alt_column];
715 my $splicing_potential = "NA";
716 my $splicing_details = "NA";
717 if(defined $predict_cryptic_splicings){
718 my $distance = $fields[$splice_dist_column]; # only for coding variants
719 my $site_type;
720 if($distance eq "NA"){
721 }
722 elsif($distance){
723 $site_type = $distance < 0 ? "donor" : "acceptor"; # exon context
724 }
725 else{
726 if($fields[$cdna_hgvs_column] =~ /\d+([\-+]\d+)/){ # get from the cDNA HGVS otherwise
727 $distance = $1;
728 $distance =~ s/^\+//;
729 $site_type = $distance > 0 ? "donor" : "acceptor"; # intron context
730 }
731 }
732 # only do splicing predictions for novel or rare variants, since these predictions are expensive
733 if($distance and $distance ne "NA" and ($fields[$maf_column] eq "NA" or $fields[$maf_column] <= 0.01)){
734 #print STDERR "Running prediction for rare variant ($fields[3], MAF=$fields[14]): $fields[5], $fields[6], $refSeq, $newBase, $fields[4], $distance, $site_type\n";
735 ($splicing_potential, $splicing_details) = &predict_splicing($fields[$chr_column], $fields[$pos_column], $refSeq, $newBase, $fields[$strand_column], $distance, $site_type);
736 }
737 }
738 my $lookupBase;
739 if($fields[$strand_column] eq "-"){
740 if($refSeq ne "NA"){
741 $refSeq = reverse($refSeq);
742 $refSeq =~ tr/ACGTacgt/TGCAtgca/;
743 }
744 if($newBase ne "NA"){
745 $newBase = reverse($newBase);
746 $newBase =~ tr/ACGTacgt/TGCAtgca/;
747 $newBase =~ s(TN/)(NA/);
748 $newBase =~ s(/TN)(/NA);
749 }
750 }
751 if($fields[$cdna_hgvs_column] =~ /^g\..*?(?:ins|delins|>)([ACGTN]+)$/){
752 $mode = "snps";
753 }
754 elsif($fields[$cdna_hgvs_column] =~ /^g\..*?del([ACGTN]+)$/){
755 $mode = "snps";
756 }
757 elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+_\d+(?:[\-+]\d+)?ins([ACGTN]+)$/ or $fields[$cdna_hgvs_column] =~ /^c\.\d+(?:[\-+]\d+)?_\d+ins([ACGTN]+)$/){
758 if(not length($1)%3){
759 $mode = "snps";
760 }
761 else{
762 $mode = "frameshifts";
763 }
764 }
765 elsif($fields[$cdna_hgvs_column] =~ /^c\.(\d+)(?:[\-+]\d+)?_(\d+)(?:[\-+]\d+)?delins([ACGTN]+)$/){
766 if(not (length($3)-$2+$1-1)%3){
767 $mode = "snps";
768 }
769 else{
770 $mode = "frameshifts";
771 }
772 }
773 elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+[+\-]\d+_\d+[+\-]\d+delins([ACGTN]+)$/ or
774 $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?_[\-*]\d+(?:[\-+]\d+)?del([ACGTN]*)$/ or
775 $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?del([ACGTN]*)$/ or
776 $fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+del([ACGTN]*)$/ or
777 $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?_[\-*]\d+(?:[\-+]\d+)?(?:ins|delins)?([ACGTN]+)$/){
778 $mode = "snps";
779 }
780 elsif($fields[$cdna_hgvs_column] =~ /^c\.([-*]?\d+)del(\S+)/){
781 my $dist = $1;
782 $dist =~ s/^\*//;
783 if(abs($dist) < 4){
784 $mode = "snps";
785 }
786 else{
787 $mode = "frameshifts";
788 }
789 }
790 elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+_\d+[\-+]\d+ins(\S*)/ or
791 $fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+_\d+[\-+]\d+del(\S*)/){
792 $mode = "snps";
793 }
794 elsif($fields[$cdna_hgvs_column] =~ /^c\.(-?\d+)_(\d+)(?:\+\d+)?del(\S+)/ or
795 $fields[$cdna_hgvs_column] =~ /^c\.(\d+)(?:\-\d+)?_(\d+)del(\S+)/){
796 if(not ($2-$1+1)%3){
797 $mode = "snps";
798 }
799 else{
800 $mode = "frameshifts";
801 }
802 }
803 elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+_\d+ins([ACGTN]+)$/){
804 if(not length($1)%3){
805 $mode = "snps";
806 }
807 else{
808 $mode = "frameshifts";
809 }
810 }
811 # special case of shifted start codon
812 elsif($fields[$cdna_hgvs_column] =~ /^c\.-1_1+ins([ACGTN]+)$/ or
813 $fields[$cdna_hgvs_column] =~ /inv$/){
814 $mode = "snps";
815 }
816 elsif($fields[$cdna_hgvs_column] =~ /[ACGTN]>([ACGTN])$/){
817 # todo: handle multiple consecutive substitutions, will have to run polyphen separately
818 $lookupBase = $newBase;
819 if($fields[$strand_column] eq "-"){ # revert to positive strand variant for SIFT index
820 $lookupBase =~ tr/ACGTacgt/TGCAtgca/;
821 }
822 $mode = "snps";
823 }
824 else{
825 $mode = "snps";
826 }
827 if($fields[$chr_column] ne $cur_chr){
828 #Prepared DB connection on per-chromosome basis
829 $cur_chr = $fields[$chr_column];
830 $cur_chr =~ s/^chr//;
831 $db_chr = DBI->connect("dbi:SQLite:dbname=$sift_dir/Human_CHR$cur_chr.sqlite",
832 "",
833 "",
834 { RaiseError => 1, AutoCommit => 1 }) unless not -e "$sift_dir/Human_CHR$cur_chr.sqlite";
835
836 $cur_max_pos = -1;
837 }
838 if($fields[$pos_column] =~ /^\d+$/ and $cur_max_pos < $fields[$pos_column]){
839 undef $fr_stmt;
840 undef $snp_stmt;
841 my $c = $fields[$chr_column];
842 $c = "chr$c" if $c !~ /^chr/;
843 for my $chr_table_key (keys %{$chr_tables{$c}}){
844 my $chr_table_range = $chr_tables{$c}->{$chr_table_key};
845 if($chr_table_range->[0] <= $fields[$pos_column] and $chr_table_range->[1] >= $fields[$pos_column]){
846 $fr_stmt = $db_chr->prepare("select SCORE".
847 " from $chr_table_key where COORD1 = ?");
848 $snp_stmt = $db_chr->prepare("select SCORE".
849 " from $chr_table_key where COORD1 = ? AND NT2 = ?");
850 $cur_max_pos = $chr_table_range->[1];
851 last;
852 }
853 }
854 if(not defined $fr_stmt and not $quiet){
855 warn "Got request for position not in range of SIFT ($c:$fields[$pos_column])\n";
856 }
857 }
858
859 my @cols;
860 if(not $fr_stmt){
861 @cols = ("NA");
862 }
863 elsif($mode eq "frameshifts"){
864 $fr_stmt->execute($fields[$pos_column]);
865 @cols = $fr_stmt->fetchrow_array();
866 }
867 else{
868 $snp_stmt->execute($fields[$pos_column]-1, $lookupBase);
869 @cols = $snp_stmt->fetchrow_array();
870 }
871 # See if the change is in a CDS, and if it's non-synonymous
872 my ($type, $location) = ("silent", "intron");
873 my $start = $fields[$pos_column];
874 my $stop = $fields[$to_column];
875 my $interpro = get_range_info(\%interpro_ids, $fields[$chr_column], $start, $stop);
876 my $domains = join("; ", sort keys %$interpro);
877
878 my $sift_score = "NA";
879 my $variant_key = "NA";
880 my $transcript_hgvs = $fields[$cdna_hgvs_column];
881 if($transcript_hgvs =~ /\.\d+[ACGTN]>/ or $transcript_hgvs =~ /\.\d+(?:_|del|ins)/ or $transcript_hgvs =~ /[+\-]\?/){
882 $location = "exon";
883 }
884 elsif($transcript_hgvs =~ /\.\d+[\-+](\d+)/){
885 if($1 < 21){
886 $type = "splice";
887 }
888 else{
889 $type = "intronic";
890 }
891 $location = "splice site";
892 }
893 elsif($transcript_hgvs =~ /\.\*\d+/){
894 $location = "3'UTR";
895 }
896 elsif($transcript_hgvs =~ /\.-\d+/){
897 $location = "5'UTR";
898 }
899 if($mode eq "frameshifts"){
900 $type = "frameshift";
901 }
902 else{
903 if(not defined $protein_hgvs){
904 $type = "unknown";
905 }
906 elsif($protein_hgvs eq "NA"){
907 $type = "non-coding";
908 }
909 elsif($protein_hgvs =~ /=$/){
910 $type = "silent";
911 }
912 elsif($protein_hgvs =~ /\d+\*/){ # nonsense
913 $type = "nonsense";
914 }
915 elsif($protein_hgvs =~ /ext/){ # nonsense
916 $type = "nonstop";
917 }
918 elsif($protein_hgvs eq "p.0?" or $protein_hgvs eq "p.?"){
919 $type = "unknown";
920 }
921 else{
922 $type = "missense";
923 }
924
925 $sift_score = $cols[0] || "NA";
926
927 # Record the need to fetch VCF polyphen and gerp data later (en masse, for efficiency)
928 # Use newBase instead of lookupBase for PolyPhen since the orientation is always relative to the transcript
929 $variant_key = get_variant_key($fields[$chr_column], $fields[$pos_column], $fields[$ref_column], $newBase);
930 #print STDERR "Variant key is $variant_key\n";
931 }
932 my $transcript_type = $fields[$ftype_column];
933 my $transcript_length = $fields[$transcript_length_column];
934 my $transcript_id = $fields[$transcript_column]; # sift score is derived from this transcript
935 my $transcript_strand = $fields[$strand_column];
936 my $zygosity = $fields[$zygosity_column];
937 my $rsID = $fields[$dbid_column];
938 my $popFreq = $fields[$maf_column];
939 my $internalFreq;
940 $internalFreq = $fields[$internal_freq_column] if defined $internal_freq_column;
941 my $popFreqSrc = $fields[$maf_src_column];
942
943 # Get the gene name and omim info
944 my %seen;
945 my @gene_names = split /; /, $fields[$genename_column];
946 my $morbid = join("; ", grep {/\S/ and not $seen{$_}++} map({$gene2morbids{$_} or ""} @gene_names));
947 my $tissue = join("; ", grep {/\S/ and not $seen{$_}++} map({$tissues{$_} or ""} @gene_names));
948 my $function = join("; ", grep {/\S/ and not $seen{$_}++} map({$tissue_desc{$_} or ""} @gene_names));
949 my $pathways = join("; ", grep {/\S/ and not $seen{$_}++} map({$pathways{$_} or ""} @gene_names));
950 my $pathway_ids = join("; ", grep {/\S/ and not $seen{$_}++} map({$pathway_ids{$_} or ""} @gene_names));
951
952 # It may be that the calls and freqs are multiple concatenated values, e.g. "X; Y"
953 my @tot_bases = split /; /,$fields[$tot_cnt_column];
954 my @var_bases = split /; /,$fields[$alt_cnt_column];
955 my (@pct_nonvar, @pct_var);
956 for (my $i = 0; $i <= $#tot_bases; $i++){
957 push @pct_nonvar, $tot_bases[$i] =~ /^(NA|0)$/ ? $tot_bases[$i] : int(($tot_bases[$i]-$var_bases[$i])/$tot_bases[$i]*100+0.5);
958 push @pct_var, $tot_bases[$i] =~ /^(NA|0)$/ ? $tot_bases[$i] : int($var_bases[$i]/$tot_bases[$i]*100+0.5);
959 }
960
961 push @lines, [
962 $fields[$chr_column], # chr
963 $fields[$pos_column], # pos
964 $fields[$to_column],
965 $fields[$ref_column],
966 $fields[$alt_column],
967 join("; ", @pct_nonvar), # pct ref
968 join("; ", @pct_var), # pct var
969 $fields[$alt_cnt_column], # num var
970 $fields[$tot_cnt_column], # num reads
971 $transcript_type, # protein_coding, pseudogene, etc.
972 $type,
973 $location,
974 $rsID,
975 $popFreqSrc,
976 (defined $internal_freq_column ? ($popFreq,$internalFreq) : ($popFreq)),
977 $sift_score,
978 $variant_key, # 16th field
979 join("; ", @gene_names),
980 $transcript_hgvs,
981 $protein_hgvs,
982 $zygosity,
983 $fields[$splice_dist_column], # distance to exon edge
984 $splicing_potential,
985 $splicing_details,
986 $morbid,
987 $tissue,
988 $pathways,
989 $pathway_ids,
990 $function,
991 $domains,
992 $fields[$pvalue_column],
993 $fields[$caveats_column], # caveats
994 (defined $rares_column ? ($fields[$rares_column],$transcript_id) : ($transcript_id)),
995 $transcript_length,
996 $transcript_strand,
997 (defined $srcs_column ? ($fields[$phase_column],$fields[$srcs_column]) : ($fields[$phase_column]))];
998 }
999
1000 print STDERR "Adding Polyphen2 and GERP info...\n" unless $quiet;
1001 # retrieve the polyphen and gerp info en masse for each chromosome, as this is much more efficient
1002 my $vkey_col = 16+(defined $internal_freq_column ? 1 : 0);
1003 for my $line_fields (@lines){
1004 # expand the key into the relevant MAF values
1005 $line_fields->[$vkey_col] = join("\t", polyphen_gerp_info($gerp_dir,$polyphen_file,$line_fields->[$vkey_col])); # fields[$vkey_col] is variant key
1006 print OUT join("\t", @{$line_fields}), "\n";
1007 }
1008 close(OUT);
1009 unlink $tmpfile if defined $tmpfile;
1010
1011 sub load_header_columns{
1012 my ($reqs_hash_ref, $headers_array_ref) = @_;
1013 my %unfulfilled;
1014 for my $field_name (keys %$reqs_hash_ref){
1015 $unfulfilled{$field_name} = 1;
1016 }
1017 for(my $i = 0; $i <= $#{$headers_array_ref}; $i++){
1018 for my $req_header_name (keys %$reqs_hash_ref){
1019 if($req_header_name eq $headers_array_ref->[$i]){
1020 ${$reqs_hash_ref->{$req_header_name}} = $i;
1021 delete $unfulfilled{$req_header_name};
1022 last;
1023 }
1024 }
1025 }
1026 if(keys %unfulfilled){
1027 die "Aborting. Could not find headers in the input file for the following required fields: ", join(", ", sort keys %unfulfilled), "\n";
1028 }
1029 }