comparison hgvs_table_annotate @ 0:40dd2e7ee63a default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 16:00:12 -0600
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:40dd2e7ee63a
1 #!/usr/bin/env perl
2
3 use Bio::DB::Sam;
4 use DBI;
5 use IPC::Open2;
6 use Set::IntervalTree;
7 use File::Basename;
8 use strict;
9 use warnings;
10 use vars qw($quiet %chr2variant_locs %polyphen_info %chr2polyphen_vcf_lines %chr2gerp_vcf_lines %range2genes $fasta_index $UNAFFECTED $tmpfile);
11
12 $UNAFFECTED = -299999; # sentinel for splice site predictions
13
14 if(@ARGV == 1 and $ARGV[0] eq "-v"){
15 print "Version 1.0\n";
16 exit;
17 }
18
19 # configuration file stuff
20 my %config;
21 my $dirname = dirname(__FILE__);
22 my $tool_dir = shift @ARGV;
23 if(not -e "$tool_dir/hgvs_annotate.loc"){
24 system("cp $dirname/tool-data/hgvs_annotate.loc $tool_dir/hgvs_annotate.loc");
25 }
26 open CONFIG, '<', "$tool_dir/hgvs_annotate.loc";
27 while(<CONFIG>){
28 next if $_ =~ /^#/;
29 (my $key,my $value) = split(/\s+/, $_);
30 $config{$key} = $value;
31 }
32 my $dbs_dir = $config{"dbs_dir"};
33 close CONFIG;
34
35 $quiet = 0;
36 if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){
37 $quiet = 1;
38 shift @ARGV;
39 }
40
41 @ARGV == 9 or @ARGV == 10 or die "Usage: $0 -q <SIFT dir> <POLYPHEN2 file> <GERP dir> <tissuedb file> <pathway file> <interpro domains.bed> <omim morbidmap> <input hgvs table.txt> <output hgvs table.annotated.txt> [reference.fasta]\n Where if the reference fasta is provided, we predict cryptic splicing effects (using MaxEntScan and GeneSplicer)\n";
42
43 my $sift_dir = $dbs_dir . shift @ARGV;
44 my $polyphen_file = $dbs_dir . shift @ARGV;
45 my $gerp_dir = $dbs_dir . shift @ARGV;
46 my $tissue_file = $dbs_dir . shift @ARGV;
47 my $pathway_file = $dbs_dir . shift @ARGV;
48 my $interpro_bed_file = shift @ARGV;
49 my $morbidmap_file = shift @ARGV;
50 my $hgvs_table_file = shift @ARGV;
51 my $outfile = shift @ARGV;
52 my $predict_cryptic_splicings = @ARGV ? shift @ARGV : undef;
53 my $flanking_bases = 500; # assume this is ROI around a gene
54 my @lines;
55
56 sub polyphen_gerp_info($$$){
57 my($gerp_dir,$polyphen_file,$info_key) = @_;
58
59 my($chr,$pos,$ref,$variant) = split /:/, $info_key;
60
61 # is this the first call for this chromosome? If so, retrieve the VCF lines for it en masse
62 if(not exists $chr2polyphen_vcf_lines{$chr}){
63 ($chr2polyphen_vcf_lines{$chr}, $chr2gerp_vcf_lines{$chr}) = retrieve_vcf_lines($gerp_dir,$polyphen_file,$chr);
64 }
65
66 my @results;
67 if(not $ref or not $variant){ # can only get data for SNPs
68 @results = qw(NA NA NA);
69 }
70 #print STDERR "ref = $ref and variant = $variant for $info_key\n";
71 elsif(length($ref) == 1 and length($variant) == 1){
72 #print STDERR "Grabbing poly/gerp for $chr,$pos,$ref,$variant\n";
73 @results = polyphen_info($chr,$pos,$variant);
74 $results[2] = gerp_info($chr,$pos);
75 }
76
77 # It may be that a novel MNP variant is a bunch of known SNPs in a row. In this case,
78 # report the list of variants
79 elsif(length($ref) == length($variant) and $ref ne "NA"){
80 my (@poly_scores, @poly_calls, @gerp_scores);
81 for(my $i = 0; $i < length($ref); $i++){
82 my $refbase = substr($ref, $i, 1);
83 my $newbase = substr($variant, $i, 1);
84 if($newbase eq $refbase){
85 push @poly_scores, "-";
86 push @poly_calls, "-";
87 push @gerp_scores, "-";
88 next;
89 }
90 my @base_results = polyphen_info($chr,$pos+$i,$refbase,$newbase);
91 push @poly_scores, $base_results[0];
92 push @poly_calls, $base_results[1];
93 $base_results[2] = gerp_info($chr,$pos+$i);
94 push @gerp_scores, $base_results[2];
95 }
96 $results[0] = join(",", @poly_scores);
97 $results[1] = join(",", @poly_calls);
98 $results[2] = join(",", @gerp_scores);
99 }
100 else{
101 @results = qw(NA NA NA);
102 }
103
104 return @results;
105 }
106
107 sub get_variant_key($$$$){
108 # params chr pos ref variant
109 my $c = $_[0];
110 $c =~ s/^chr//;
111 my $prop_info_key = join(":", $c, @_[1..3]);
112 $chr2variant_locs{$c} = [] unless exists $chr2variant_locs{$c};
113 push @{$chr2variant_locs{$c}}, $_[1];
114 return $prop_info_key;
115 }
116
117 sub retrieve_vcf_lines($$$){
118 my ($gerp_dir, $polyphen_file, $chr) = @_;
119
120 my (%polyphen_lines, %gerp_lines);
121
122 if(not exists $chr2variant_locs{$chr}){
123 return ({}, {}); # no data requested for this chromosome
124 }
125
126 # build up the request
127 my @tabix_regions;
128 my @var_locs = grep /^\d+$/, @{$chr2variant_locs{$chr}}; # grep keeps point mutations only
129 if(not @var_locs){
130 return ({}, {}); # no point mutations
131 }
132
133 # sort by variant start location
134 for my $var_loc (sort {$a <=> $b} @var_locs){
135 push @tabix_regions, $chr.":".$var_loc."-".$var_loc;
136 }
137 my $regions = join(" ", @tabix_regions);
138
139 # retrieve the data
140 die "Cannot find Polyphen2 VCF file $polyphen_file\n" if not -e $polyphen_file;
141 open(VCF, "tabix $polyphen_file $regions |")
142 or die "Cannot run tabix on $polyphen_file: $!\n";
143 while(<VCF>){
144 if(/^\S+\t(\d+)/ and grep {$_ eq $1} @var_locs){ # take only main columns to save room, if possible
145 chomp;
146 $polyphen_lines{$1} = [] unless exists $polyphen_lines{$1};
147 push @{$polyphen_lines{$1}}, $_;
148 }
149 }
150 close(VCF);
151
152 my $vcf_file = "$gerp_dir/chr$chr.tab.gz";
153 if(not -e $vcf_file){
154 warn "Cannot find GERP VCF file $vcf_file\n" if not $quiet;
155 return ({}, {});
156 }
157 open(VCF, "tabix $vcf_file $regions |")
158 or die "Cannot run tabix on $vcf_file: $!\n";
159 while(<VCF>){
160 if(/^(\S+\t(\d+)(?:\t\S+){2})/ and grep {$_ eq $2} @var_locs){ # take only main columns to save room, if possible
161 $gerp_lines{$2} = [] unless exists $gerp_lines{$2};
162 push @{$gerp_lines{$2}}, $1;
163 }
164 }
165 close(VCF);
166
167 return (\%polyphen_lines, \%gerp_lines);
168 }
169
170 sub polyphen_info($$$){
171 my ($chr, $pos, $variant_base) = @_;
172
173 my $key = "$chr:$pos:$variant_base";
174 if(exists $polyphen_info{$key}){
175 return @{$polyphen_info{$key}};
176 }
177
178 #print STDERR "Checking for polyphen data for $chr, $pos, $variant_base\n";
179 if(exists $chr2polyphen_vcf_lines{$chr}->{$pos}){
180 for(@{$chr2polyphen_vcf_lines{$chr}->{$pos}}){
181 #print STDERR "Checking $chr, $pos, $variant_base against $_";
182 my @fields = split /\t/, $_;
183 if($fields[4] eq $variant_base){
184 if($fields[6] eq "D"){
185 $polyphen_info{$key} = [$fields[5],"deleterious"];
186 last;
187 }
188 elsif($fields[6] eq "P"){
189 $polyphen_info{$key} = [$fields[5],"poss. del."];
190 last;
191 }
192 elsif($fields[6] eq "B"){
193 $polyphen_info{$key} = [$fields[5],"benign"];
194 last;
195 }
196 else{
197 $polyphen_info{$key} = [$fields[5],$fields[6]];
198 last;
199 }
200 }
201 }
202 if(not exists $polyphen_info{$key}){
203 $polyphen_info{$key} = ["NA", "NA"];
204 }
205 }
206 else{
207 $polyphen_info{$key} = ["NA", "NA"];
208 }
209 return @{$polyphen_info{$key}};
210 }
211
212 sub gerp_info($$){
213 my ($chr,$pos) = @_;
214
215 if(exists $chr2gerp_vcf_lines{$chr}->{$pos}){
216 for(@{$chr2gerp_vcf_lines{$chr}->{$pos}}){
217 my @fields = split /\t/, $_;
218 return $fields[3];
219 }
220 }
221 return "NA";
222 }
223
224 sub predict_splicing{
225 my ($chr, $pos, $ref, $var, $strand, $exon_edge_distance, $splice_type) = @_;
226
227 my $disruption_level = 0;
228 my @disruption_details;
229 # The methods being used only look for up to a 30 base context, so only
230 # need to check for obliterated acceptors/donors if they are nearby
231 $exon_edge_distance-- if $exon_edge_distance > 0; # +1 is actually in the right spot, so decrement
232 my $edge_offset = $strand eq "-" ? -1*$exon_edge_distance : $exon_edge_distance; # flip offset sign if the call was on the negative strand (since +/- was relative to the transcript, not the genome coordinates)
233 my $annotated_detected = 0;
234 my ($genesplicer_orig, $genesplicer_orig_pos) = call_genesplicer($splice_type, $chr, $strand, $pos-$edge_offset, $ref);
235 if($genesplicer_orig != 0){
236 $annotated_detected = 1;
237 }
238 my ($maxentscan_orig, $maxentscan_orig_pos) = call_maxentscan($splice_type, $chr, $strand, $pos-$edge_offset, $ref);
239 if(not $annotated_detected and $maxentscan_orig){
240 $annotated_detected = 1;
241 }
242 my ($genesplicer_mod, $genesplicer_mod_pos) = call_genesplicer($splice_type, $chr, $strand, $pos-$edge_offset, $ref, $var, $exon_edge_distance);
243 if($genesplicer_mod != $UNAFFECTED){ # was the mod within the range where this software could detect up or downstream effects on the annotated splice site?
244 if($genesplicer_orig_pos != $genesplicer_mod_pos){
245 $disruption_level+=0.75;
246 if($genesplicer_mod_pos == -1){
247 push @disruption_details, "Variant obliterates GeneSplicer predicted splice site from the reference sequence ($chr:$genesplicer_orig_pos, orig score $genesplicer_orig)";
248 }
249 elsif($genesplicer_orig_pos != -1){
250 push @disruption_details, "Variant moves GeneSplicer preferred splice site from annotated location $chr:$genesplicer_orig_pos to $chr:$genesplicer_mod_pos (scores: orig=$genesplicer_orig, new=$genesplicer_mod)";
251 }
252 }
253 elsif($genesplicer_orig-$genesplicer_mod > 1){
254 $disruption_level+=0.5;
255 push @disruption_details, "Variant significantly lowers GeneSplicer score for original $splice_type splice site located at $chr:$genesplicer_mod_pos ($genesplicer_mod vs. $genesplicer_orig)";
256 }
257 elsif($genesplicer_orig > $genesplicer_mod){
258 $disruption_level+=0.25;
259 push @disruption_details, "Variant slightly lowers GeneSplicer score for original $splice_type splice site located at $chr:$genesplicer_mod_pos ($genesplicer_mod vs. $genesplicer_orig)";
260 }
261 $genesplicer_orig = $genesplicer_mod; # so that splice site comparison vs. cryptic is accurate
262 }
263 my ($maxentscan_mod, $maxentscan_mod_pos) = call_maxentscan($splice_type, $chr, $strand, $pos-$edge_offset, $ref, $var, $exon_edge_distance);
264 if($maxentscan_mod != $UNAFFECTED){
265 if($maxentscan_mod_pos != $maxentscan_orig_pos){
266 $disruption_level+=1;
267 if($maxentscan_mod_pos == -1){
268 push @disruption_details, "Variant obliterates MaxEntScan predicted splice site from the reference sequence ($chr:$maxentscan_orig_pos, orig score $maxentscan_orig)";
269 }
270 elsif($maxentscan_orig_pos != -1){
271 push @disruption_details, "Variant moves MaxEntScan preferred splice site from annotated location $chr:$maxentscan_orig_pos to $chr:$maxentscan_mod_pos (scores: orig=$maxentscan_orig, new=$maxentscan_mod)";
272 }
273 }
274 elsif($maxentscan_orig-$maxentscan_mod > 1){
275 $disruption_level+=0.5;
276 push @disruption_details, "Variant significantly lowers MaxEntScan score for original $splice_type splice site located at $chr:$maxentscan_mod_pos ($maxentscan_mod vs. $maxentscan_orig)";
277 }
278 elsif($maxentscan_orig>$maxentscan_mod){
279 $disruption_level+=0.25;
280 push @disruption_details, "Variant slightly lowers MaxEntScan score for original $splice_type splice site located at $chr:$maxentscan_mod_pos ($maxentscan_mod vs. $maxentscan_orig)";
281 }
282 $maxentscan_orig = $maxentscan_mod;
283 }
284
285 # Regardless of location, see if there is an increased chance of creating a cryptic splicing site
286 my ($genesplicer_control, $genesplicer_control_pos) = call_genesplicer($splice_type, $chr, $strand, $pos, $ref);
287 my ($genesplicer_cryptic, $genesplicer_cryptic_pos) = call_genesplicer($splice_type, $chr, $strand, $pos, $ref, $var);
288 if(defined $genesplicer_orig and $genesplicer_control < $genesplicer_orig and $genesplicer_orig < $genesplicer_cryptic){
289 $disruption_level+=0.75;
290 push @disruption_details, "Variant causes GeneSplicer score for a potential cryptic $splice_type splice site at $chr:$genesplicer_cryptic_pos to become higher than the annotated splice site $chr:$genesplicer_orig_pos (orig=$genesplicer_orig vs. var=$genesplicer_cryptic)";
291 }
292 if($genesplicer_control - $genesplicer_cryptic < -1){
293 $disruption_level+=0.5;
294 push @disruption_details, "Variant raises GeneSplicer score for a potential cryptic $splice_type splice site (orig=$genesplicer_control vs. var=$genesplicer_cryptic)";
295 }
296 my ($maxentscan_control, $maxentscan_control_pos) = call_maxentscan($splice_type, $chr, $strand, $pos, $ref);
297 my ($maxentscan_cryptic, $maxentscan_cryptic_pos) = call_maxentscan($splice_type, $chr, $strand, $pos, $ref, $var);
298 if(defined $maxentscan_orig and $maxentscan_control < $maxentscan_orig and $maxentscan_orig < $maxentscan_cryptic){
299 $disruption_level++;
300 push @disruption_details, "Variant causes MaxEntScan score for a potential cryptic $splice_type splice site at $chr:$maxentscan_cryptic_pos to become higher than the annotated splice site $chr:$maxentscan_orig_pos (orig=$maxentscan_orig vs. var=$maxentscan_cryptic)";
301 }
302 if($maxentscan_control - $maxentscan_cryptic < -1 and $maxentscan_cryptic > 0){
303 $disruption_level+=0.5;
304 push @disruption_details, "Variant raises MaxEntScan score for a potential cryptic $splice_type splice site (orig=$maxentscan_control vs. var=$maxentscan_cryptic)";
305 }
306
307 # If neither method predicted the annotated donor site, and the alternate site isn't predicted either, note that so the user doesn't think the mod is definite okay
308 if(not @disruption_details and $maxentscan_orig < 0 and $genesplicer_orig < 0){
309 $disruption_level+=0.5;
310 push @disruption_details, "Neither the annotated splice site nor any potential cryptic site introduced by the variant are predicted by either GeneSplicer or MaxEntScan, therefore the alt splicing potential for this variant should be evaluated manually";
311 }
312
313 return ($disruption_level, join("; ", @disruption_details));
314 }
315
316 sub call_genesplicer{
317 my($splice_type, $chr, $strand, $pos, $ref, $var, $var_offset) = @_;
318
319 if($splice_type eq "acceptor"){
320 $pos += $strand eq "-" ? 2: -2;
321 }
322 my $num_flanking = 110; # seems to croak if less than 160bp provided
323 my $indel_size = defined $var ? length($var) - length($ref) : 0; # - is del, + is ins
324 my $site_window = defined $var_offset ? 2 : ($indel_size > 0 ? 29+$indel_size : 29); # exact position if looking for exisitng site effect, otherwise 10 or 10 + the insert size
325 my $cmd = "genesplicer";
326 my $start = $pos - $num_flanking;
327 $start = 1 if $start < 1;
328 my $stop = $pos + $num_flanking + ($indel_size < 0 ? -1*$indel_size:0); # add extra bases if a del so we still have 2*$num_flanking after the edit
329 $stop++; # end coordinate is exclusive
330 return ($UNAFFECTED, -1) if defined $var_offset and ($pos+$var_offset < $start or $pos+$var_offset > $stop); # variant is outside of range we can detect affect on original splice site
331 my $seq = $fasta_index->fetch("$chr:$start-$stop");
332 if(defined $var){
333 if(defined $var_offset){ # substitution away from the site of interest
334 substr($seq, $num_flanking+$var_offset, length($ref)) = $var;
335 }
336 else{ # substitution at the site of interest
337 substr($seq, $num_flanking, length($ref)) = $var;
338 }
339 }
340 if($strand eq "-"){
341 $seq = reverse($seq);
342 $seq =~ tr/ACGTacgt/TGCAtgca/;
343 }
344 $tmpfile = "$$.fasta";
345 open(TMP, ">$tmpfile") or die "Could not open $tmpfile for writing: $!\n";
346 print TMP ">$chr $strand $pos $ref",($var?" $var":""), ($var_offset?" $var_offset":""),"\n$seq\n";
347 substr($seq, $num_flanking, 2) = lc(substr($seq, $num_flanking, 2));
348 #print STDERR "$chr $strand $pos $ref $var $var_offset: $seq\n";
349 close(TMP);
350 #print STDERR "$cmd $tmpfile /export/achri_data/programs/GeneSplicer/human\n";
351 #<STDIN>;
352 open(GS, "$cmd $tmpfile human 2> /dev/null|")
353 or die "Could not run $cmd: $!\n";
354 my $highest_score = 0;
355 my $highest_position = -1;
356 while(<GS>){
357 # End5 End3 Score "confidence" splice_site_type
358 # E.g. :
359 # 202 203 2.994010 Medium donor
360 chomp;
361 my @F = split /\s+/, $_;
362 next if $F[1] < $F[0]; # wrong strand
363 if($splice_type eq $F[4]){
364 if(abs($F[0]-$num_flanking-1) <= $site_window){ # right type and approximate location
365 #print STDERR "*$_\n";
366 if($F[2] > $highest_score){
367 $highest_score = $F[2];
368 # last bit accounts for indels in the position offset
369 $highest_position = $pos + ($strand eq "-" ? -1: 1)*(-$num_flanking + $F[0] - 1) + (($F[0] <= $num_flanking && $strand eq "+" or $F[0] >= $num_flanking && $strand eq "-") ? -$indel_size : 0);
370 }
371 }
372 }
373 #else{print STDERR $_,"\n";}
374 }
375 close(GS);
376 return ($highest_score, $highest_position);
377 }
378
379 sub call_maxentscan{
380 my($splice_type, $chr, $strand, $pos, $ref, $var, $var_offset) = @_;
381
382 my $indel_size = defined $var ? length($var) - length($ref) : 0; # - is del, + is ins
383 my $cmd;
384 my ($num_flanking5, $num_flanking3);
385 my ($region5, $region3);
386 my ($start, $stop);
387 my $highest_score = -9999;
388 my $highest_position = -1;
389 # note that donor and acceptor are very different beasts for MaxEntScan
390 if($splice_type eq "acceptor"){
391 if($strand eq "-"){
392 $pos += 2;
393 }
394 else{
395 $pos -= 2;
396 }
397 $cmd = "score3.pl";
398 $num_flanking5 = 18;
399 $num_flanking3 = 4;
400 }
401 else{ # donor
402 $cmd = "score5.pl";
403 $num_flanking5 = 3;
404 $num_flanking3 = 5;
405 }
406 # when var_offset is defined, we are looking only at the original splice site so no need to look in the general region
407 $region5 = defined $var_offset ? 0 : $num_flanking5; # when var_offset is defined, we are looking only at the original splice site so no need to look in the general region
408 $region3 = defined $var_offset ? 0 : $num_flanking3;
409 if($strand eq "-"){
410 $start = $pos - $num_flanking3 - $region3;
411 $stop = $pos + $num_flanking5 + $region5 + ($indel_size < 0 ? -1*$indel_size:0);
412 #print STDERR "Fetching $chr:$start-$stop ($pos - $num_flanking3 - $region3)\n";
413 }
414 else{
415 $start = $pos - $num_flanking5 - $region5;
416 $stop = $pos + $num_flanking3 + $region3 + ($indel_size < 0 ? -1*$indel_size:0); # add extra bases if a del so we still have 23 or 9 bases as the case may be
417 #print STDERR "Fetching $chr:$start-$stop ($pos - $num_flanking5 - $region5)\n";
418 }
419 if(defined $var_offset and (
420 $strand eq "+" and ($pos+$var_offset < $start or $pos+$var_offset > $stop) or
421 $strand eq "-" and ($pos-$var_offset < $start or $pos-$var_offset > $stop))){ # variant is outside of range we can detect affect on original splice site
422 return ($UNAFFECTED, -1)
423 }
424
425 my $seq = $fasta_index->fetch("$chr:$start-$stop");
426 my @subseqs;
427 if(defined $var){
428 if(defined $var_offset){ # substitution away from the site of interest
429 #print STDERR "Subbing in string of length ", length($seq), "near splice site $ref -> $var at ", ($strand eq "-" ? "$region3+$num_flanking3-$var_offset":"$region5+$num_flanking5+$var_offset"), "\n";
430 substr($seq, ($strand eq "-" ? $region3+$num_flanking3-$var_offset:$region5+$num_flanking5+$var_offset), length($ref)) = $var;
431 }
432 else{ # substitution at the site of interest
433 substr($seq, ($strand eq "-" ? $region3+$num_flanking3:$region5+$num_flanking5), length($ref)) = $var;
434 }
435 }
436 if($strand eq "-"){
437 $seq = reverse($seq);
438 $seq =~ tr/ACGTacgt/TGCAtgca/;
439 }
440 # get all of the 23 or 9 base pair subsequences, depending on whether it's donors or acceptors we are looking for
441 for(my $i = 0; $i+$num_flanking5+$num_flanking3+1 <= length($seq); $i++){
442 push @subseqs, substr($seq, $i, $num_flanking5+$num_flanking3+1);
443 }
444 my $pid = open2(\*CHILD_OUT, \*CHILD_IN, "$cmd -");
445 $pid or die "Could not run $cmd: $!\n";
446 print CHILD_IN join("\n",@subseqs);
447 close(CHILD_IN);
448 while(<CHILD_OUT>){
449 chomp;
450 # out is just "seq[tab]score"
451 my @F = split /\s+/, $_;
452 if($F[1] > $highest_score){
453 #print STDERR ">";
454 $highest_score = $F[1];
455 # since we printed the subseqs in order, $. (containing the line number of the prediction output) gives us the offset in the original seq
456 $highest_position = $start + ($strand eq "-" ? length($seq) - $num_flanking5 - $. : $num_flanking5 + $. -1)+(($. <= $num_flanking5+$region5 && $strand eq "+" or $. >= $num_flanking5+$region5 && $strand eq "-") ? -$indel_size : 0);
457 }
458 #print STDERR $_,"\n";
459 }
460 close(CHILD_OUT);
461 waitpid($pid, 0);
462
463 return ($highest_score, $highest_position);
464 }
465
466 sub get_range_info($$$$){
467 my ($chr2ranges, $chr, $start, $stop) = @_;
468
469 if(not exists $chr2ranges->{$chr}){
470 return {};
471 }
472 my $range_key = "$chr:$start:$stop";
473
474 # Use a binary search to find the leftmost range of interest
475 my $left_index = 0;
476 my $right_index = $#{$chr2ranges->{$chr}};
477 my $cursor = int($right_index/2);
478 while($left_index != $right_index and $left_index != $cursor and $right_index != $cursor){
479 # need to go left
480 if($chr2ranges->{$chr}->[$cursor]->[0] >= $start){
481 $right_index = $cursor;
482 $cursor = int(($left_index+$cursor)/2);
483 }
484 # need to go to the right
485 elsif($chr2ranges->{$chr}->[$cursor]->[1] <= $stop){
486 $left_index = $cursor;
487 $cursor = int(($right_index+$cursor)/2);
488 }
489 else{
490 $right_index = $cursor;
491 $cursor--; # in the range, go left until not in the range any more
492 }
493 }
494
495 my %names;
496 for (; $cursor <= $#{$chr2ranges->{$chr}}; $cursor++){
497 my $range_ref = $chr2ranges->{$chr}->[$cursor];
498 if($range_ref->[0] > $stop){
499 last;
500 }
501 elsif($range_ref->[0] <= $stop and $range_ref->[1] >= $start){
502 $names{$range_ref->[2]}++;
503 }
504 }
505
506 #print STDERR "Names for range $chr:$start-$stop are ", join("/", keys %names), "\n";
507 return \%names;
508 }
509
510 if(not -d $sift_dir){
511 die "The specified SIFT directory $sift_dir does not exist\n";
512 }
513
514 print STDERR "Reading in OMIM morbid map...\n" unless $quiet;
515 die "Data file $morbidmap_file does not exist, aborting.\n" if not -e $morbidmap_file;
516 my %gene2morbids;
517 open(TAB, $morbidmap_file)
518 or die "Cannot open OMIM morbid map file $morbidmap_file for reading: $!\n";
519 while(<TAB>){
520 my @fields = split /\|/, $_;
521 #print STDERR "got ", ($#fields+1), " fields in $_\n";
522 #my @fields = split /\|/, $_;
523 my $morbid = $fields[0];
524 $morbid =~ s/\s*\(\d+\)$//; # get rid of trailing parenthetical number
525 my $omim_id = $fields[2];
526 for my $genename (split /, /, $fields[1]){
527 if(not exists $gene2morbids{$genename}){
528 $gene2morbids{$genename} = "OMIM:$omim_id $morbid";
529 }
530 else{
531 $gene2morbids{$genename} .= "; $morbid";
532 }
533 }
534 }
535 close(TAB);
536
537 print STDERR "Reading in interpro mappings...\n" unless $quiet;
538 die "Data file $interpro_bed_file does not exist, aborting.\n" if not -e $interpro_bed_file;
539 my %interpro_ids;
540 open(TAB, $interpro_bed_file)
541 or die "Cannot open gene name BED file $interpro_bed_file for reading: $!\n";
542 while(<TAB>){
543 chomp;
544 # format should be "chr start stop interpro desc..."
545 my @fields = split /\t/, $_;
546 my $c = $fields[0];
547 if(not exists $interpro_ids{$c}){
548 $interpro_ids{$c} = [] unless exists $interpro_ids{$c};
549 $interpro_ids{"chr".$c} = [] unless $c =~ /^chr/;
550 $interpro_ids{$1} = [] if $c =~ /^chr(\S+)/;
551 }
552 my $dataref = [@fields[1..3]];
553 push @{$interpro_ids{$c}}, $dataref;
554 push @{$interpro_ids{"chr".$c}}, $dataref unless $c =~ /^chr/;
555 push @{$interpro_ids{$1}}, $dataref if $c =~ /^chr(\S+)/;
556 }
557 close(TAB);
558 print STDERR "Sorting interpro matches per contig...\n" unless $quiet;
559 for my $chr (keys %interpro_ids){
560 $interpro_ids{$chr} = [sort {$a->[0] <=> $b->[0]} @{$interpro_ids{$chr}}];
561 }
562
563 # {uc(gene_name) => space separated info}
564 print STDERR "Reading in tissue info...\n" unless $quiet;
565 my %tissues;
566 my %tissue_desc;
567 open(TISSUE, $tissue_file)
568 or die "Cannot open $tissue_file for reading: $!\n";
569 while(<TISSUE>){
570 chomp;
571 my @fields = split /\t/, $_;
572 my $gname = uc($fields[1]);
573 $tissue_desc{$gname} = $fields[2];
574 next unless $#fields == 4;
575 my @tis = split /;\s*/, $fields[4];
576 pop @tis;
577 if(@tis < 6){
578 $tissues{$gname} = join(">", @tis);
579 }
580 elsif(@tis < 24){
581 $tissues{$gname} = join(">", @tis[0..5]).">...";
582 }
583 else{
584 $tissues{$gname} = join(">", @tis[0..(int($#tis/5))]).">...";
585 }
586 }
587 close(TISSUE);
588
589 print STDERR "Loading pathway info...\n" unless $quiet;
590 my %pathways;
591 my %pathway_ids;
592 open(PATHWAY, $pathway_file)
593 or die "Cannot open $pathway_file for reading: $!\n";
594 while(<PATHWAY>){
595 chomp;
596 my @fields = split /\t/, $_;
597 next unless @fields >= 4;
598 for my $gname (split /\s+/, $fields[1]){
599 $gname = uc($gname);
600 if(not exists $pathways{$gname}){
601 $pathway_ids{$gname} = $fields[2];
602 $pathways{$gname} = $fields[3];
603 }
604 else{
605 $pathway_ids{$gname} .= "; ".$fields[2];
606 $pathways{$gname} .= "; ".$fields[3];
607 }
608 }
609 }
610 close(PATHWAY);
611
612 print STDERR "Loading SIFT indices...\n" unless $quiet;
613 # Read in the database table list
614 my %chr_tables;
615 open(DBLIST, "$sift_dir/bins.list")
616 or die "Cannot open $sift_dir/bins.list for reading: $!\n";
617 while(<DBLIST>){
618 chomp;
619 my @fields = split /\t/, $_;
620 if(not exists $chr_tables{$fields[1]}){
621 $chr_tables{$fields[1]} = {};
622 }
623 my $chr = $fields[1];
624 $chr = "chr$chr" unless $chr =~ /^chr/;
625 $chr_tables{$chr}->{"$chr\_$fields[2]_$fields[3]"} = [$fields[2],$fields[3]];
626 }
627 close(DBLIST);
628
629 #Connect to database
630 my $db_gene = DBI->connect("dbi:SQLite:dbname=$sift_dir/Human_Supp.sqlite","","",{RaiseError=>1, AutoCommit=>1});
631
632 my $db_chr;
633 my $fr_stmt; # retrieval for frameshift
634 my $snp_stmt; # retrieval for SNP
635 my $cur_chr = "";
636 my $cur_max_pos;
637
638 if(defined $predict_cryptic_splicings){
639 if(not -e $predict_cryptic_splicings){
640 die "Reference FastA file for splicing predictions, $predict_cryptic_splicings, does not exist.\n";
641 }
642 if(not -e $predict_cryptic_splicings.".fai" and not -w dirname($predict_cryptic_splicings)){
643 die "Reference FastA file for splicing predictions, $predict_cryptic_splicings, is not indexed, and the directory is not writable.\n";
644 }
645 print STDERR "Loading reference FastA index...\n" unless $quiet;
646 $fasta_index = Bio::DB::Sam::Fai->load($predict_cryptic_splicings);
647 }
648
649 print STDERR "Annotating variants...\n" unless $quiet;
650 # Assume contigs and positions in order
651 print STDERR "Processing file $hgvs_table_file...\n" unless $quiet;
652 open(HGVS, $hgvs_table_file)
653 or die "Cannot open HGVS file $hgvs_table_file for reading: $!\n";
654 # Output file for annotated snps and frameshifts
655 my $header = <HGVS>; # header
656 chomp $header;
657 my @headers = split /\t/, $header;
658 my ($chr_column, $pos_column, $ref_column, $alt_column, $alt_cnt_column, $tot_cnt_column, $ftype_column, $dbid_column, $maf_src_column, $maf_column, $internal_freq_column,
659 $cdna_hgvs_column, $aa_hgvs_column, $transcript_column, $transcript_length_column, $strand_column, $zygosity_column, $splice_dist_column, $caveats_column, $phase_column,
660 $srcs_column, $pvalue_column, $to_column, $genename_column, $rares_column, $rares_header);
661 my %req_columns = (
662 "Chr" => \$chr_column,
663 "DNA From" => \$pos_column,
664 "DNA To" => \$to_column,
665 "Gene Name" => \$genename_column,
666 "Ref base" => \$ref_column,
667 "Obs base" => \$alt_column,
668 "Variant Reads" => \$alt_cnt_column,
669 "Total Reads" => \$tot_cnt_column,
670 "Feature type" => \$ftype_column,
671 "Variant DB ID" => \$dbid_column,
672 "Pop. freq. source" => \$maf_src_column,
673 "Pop. freq." => \$maf_column,
674 "Transcript HGVS" => \$cdna_hgvs_column,
675 "Protein HGVS" => \$aa_hgvs_column,
676 "Selected transcript" => \$transcript_column,
677 "Transcript length" => \$transcript_length_column,
678 "Strand" => \$strand_column,
679 "Zygosity" => \$zygosity_column,
680 "Closest exon junction (AA coding variants)" => \$splice_dist_column,
681 "Caveats" => \$caveats_column,
682 "Phase" => \$phase_column,
683 "P-value" => \$pvalue_column);
684 &load_header_columns(\%req_columns, \@headers);
685 for(my $i = 0; $i <= $#headers; $i++){
686 # Optional columns go here
687 if($headers[$i] eq "Sources"){
688 $srcs_column = $i;
689 }
690 elsif($headers[$i] eq "Internal pop. freq."){
691 $internal_freq_column = $i;
692 }
693 elsif($headers[$i] =~ /^Num rare variants/){
694 $rares_column = $i;
695 $rares_header = $headers[$i];
696 }
697 }
698 open(OUT, ">$outfile")
699 or die "Cannot open $outfile for writing: $!\n";
700 print OUT join("\t", "Chr", "DNA From", "DNA To", "Ref base", "Obs base", "% ref read","% variant read",
701 "Variant Reads", "Total Reads", "Feature type","Variant type", "Variant context", "Variant DB ID", "Pop. freq. source",
702 "Pop. freq."), (defined $internal_freq_column ? "\tInternal pop. freq.\t" : "\t"), join("\t", "SIFT Score", "Polyphen2 Score", "Polyphen2 call", "GERP score",
703 "Gene Name", "Transcript HGVS",
704 "Protein HGVS", "Zygosity","Closest exon junction (AA coding variants)","Splicing alteration potential","Splicing alteration details",
705 "OMIM Morbid Map","Tissue expression", "Pathways", "Pathway refs",
706 "Gene Function", "Spanning InterPro Domains", "P-value", "Caveats", "Phase",
707 "Selected transcript", "Transcript length", "Strand"),
708 (defined $rares_column ? "\t$rares_header" : ""),
709 (defined $srcs_column ? "\tSources" : ""), "\n";
710 while(<HGVS>){
711 chomp;
712 my @fields = split /\t/, $_, -1;
713 my $mode;
714 my $protein_hgvs = $fields[$aa_hgvs_column];
715 my $refSeq = $fields[$ref_column];
716 my $newBase = $fields[$alt_column];
717 my $splicing_potential = "NA";
718 my $splicing_details = "NA";
719 if(defined $predict_cryptic_splicings){
720 my $distance = $fields[$splice_dist_column]; # only for coding variants
721 my $site_type;
722 if($distance eq "NA"){
723 }
724 elsif($distance){
725 $site_type = $distance < 0 ? "donor" : "acceptor"; # exon context
726 }
727 else{
728 if($fields[$cdna_hgvs_column] =~ /\d+([\-+]\d+)/){ # get from the cDNA HGVS otherwise
729 $distance = $1;
730 $distance =~ s/^\+//;
731 $site_type = $distance > 0 ? "donor" : "acceptor"; # intron context
732 }
733 }
734 # only do splicing predictions for novel or rare variants, since these predictions are expensive
735 if($distance and $distance ne "NA" and ($fields[$maf_column] eq "NA" or $fields[$maf_column] <= 0.01)){
736 #print STDERR "Running prediction for rare variant ($fields[3], MAF=$fields[14]): $fields[5], $fields[6], $refSeq, $newBase, $fields[4], $distance, $site_type\n";
737 ($splicing_potential, $splicing_details) = &predict_splicing($fields[$chr_column], $fields[$pos_column], $refSeq, $newBase, $fields[$strand_column], $distance, $site_type);
738 }
739 }
740 my $lookupBase;
741 if($fields[$strand_column] eq "-"){
742 if($refSeq ne "NA"){
743 $refSeq = reverse($refSeq);
744 $refSeq =~ tr/ACGTacgt/TGCAtgca/;
745 }
746 if($newBase ne "NA"){
747 $newBase = reverse($newBase);
748 $newBase =~ tr/ACGTacgt/TGCAtgca/;
749 $newBase =~ s(TN/)(NA/);
750 $newBase =~ s(/TN)(/NA);
751 }
752 }
753 if($fields[$cdna_hgvs_column] =~ /^g\..*?(?:ins|delins|>)([ACGTN]+)$/){
754 $mode = "snps";
755 }
756 elsif($fields[$cdna_hgvs_column] =~ /^g\..*?del([ACGTN]+)$/){
757 $mode = "snps";
758 }
759 elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+_\d+(?:[\-+]\d+)?ins([ACGTN]+)$/ or $fields[$cdna_hgvs_column] =~ /^c\.\d+(?:[\-+]\d+)?_\d+ins([ACGTN]+)$/){
760 if(not length($1)%3){
761 $mode = "snps";
762 }
763 else{
764 $mode = "frameshifts";
765 }
766 }
767 elsif($fields[$cdna_hgvs_column] =~ /^c\.(\d+)(?:[\-+]\d+)?_(\d+)(?:[\-+]\d+)?delins([ACGTN]+)$/){
768 if(not (length($3)-$2+$1-1)%3){
769 $mode = "snps";
770 }
771 else{
772 $mode = "frameshifts";
773 }
774 }
775 elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+[+\-]\d+_\d+[+\-]\d+delins([ACGTN]+)$/ or
776 $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?_[\-*]\d+(?:[\-+]\d+)?del([ACGTN]*)$/ or
777 $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?del([ACGTN]*)$/ or
778 $fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+del([ACGTN]*)$/ or
779 $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?_[\-*]\d+(?:[\-+]\d+)?(?:ins|delins)?([ACGTN]+)$/){
780 $mode = "snps";
781 }
782 elsif($fields[$cdna_hgvs_column] =~ /^c\.([-*]?\d+)del(\S+)/){
783 my $dist = $1;
784 $dist =~ s/^\*//;
785 if(abs($dist) < 4){
786 $mode = "snps";
787 }
788 else{
789 $mode = "frameshifts";
790 }
791 }
792 elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+_\d+[\-+]\d+ins(\S*)/ or
793 $fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+_\d+[\-+]\d+del(\S*)/){
794 $mode = "snps";
795 }
796 elsif($fields[$cdna_hgvs_column] =~ /^c\.(-?\d+)_(\d+)(?:\+\d+)?del(\S+)/ or
797 $fields[$cdna_hgvs_column] =~ /^c\.(\d+)(?:\-\d+)?_(\d+)del(\S+)/){
798 if(not ($2-$1+1)%3){
799 $mode = "snps";
800 }
801 else{
802 $mode = "frameshifts";
803 }
804 }
805 elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+_\d+ins([ACGTN]+)$/){
806 if(not length($1)%3){
807 $mode = "snps";
808 }
809 else{
810 $mode = "frameshifts";
811 }
812 }
813 # special case of shifted start codon
814 elsif($fields[$cdna_hgvs_column] =~ /^c\.-1_1+ins([ACGTN]+)$/ or
815 $fields[$cdna_hgvs_column] =~ /inv$/){
816 $mode = "snps";
817 }
818 elsif($fields[$cdna_hgvs_column] =~ /[ACGTN]>([ACGTN])$/){
819 # todo: handle multiple consecutive substitutions, will have to run polyphen separately
820 $lookupBase = $newBase;
821 if($fields[$strand_column] eq "-"){ # revert to positive strand variant for SIFT index
822 $lookupBase =~ tr/ACGTacgt/TGCAtgca/;
823 }
824 $mode = "snps";
825 }
826 else{
827 $mode = "snps";
828 }
829 if($fields[$chr_column] ne $cur_chr){
830 #Prepared DB connection on per-chromosome basis
831 $cur_chr = $fields[$chr_column];
832 $cur_chr =~ s/^chr//;
833 $db_chr = DBI->connect("dbi:SQLite:dbname=$sift_dir/Human_CHR$cur_chr.sqlite",
834 "",
835 "",
836 { RaiseError => 1, AutoCommit => 1 }) unless not -e "$sift_dir/Human_CHR$cur_chr.sqlite";
837
838 $cur_max_pos = -1;
839 }
840 if($fields[$pos_column] =~ /^\d+$/ and $cur_max_pos < $fields[$pos_column]){
841 undef $fr_stmt;
842 undef $snp_stmt;
843 my $c = $fields[$chr_column];
844 $c = "chr$c" if $c !~ /^chr/;
845 for my $chr_table_key (keys %{$chr_tables{$c}}){
846 my $chr_table_range = $chr_tables{$c}->{$chr_table_key};
847 if($chr_table_range->[0] <= $fields[$pos_column] and $chr_table_range->[1] >= $fields[$pos_column]){
848 $fr_stmt = $db_chr->prepare("select SCORE".
849 " from $chr_table_key where COORD1 = ?");
850 $snp_stmt = $db_chr->prepare("select SCORE".
851 " from $chr_table_key where COORD1 = ? AND NT2 = ?");
852 $cur_max_pos = $chr_table_range->[1];
853 last;
854 }
855 }
856 if(not defined $fr_stmt and not $quiet){
857 warn "Got request for position not in range of SIFT ($c:$fields[$pos_column])\n";
858 }
859 }
860
861 my @cols;
862 if(not $fr_stmt){
863 @cols = ("NA");
864 }
865 elsif($mode eq "frameshifts"){
866 $fr_stmt->execute($fields[$pos_column]);
867 @cols = $fr_stmt->fetchrow_array();
868 }
869 else{
870 $snp_stmt->execute($fields[$pos_column]-1, $lookupBase);
871 @cols = $snp_stmt->fetchrow_array();
872 }
873 # See if the change is in a CDS, and if it's non-synonymous
874 my ($type, $location) = ("silent", "intron");
875 my $start = $fields[$pos_column];
876 my $stop = $fields[$to_column];
877 my $interpro = get_range_info(\%interpro_ids, $fields[$chr_column], $start, $stop);
878 my $domains = join("; ", sort keys %$interpro);
879
880 my $sift_score = "NA";
881 my $variant_key = "NA";
882 my $transcript_hgvs = $fields[$cdna_hgvs_column];
883 if($transcript_hgvs =~ /\.\d+[ACGTN]>/ or $transcript_hgvs =~ /\.\d+(?:_|del|ins)/ or $transcript_hgvs =~ /[+\-]\?/){
884 $location = "exon";
885 }
886 elsif($transcript_hgvs =~ /\.\d+[\-+](\d+)/){
887 if($1 < 21){
888 $type = "splice";
889 }
890 else{
891 $type = "intronic";
892 }
893 $location = "splice site";
894 }
895 elsif($transcript_hgvs =~ /\.\*\d+/){
896 $location = "3'UTR";
897 }
898 elsif($transcript_hgvs =~ /\.-\d+/){
899 $location = "5'UTR";
900 }
901 if($mode eq "frameshifts"){
902 $type = "frameshift";
903 }
904 else{
905 if(not defined $protein_hgvs){
906 $type = "unknown";
907 }
908 elsif($protein_hgvs eq "NA"){
909 $type = "non-coding";
910 }
911 elsif($protein_hgvs =~ /=$/){
912 $type = "silent";
913 }
914 elsif($protein_hgvs =~ /\d+\*/){ # nonsense
915 $type = "nonsense";
916 }
917 elsif($protein_hgvs =~ /ext/){ # nonsense
918 $type = "nonstop";
919 }
920 elsif($protein_hgvs eq "p.0?" or $protein_hgvs eq "p.?"){
921 $type = "unknown";
922 }
923 else{
924 $type = "missense";
925 }
926
927 $sift_score = $cols[0] || "NA";
928
929 # Record the need to fetch VCF polyphen and gerp data later (en masse, for efficiency)
930 # Use newBase instead of lookupBase for PolyPhen since the orientation is always relative to the transcript
931 $variant_key = get_variant_key($fields[$chr_column], $fields[$pos_column], $fields[$ref_column], $newBase);
932 #print STDERR "Variant key is $variant_key\n";
933 }
934 my $transcript_type = $fields[$ftype_column];
935 my $transcript_length = $fields[$transcript_length_column];
936 my $transcript_id = $fields[$transcript_column]; # sift score is derived from this transcript
937 my $transcript_strand = $fields[$strand_column];
938 my $zygosity = $fields[$zygosity_column];
939 my $rsID = $fields[$dbid_column];
940 my $popFreq = $fields[$maf_column];
941 my $internalFreq;
942 $internalFreq = $fields[$internal_freq_column] if defined $internal_freq_column;
943 my $popFreqSrc = $fields[$maf_src_column];
944
945 # Get the gene name and omim info
946 my %seen;
947 my @gene_names = split /; /, $fields[$genename_column];
948 my $morbid = join("; ", grep {/\S/ and not $seen{$_}++} map({$gene2morbids{$_} or ""} @gene_names));
949 my $tissue = join("; ", grep {/\S/ and not $seen{$_}++} map({$tissues{$_} or ""} @gene_names));
950 my $function = join("; ", grep {/\S/ and not $seen{$_}++} map({$tissue_desc{$_} or ""} @gene_names));
951 my $pathways = join("; ", grep {/\S/ and not $seen{$_}++} map({$pathways{$_} or ""} @gene_names));
952 my $pathway_ids = join("; ", grep {/\S/ and not $seen{$_}++} map({$pathway_ids{$_} or ""} @gene_names));
953
954 # It may be that the calls and freqs are multiple concatenated values, e.g. "X; Y"
955 my @tot_bases = split /; /,$fields[$tot_cnt_column];
956 my @var_bases = split /; /,$fields[$alt_cnt_column];
957 my (@pct_nonvar, @pct_var);
958 for (my $i = 0; $i <= $#tot_bases; $i++){
959 push @pct_nonvar, $tot_bases[$i] =~ /^(NA|0)$/ ? $tot_bases[$i] : int(($tot_bases[$i]-$var_bases[$i])/$tot_bases[$i]*100+0.5);
960 push @pct_var, $tot_bases[$i] =~ /^(NA|0)$/ ? $tot_bases[$i] : int($var_bases[$i]/$tot_bases[$i]*100+0.5);
961 }
962
963 push @lines, [
964 $fields[$chr_column], # chr
965 $fields[$pos_column], # pos
966 $fields[$to_column],
967 $fields[$ref_column],
968 $fields[$alt_column],
969 join("; ", @pct_nonvar), # pct ref
970 join("; ", @pct_var), # pct var
971 $fields[$alt_cnt_column], # num var
972 $fields[$tot_cnt_column], # num reads
973 $transcript_type, # protein_coding, pseudogene, etc.
974 $type,
975 $location,
976 $rsID,
977 $popFreqSrc,
978 (defined $internal_freq_column ? ($popFreq,$internalFreq) : ($popFreq)),
979 $sift_score,
980 $variant_key, # 16th field
981 join("; ", @gene_names),
982 $transcript_hgvs,
983 $protein_hgvs,
984 $zygosity,
985 $fields[$splice_dist_column], # distance to exon edge
986 $splicing_potential,
987 $splicing_details,
988 $morbid,
989 $tissue,
990 $pathways,
991 $pathway_ids,
992 $function,
993 $domains,
994 $fields[$pvalue_column],
995 $fields[$caveats_column], # caveats
996 (defined $rares_column ? ($fields[$rares_column],$transcript_id) : ($transcript_id)),
997 $transcript_length,
998 $transcript_strand,
999 (defined $srcs_column ? ($fields[$phase_column],$fields[$srcs_column]) : ($fields[$phase_column]))];
1000 }
1001
1002 print STDERR "Adding Polyphen2 and GERP info...\n" unless $quiet;
1003 # retrieve the polyphen and gerp info en masse for each chromosome, as this is much more efficient
1004 my $vkey_col = 16+(defined $internal_freq_column ? 1 : 0);
1005 for my $line_fields (@lines){
1006 # expand the key into the relevant MAF values
1007 $line_fields->[$vkey_col] = join("\t", polyphen_gerp_info($gerp_dir,$polyphen_file,$line_fields->[$vkey_col])); # fields[$vkey_col] is variant key
1008 print OUT join("\t", @{$line_fields}), "\n";
1009 }
1010 close(OUT);
1011 unlink $tmpfile if defined $tmpfile;
1012
1013 sub load_header_columns{
1014 my ($reqs_hash_ref, $headers_array_ref) = @_;
1015 my %unfulfilled;
1016 for my $field_name (keys %$reqs_hash_ref){
1017 $unfulfilled{$field_name} = 1;
1018 }
1019 for(my $i = 0; $i <= $#{$headers_array_ref}; $i++){
1020 for my $req_header_name (keys %$reqs_hash_ref){
1021 if($req_header_name eq $headers_array_ref->[$i]){
1022 ${$reqs_hash_ref->{$req_header_name}} = $i;
1023 delete $unfulfilled{$req_header_name};
1024 last;
1025 }
1026 }
1027 }
1028 if(keys %unfulfilled){
1029 die "Aborting. Could not find headers in the input file for the following required fields: ", join(", ", sort keys %unfulfilled), "\n";
1030 }
1031 }