0
|
1 #!/usr/bin/env perl
|
|
2
|
|
3 use Bio::DB::Sam;
|
|
4 use DBI;
|
|
5 use IPC::Open2;
|
|
6 use Set::IntervalTree;
|
|
7 use strict;
|
|
8 use warnings;
|
|
9 use vars qw($quiet %chr2variant_locs %polyphen_info %chr2polyphen_vcf_lines %chr2gerp_vcf_lines %range2genes $fasta_index $UNAFFECTED $tmpfile);
|
|
10 use File::Basename;
|
|
11 $UNAFFECTED = -299999; # sentinel for splice site predictions
|
|
12 my $current_directory = dirname(__FILE__);
|
|
13 if(@ARGV == 1 and $ARGV[0] eq "-v"){
|
|
14 print "Version 1.0\n";
|
|
15 exit;
|
|
16 }
|
|
17
|
|
18 #configuration file stuff
|
|
19 my %config;
|
|
20 my $tool_dir = shift @ARGV;
|
|
21 if(not -e "$tool_dir/annotate_hgvs.loc"){
|
|
22 system("cp $current_directory/tool-data/annotate_hgvs.loc $tool_dir/annotate_hgvs.loc");
|
|
23 }
|
|
24 open CONFIG, '<', "$tool_dir/annotate_hgvs.loc";
|
|
25 while(<CONFIG>){
|
|
26 next if $_ =~ /^#/;
|
|
27 (my $key, my $value) = split(/\s+/,$_);
|
|
28 $config{$key} = $value;
|
|
29 }
|
|
30 close CONFIG;
|
|
31 my $dbs_dir = $config{"dbs_dir"};
|
|
32
|
|
33 $quiet = 0;
|
|
34 if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){
|
|
35 $quiet = 1;
|
|
36 shift @ARGV;
|
|
37 }
|
|
38
|
|
39 @ARGV == 9 or @ARGV == 10 or die "Usage: $0 -q <SIFT dir> <POLYPHEN2 file> <GERP dir> <tissuedb file> <pathway file> <interpro domains.bed> <omim morbidmap> <input hgvs table.txt> <output hgvs table.annotated.txt> [reference.fasta]\n Where if the reference fasta is provided, we predict cryptic splicing effects (using MaxEntScan and GeneSplicer)\n";
|
|
40
|
|
41 my $sift_dir = $dbs_dir . shift @ARGV;
|
|
42 my $polyphen_file = $dbs_dir . shift @ARGV;
|
|
43 my $gerp_dir = $dbs_dir . shift @ARGV;
|
|
44 my $tissue_file = $dbs_dir . shift @ARGV;
|
|
45 my $pathway_file = $dbs_dir. shift @ARGV;
|
|
46 my $interpro_bed_file = $dbs_dir . shift @ARGV;
|
|
47 my $morbidmap_file = $dbs_dir . shift @ARGV;
|
|
48 my $hgvs_table_file = shift @ARGV;
|
|
49 my $outfile = shift @ARGV;
|
|
50 my $predict_cryptic_splicings = @ARGV ? ($dbs_dir . shift @ARGV) : undef;
|
|
51 my $flanking_bases = 500; # assume this is ROI around a gene
|
|
52 my @lines;
|
|
53
|
|
54 sub polyphen_gerp_info($$$){
|
|
55 my($gerp_dir,$polyphen_file,$info_key) = @_;
|
|
56
|
|
57 my($chr,$pos,$ref,$variant) = split /:/, $info_key;
|
|
58
|
|
59 # is this the first call for this chromosome? If so, retrieve the VCF lines for it en masse
|
|
60 if(not exists $chr2polyphen_vcf_lines{$chr}){
|
|
61 ($chr2polyphen_vcf_lines{$chr}, $chr2gerp_vcf_lines{$chr}) = retrieve_vcf_lines($gerp_dir,$polyphen_file,$chr);
|
|
62 }
|
|
63
|
|
64 my @results;
|
|
65 if(not $ref or not $variant){ # can only get data for SNPs
|
|
66 @results = qw(NA NA NA);
|
|
67 }
|
|
68 #print STDERR "ref = $ref and variant = $variant for $info_key\n";
|
|
69 elsif(length($ref) == 1 and length($variant) == 1){
|
|
70 #print STDERR "Grabbing poly/gerp for $chr,$pos,$ref,$variant\n";
|
|
71 @results = polyphen_info($chr,$pos,$variant);
|
|
72 $results[2] = gerp_info($chr,$pos);
|
|
73 }
|
|
74
|
|
75 # It may be that a novel MNP variant is a bunch of known SNPs in a row. In this case,
|
|
76 # report the list of variants
|
|
77 elsif(length($ref) == length($variant) and $ref ne "NA"){
|
|
78 my (@poly_scores, @poly_calls, @gerp_scores);
|
|
79 for(my $i = 0; $i < length($ref); $i++){
|
|
80 my $refbase = substr($ref, $i, 1);
|
|
81 my $newbase = substr($variant, $i, 1);
|
|
82 if($newbase eq $refbase){
|
|
83 push @poly_scores, "-";
|
|
84 push @poly_calls, "-";
|
|
85 push @gerp_scores, "-";
|
|
86 next;
|
|
87 }
|
|
88 my @base_results = polyphen_info($chr,$pos+$i,$refbase,$newbase);
|
|
89 push @poly_scores, $base_results[0];
|
|
90 push @poly_calls, $base_results[1];
|
|
91 $base_results[2] = gerp_info($chr,$pos+$i);
|
|
92 push @gerp_scores, $base_results[2];
|
|
93 }
|
|
94 $results[0] = join(",", @poly_scores);
|
|
95 $results[1] = join(",", @poly_calls);
|
|
96 $results[2] = join(",", @gerp_scores);
|
|
97 }
|
|
98 else{
|
|
99 @results = qw(NA NA NA);
|
|
100 }
|
|
101
|
|
102 return @results;
|
|
103 }
|
|
104
|
|
105 sub get_variant_key($$$$){
|
|
106 # params chr pos ref variant
|
|
107 my $c = $_[0];
|
|
108 $c =~ s/^chr//;
|
|
109 my $prop_info_key = join(":", $c, @_[1..3]);
|
|
110 $chr2variant_locs{$c} = [] unless exists $chr2variant_locs{$c};
|
|
111 push @{$chr2variant_locs{$c}}, $_[1];
|
|
112 return $prop_info_key;
|
|
113 }
|
|
114
|
|
115 sub retrieve_vcf_lines($$$){
|
|
116 my ($gerp_dir, $polyphen_file, $chr) = @_;
|
|
117
|
|
118 my (%polyphen_lines, %gerp_lines);
|
|
119
|
|
120 if(not exists $chr2variant_locs{$chr}){
|
|
121 return ({}, {}); # no data requested for this chromosome
|
|
122 }
|
|
123
|
|
124 # build up the request
|
|
125 my @tabix_regions;
|
|
126 my @var_locs = grep /^\d+$/, @{$chr2variant_locs{$chr}}; # grep keeps point mutations only
|
|
127 if(not @var_locs){
|
|
128 return ({}, {}); # no point mutations
|
|
129 }
|
|
130
|
|
131 # sort by variant start location
|
|
132 for my $var_loc (sort {$a <=> $b} @var_locs){
|
|
133 push @tabix_regions, $chr.":".$var_loc."-".$var_loc;
|
|
134 }
|
|
135 my $regions = join(" ", @tabix_regions);
|
|
136
|
|
137 # retrieve the data
|
|
138 die "Cannot find Polyphen2 VCF file $polyphen_file\n" if not -e $polyphen_file;
|
|
139 open(VCF, "tabix $polyphen_file $regions |")
|
|
140 or die "Cannot run tabix on $polyphen_file: $!\n";
|
|
141 while(<VCF>){
|
|
142 if(/^\S+\t(\d+)/ and grep {$_ eq $1} @var_locs){ # take only main columns to save room, if possible
|
|
143 chomp;
|
|
144 $polyphen_lines{$1} = [] unless exists $polyphen_lines{$1};
|
|
145 push @{$polyphen_lines{$1}}, $_;
|
|
146 }
|
|
147 }
|
|
148 close(VCF);
|
|
149
|
|
150 my $vcf_file = "$gerp_dir/chr$chr.tab.gz";
|
|
151 if(not -e $vcf_file){
|
|
152 warn "Cannot find GERP VCF file $vcf_file\n" if not $quiet;
|
|
153 return ({}, {});
|
|
154 }
|
|
155 open(VCF, "tabix $vcf_file $regions |")
|
|
156 or die "Cannot run tabix on $vcf_file: $!\n";
|
|
157 while(<VCF>){
|
|
158 if(/^(\S+\t(\d+)(?:\t\S+){2})/ and grep {$_ eq $2} @var_locs){ # take only main columns to save room, if possible
|
|
159 $gerp_lines{$2} = [] unless exists $gerp_lines{$2};
|
|
160 push @{$gerp_lines{$2}}, $1;
|
|
161 }
|
|
162 }
|
|
163 close(VCF);
|
|
164
|
|
165 return (\%polyphen_lines, \%gerp_lines);
|
|
166 }
|
|
167
|
|
168 sub polyphen_info($$$){
|
|
169 my ($chr, $pos, $variant_base) = @_;
|
|
170
|
|
171 my $key = "$chr:$pos:$variant_base";
|
|
172 if(exists $polyphen_info{$key}){
|
|
173 return @{$polyphen_info{$key}};
|
|
174 }
|
|
175
|
|
176 #print STDERR "Checking for polyphen data for $chr, $pos, $variant_base\n";
|
|
177 if(exists $chr2polyphen_vcf_lines{$chr}->{$pos}){
|
|
178 for(@{$chr2polyphen_vcf_lines{$chr}->{$pos}}){
|
|
179 #print STDERR "Checking $chr, $pos, $variant_base against $_";
|
|
180 my @fields = split /\t/, $_;
|
|
181 if($fields[4] eq $variant_base){
|
|
182 if($fields[6] eq "D"){
|
|
183 $polyphen_info{$key} = [$fields[5],"deleterious"];
|
|
184 last;
|
|
185 }
|
|
186 elsif($fields[6] eq "P"){
|
|
187 $polyphen_info{$key} = [$fields[5],"poss. del."];
|
|
188 last;
|
|
189 }
|
|
190 elsif($fields[6] eq "B"){
|
|
191 $polyphen_info{$key} = [$fields[5],"benign"];
|
|
192 last;
|
|
193 }
|
|
194 else{
|
|
195 $polyphen_info{$key} = [$fields[5],$fields[6]];
|
|
196 last;
|
|
197 }
|
|
198 }
|
|
199 }
|
|
200 if(not exists $polyphen_info{$key}){
|
|
201 $polyphen_info{$key} = ["NA", "NA"];
|
|
202 }
|
|
203 }
|
|
204 else{
|
|
205 $polyphen_info{$key} = ["NA", "NA"];
|
|
206 }
|
|
207 return @{$polyphen_info{$key}};
|
|
208 }
|
|
209
|
|
210 sub gerp_info($$){
|
|
211 my ($chr,$pos) = @_;
|
|
212
|
|
213 if(exists $chr2gerp_vcf_lines{$chr}->{$pos}){
|
|
214 for(@{$chr2gerp_vcf_lines{$chr}->{$pos}}){
|
|
215 my @fields = split /\t/, $_;
|
|
216 return $fields[3];
|
|
217 }
|
|
218 }
|
|
219 return "NA";
|
|
220 }
|
|
221
|
|
222 sub predict_splicing{
|
|
223 my ($chr, $pos, $ref, $var, $strand, $exon_edge_distance, $splice_type) = @_;
|
|
224
|
|
225 my $disruption_level = 0;
|
|
226 my @disruption_details;
|
|
227 # The methods being used only look for up to a 30 base context, so only
|
|
228 # need to check for obliterated acceptors/donors if they are nearby
|
|
229 $exon_edge_distance-- if $exon_edge_distance > 0; # +1 is actually in the right spot, so decrement
|
|
230 my $edge_offset = $strand eq "-" ? -1*$exon_edge_distance : $exon_edge_distance; # flip offset sign if the call was on the negative strand (since +/- was relative to the transcript, not the genome coordinates)
|
|
231 my $annotated_detected = 0;
|
|
232 my ($genesplicer_orig, $genesplicer_orig_pos) = call_genesplicer($splice_type, $chr, $strand, $pos-$edge_offset, $ref);
|
|
233 if($genesplicer_orig != 0){
|
|
234 $annotated_detected = 1;
|
|
235 }
|
|
236 my ($maxentscan_orig, $maxentscan_orig_pos) = call_maxentscan($splice_type, $chr, $strand, $pos-$edge_offset, $ref);
|
|
237 if(not $annotated_detected and $maxentscan_orig){
|
|
238 $annotated_detected = 1;
|
|
239 }
|
|
240 my ($genesplicer_mod, $genesplicer_mod_pos) = call_genesplicer($splice_type, $chr, $strand, $pos-$edge_offset, $ref, $var, $exon_edge_distance);
|
|
241 if($genesplicer_mod != $UNAFFECTED){ # was the mod within the range where this software could detect up or downstream effects on the annotated splice site?
|
|
242 if($genesplicer_orig_pos != $genesplicer_mod_pos){
|
|
243 $disruption_level+=0.75;
|
|
244 if($genesplicer_mod_pos == -1){
|
|
245 push @disruption_details, "Variant obliterates GeneSplicer predicted splice site from the reference sequence ($chr:$genesplicer_orig_pos, orig score $genesplicer_orig)";
|
|
246 }
|
|
247 elsif($genesplicer_orig_pos != -1){
|
|
248 push @disruption_details, "Variant moves GeneSplicer preferred splice site from annotated location $chr:$genesplicer_orig_pos to $chr:$genesplicer_mod_pos (scores: orig=$genesplicer_orig, new=$genesplicer_mod)";
|
|
249 }
|
|
250 }
|
|
251 elsif($genesplicer_orig-$genesplicer_mod > 1){
|
|
252 $disruption_level+=0.5;
|
|
253 push @disruption_details, "Variant significantly lowers GeneSplicer score for original $splice_type splice site located at $chr:$genesplicer_mod_pos ($genesplicer_mod vs. $genesplicer_orig)";
|
|
254 }
|
|
255 elsif($genesplicer_orig > $genesplicer_mod){
|
|
256 $disruption_level+=0.25;
|
|
257 push @disruption_details, "Variant slightly lowers GeneSplicer score for original $splice_type splice site located at $chr:$genesplicer_mod_pos ($genesplicer_mod vs. $genesplicer_orig)";
|
|
258 }
|
|
259 $genesplicer_orig = $genesplicer_mod; # so that splice site comparison vs. cryptic is accurate
|
|
260 }
|
|
261 my ($maxentscan_mod, $maxentscan_mod_pos) = call_maxentscan($splice_type, $chr, $strand, $pos-$edge_offset, $ref, $var, $exon_edge_distance);
|
|
262 if($maxentscan_mod != $UNAFFECTED){
|
|
263 if($maxentscan_mod_pos != $maxentscan_orig_pos){
|
|
264 $disruption_level+=1;
|
|
265 if($maxentscan_mod_pos == -1){
|
|
266 push @disruption_details, "Variant obliterates MaxEntScan predicted splice site from the reference sequence ($chr:$maxentscan_orig_pos, orig score $maxentscan_orig)";
|
|
267 }
|
|
268 elsif($maxentscan_orig_pos != -1){
|
|
269 push @disruption_details, "Variant moves MaxEntScan preferred splice site from annotated location $chr:$maxentscan_orig_pos to $chr:$maxentscan_mod_pos (scores: orig=$maxentscan_orig, new=$maxentscan_mod)";
|
|
270 }
|
|
271 }
|
|
272 elsif($maxentscan_orig-$maxentscan_mod > 1){
|
|
273 $disruption_level+=0.5;
|
|
274 push @disruption_details, "Variant significantly lowers MaxEntScan score for original $splice_type splice site located at $chr:$maxentscan_mod_pos ($maxentscan_mod vs. $maxentscan_orig)";
|
|
275 }
|
|
276 elsif($maxentscan_orig>$maxentscan_mod){
|
|
277 $disruption_level+=0.25;
|
|
278 push @disruption_details, "Variant slightly lowers MaxEntScan score for original $splice_type splice site located at $chr:$maxentscan_mod_pos ($maxentscan_mod vs. $maxentscan_orig)";
|
|
279 }
|
|
280 $maxentscan_orig = $maxentscan_mod;
|
|
281 }
|
|
282
|
|
283 # Regardless of location, see if there is an increased chance of creating a cryptic splicing site
|
|
284 my ($genesplicer_control, $genesplicer_control_pos) = call_genesplicer($splice_type, $chr, $strand, $pos, $ref);
|
|
285 my ($genesplicer_cryptic, $genesplicer_cryptic_pos) = call_genesplicer($splice_type, $chr, $strand, $pos, $ref, $var);
|
|
286 if(defined $genesplicer_orig and $genesplicer_control < $genesplicer_orig and $genesplicer_orig < $genesplicer_cryptic){
|
|
287 $disruption_level+=0.75;
|
|
288 push @disruption_details, "Variant causes GeneSplicer score for a potential cryptic $splice_type splice site at $chr:$genesplicer_cryptic_pos to become higher than the annotated splice site $chr:$genesplicer_orig_pos (orig=$genesplicer_orig vs. var=$genesplicer_cryptic)";
|
|
289 }
|
|
290 if($genesplicer_control - $genesplicer_cryptic < -1){
|
|
291 $disruption_level+=0.5;
|
|
292 push @disruption_details, "Variant raises GeneSplicer score for a potential cryptic $splice_type splice site (orig=$genesplicer_control vs. var=$genesplicer_cryptic)";
|
|
293 }
|
|
294 my ($maxentscan_control, $maxentscan_control_pos) = call_maxentscan($splice_type, $chr, $strand, $pos, $ref);
|
|
295 my ($maxentscan_cryptic, $maxentscan_cryptic_pos) = call_maxentscan($splice_type, $chr, $strand, $pos, $ref, $var);
|
|
296 if(defined $maxentscan_orig and $maxentscan_control < $maxentscan_orig and $maxentscan_orig < $maxentscan_cryptic){
|
|
297 $disruption_level++;
|
|
298 push @disruption_details, "Variant causes MaxEntScan score for a potential cryptic $splice_type splice site at $chr:$maxentscan_cryptic_pos to become higher than the annotated splice site $chr:$maxentscan_orig_pos (orig=$maxentscan_orig vs. var=$maxentscan_cryptic)";
|
|
299 }
|
|
300 if($maxentscan_control - $maxentscan_cryptic < -1 and $maxentscan_cryptic > 0){
|
|
301 $disruption_level+=0.5;
|
|
302 push @disruption_details, "Variant raises MaxEntScan score for a potential cryptic $splice_type splice site (orig=$maxentscan_control vs. var=$maxentscan_cryptic)";
|
|
303 }
|
|
304
|
|
305 # If neither method predicted the annotated donor site, and the alternate site isn't predicted either, note that so the user doesn't think the mod is definite okay
|
|
306 if(not @disruption_details and $maxentscan_orig < 0 and $genesplicer_orig < 0){
|
|
307 $disruption_level+=0.5;
|
|
308 push @disruption_details, "Neither the annotated splice site nor any potential cryptic site introduced by the variant are predicted by either GeneSplicer or MaxEntScan, therefore the alt splicing potential for this variant should be evaluated manually";
|
|
309 }
|
|
310
|
|
311 return ($disruption_level, join("; ", @disruption_details));
|
|
312 }
|
|
313
|
|
314 sub call_genesplicer{
|
|
315 my($splice_type, $chr, $strand, $pos, $ref, $var, $var_offset) = @_;
|
|
316
|
|
317 if($splice_type eq "acceptor"){
|
|
318 $pos += $strand eq "-" ? 2: -2;
|
|
319 }
|
|
320 my $num_flanking = 110; # seems to croak if less than 160bp provided
|
|
321 my $indel_size = defined $var ? length($var) - length($ref) : 0; # - is del, + is ins
|
|
322 my $site_window = defined $var_offset ? 2 : ($indel_size > 0 ? 29+$indel_size : 29); # exact position if looking for exisitng site effect, otherwise 10 or 10 + the insert size
|
|
323 my $cmd = "genesplicer";
|
|
324 my $start = $pos - $num_flanking;
|
|
325 $start = 1 if $start < 1;
|
|
326 my $stop = $pos + $num_flanking + ($indel_size < 0 ? -1*$indel_size:0); # add extra bases if a del so we still have 2*$num_flanking after the edit
|
|
327 $stop++; # end coordinate is exclusive
|
|
328 return ($UNAFFECTED, -1) if defined $var_offset and ($pos+$var_offset < $start or $pos+$var_offset > $stop); # variant is outside of range we can detect affect on original splice site
|
|
329 my $seq = $fasta_index->fetch("$chr:$start-$stop");
|
|
330 if(defined $var){
|
|
331 if(defined $var_offset){ # substitution away from the site of interest
|
|
332 substr($seq, $num_flanking+$var_offset, length($ref)) = $var;
|
|
333 }
|
|
334 else{ # substitution at the site of interest
|
|
335 substr($seq, $num_flanking, length($ref)) = $var;
|
|
336 }
|
|
337 }
|
|
338 if($strand eq "-"){
|
|
339 $seq = reverse($seq);
|
|
340 $seq =~ tr/ACGTacgt/TGCAtgca/;
|
|
341 }
|
|
342 $tmpfile = "$$.fasta";
|
|
343 open(TMP, ">$tmpfile") or die "Could not open $tmpfile for writing: $!\n";
|
|
344 print TMP ">$chr $strand $pos $ref",($var?" $var":""), ($var_offset?" $var_offset":""),"\n$seq\n";
|
|
345 substr($seq, $num_flanking, 2) = lc(substr($seq, $num_flanking, 2));
|
|
346 #print STDERR "$chr $strand $pos $ref $var $var_offset: $seq\n";
|
|
347 close(TMP);
|
|
348 #print STDERR "$cmd $tmpfile /export/achri_data/programs/GeneSplicer/human\n";
|
|
349 #<STDIN>;
|
|
350 open(GS, "$cmd $tmpfile human 2> /dev/null|")
|
|
351 or die "Could not run $cmd: $!\n";
|
|
352 my $highest_score = 0;
|
|
353 my $highest_position = -1;
|
|
354 while(<GS>){
|
|
355 # End5 End3 Score "confidence" splice_site_type
|
|
356 # E.g. :
|
|
357 # 202 203 2.994010 Medium donor
|
|
358 chomp;
|
|
359 my @F = split /\s+/, $_;
|
|
360 next if $F[1] < $F[0]; # wrong strand
|
|
361 if($splice_type eq $F[4]){
|
|
362 if(abs($F[0]-$num_flanking-1) <= $site_window){ # right type and approximate location
|
|
363 #print STDERR "*$_\n";
|
|
364 if($F[2] > $highest_score){
|
|
365 $highest_score = $F[2];
|
|
366 # last bit accounts for indels in the position offset
|
|
367 $highest_position = $pos + ($strand eq "-" ? -1: 1)*(-$num_flanking + $F[0] - 1) + (($F[0] <= $num_flanking && $strand eq "+" or $F[0] >= $num_flanking && $strand eq "-") ? -$indel_size : 0);
|
|
368 }
|
|
369 }
|
|
370 }
|
|
371 #else{print STDERR $_,"\n";}
|
|
372 }
|
|
373 close(GS);
|
|
374 return ($highest_score, $highest_position);
|
|
375 }
|
|
376
|
|
377 sub call_maxentscan{
|
|
378 my($splice_type, $chr, $strand, $pos, $ref, $var, $var_offset) = @_;
|
|
379
|
|
380 my $indel_size = defined $var ? length($var) - length($ref) : 0; # - is del, + is ins
|
|
381 my $cmd;
|
|
382 my ($num_flanking5, $num_flanking3);
|
|
383 my ($region5, $region3);
|
|
384 my ($start, $stop);
|
|
385 my $highest_score = -9999;
|
|
386 my $highest_position = -1;
|
|
387 # note that donor and acceptor are very different beasts for MaxEntScan
|
|
388 if($splice_type eq "acceptor"){
|
|
389 if($strand eq "-"){
|
|
390 $pos += 2;
|
|
391 }
|
|
392 else{
|
|
393 $pos -= 2;
|
|
394 }
|
|
395 $cmd = "score3.pl";
|
|
396 $num_flanking5 = 18;
|
|
397 $num_flanking3 = 4;
|
|
398 }
|
|
399 else{ # donor
|
|
400 $cmd = "score5.pl";
|
|
401 $num_flanking5 = 3;
|
|
402 $num_flanking3 = 5;
|
|
403 }
|
|
404 # when var_offset is defined, we are looking only at the original splice site so no need to look in the general region
|
|
405 $region5 = defined $var_offset ? 0 : $num_flanking5; # when var_offset is defined, we are looking only at the original splice site so no need to look in the general region
|
|
406 $region3 = defined $var_offset ? 0 : $num_flanking3;
|
|
407 if($strand eq "-"){
|
|
408 $start = $pos - $num_flanking3 - $region3;
|
|
409 $stop = $pos + $num_flanking5 + $region5 + ($indel_size < 0 ? -1*$indel_size:0);
|
|
410 #print STDERR "Fetching $chr:$start-$stop ($pos - $num_flanking3 - $region3)\n";
|
|
411 }
|
|
412 else{
|
|
413 $start = $pos - $num_flanking5 - $region5;
|
|
414 $stop = $pos + $num_flanking3 + $region3 + ($indel_size < 0 ? -1*$indel_size:0); # add extra bases if a del so we still have 23 or 9 bases as the case may be
|
|
415 #print STDERR "Fetching $chr:$start-$stop ($pos - $num_flanking5 - $region5)\n";
|
|
416 }
|
|
417 if(defined $var_offset and (
|
|
418 $strand eq "+" and ($pos+$var_offset < $start or $pos+$var_offset > $stop) or
|
|
419 $strand eq "-" and ($pos-$var_offset < $start or $pos-$var_offset > $stop))){ # variant is outside of range we can detect affect on original splice site
|
|
420 return ($UNAFFECTED, -1)
|
|
421 }
|
|
422
|
|
423 my $seq = $fasta_index->fetch("$chr:$start-$stop");
|
|
424 my @subseqs;
|
|
425 if(defined $var){
|
|
426 if(defined $var_offset){ # substitution away from the site of interest
|
|
427 #print STDERR "Subbing in string of length ", length($seq), "near splice site $ref -> $var at ", ($strand eq "-" ? "$region3+$num_flanking3-$var_offset":"$region5+$num_flanking5+$var_offset"), "\n";
|
|
428 substr($seq, ($strand eq "-" ? $region3+$num_flanking3-$var_offset:$region5+$num_flanking5+$var_offset), length($ref)) = $var;
|
|
429 }
|
|
430 else{ # substitution at the site of interest
|
|
431 substr($seq, ($strand eq "-" ? $region3+$num_flanking3:$region5+$num_flanking5), length($ref)) = $var;
|
|
432 }
|
|
433 }
|
|
434 if($strand eq "-"){
|
|
435 $seq = reverse($seq);
|
|
436 $seq =~ tr/ACGTacgt/TGCAtgca/;
|
|
437 }
|
|
438 # get all of the 23 or 9 base pair subsequences, depending on whether it's donors or acceptors we are looking for
|
|
439 for(my $i = 0; $i+$num_flanking5+$num_flanking3+1 <= length($seq); $i++){
|
|
440 push @subseqs, substr($seq, $i, $num_flanking5+$num_flanking3+1);
|
|
441 }
|
|
442 my $pid = open2(\*CHILD_OUT, \*CHILD_IN, "$cmd -");
|
|
443 $pid or die "Could not run $cmd: $!\n";
|
|
444 print CHILD_IN join("\n",@subseqs);
|
|
445 close(CHILD_IN);
|
|
446 while(<CHILD_OUT>){
|
|
447 chomp;
|
|
448 # out is just "seq[tab]score"
|
|
449 my @F = split /\s+/, $_;
|
|
450 if($F[1] > $highest_score){
|
|
451 #print STDERR ">";
|
|
452 $highest_score = $F[1];
|
|
453 # since we printed the subseqs in order, $. (containing the line number of the prediction output) gives us the offset in the original seq
|
|
454 $highest_position = $start + ($strand eq "-" ? length($seq) - $num_flanking5 - $. : $num_flanking5 + $. -1)+(($. <= $num_flanking5+$region5 && $strand eq "+" or $. >= $num_flanking5+$region5 && $strand eq "-") ? -$indel_size : 0);
|
|
455 }
|
|
456 #print STDERR $_,"\n";
|
|
457 }
|
|
458 close(CHILD_OUT);
|
|
459 waitpid($pid, 0);
|
|
460
|
|
461 return ($highest_score, $highest_position);
|
|
462 }
|
|
463
|
|
464 sub get_range_info($$$$){
|
|
465 my ($chr2ranges, $chr, $start, $stop) = @_;
|
|
466
|
|
467 if(not exists $chr2ranges->{$chr}){
|
|
468 return {};
|
|
469 }
|
|
470 my $range_key = "$chr:$start:$stop";
|
|
471
|
|
472 # Use a binary search to find the leftmost range of interest
|
|
473 my $left_index = 0;
|
|
474 my $right_index = $#{$chr2ranges->{$chr}};
|
|
475 my $cursor = int($right_index/2);
|
|
476 while($left_index != $right_index and $left_index != $cursor and $right_index != $cursor){
|
|
477 # need to go left
|
|
478 if($chr2ranges->{$chr}->[$cursor]->[0] >= $start){
|
|
479 $right_index = $cursor;
|
|
480 $cursor = int(($left_index+$cursor)/2);
|
|
481 }
|
|
482 # need to go to the right
|
|
483 elsif($chr2ranges->{$chr}->[$cursor]->[1] <= $stop){
|
|
484 $left_index = $cursor;
|
|
485 $cursor = int(($right_index+$cursor)/2);
|
|
486 }
|
|
487 else{
|
|
488 $right_index = $cursor;
|
|
489 $cursor--; # in the range, go left until not in the range any more
|
|
490 }
|
|
491 }
|
|
492
|
|
493 my %names;
|
|
494 for (; $cursor <= $#{$chr2ranges->{$chr}}; $cursor++){
|
|
495 my $range_ref = $chr2ranges->{$chr}->[$cursor];
|
|
496 if($range_ref->[0] > $stop){
|
|
497 last;
|
|
498 }
|
|
499 elsif($range_ref->[0] <= $stop and $range_ref->[1] >= $start){
|
|
500 $names{$range_ref->[2]}++;
|
|
501 }
|
|
502 }
|
|
503
|
|
504 #print STDERR "Names for range $chr:$start-$stop are ", join("/", keys %names), "\n";
|
|
505 return \%names;
|
|
506 }
|
|
507
|
|
508 if(not -d $sift_dir){
|
|
509 die "The specified SIFT directory $sift_dir does not exist\n";
|
|
510 }
|
|
511
|
|
512 print STDERR "Reading in OMIM morbid map...\n" unless $quiet;
|
|
513 die "Data file $morbidmap_file does not exist, aborting.\n" if not -e $morbidmap_file;
|
|
514 my %gene2morbids;
|
|
515 open(TAB, $morbidmap_file)
|
|
516 or die "Cannot open OMIM morbid map file $morbidmap_file for reading: $!\n";
|
|
517 while(<TAB>){
|
|
518 my @fields = split /\|/, $_;
|
|
519 #print STDERR "got ", ($#fields+1), " fields in $_\n";
|
|
520 #my @fields = split /\|/, $_;
|
|
521 my $morbid = $fields[0];
|
|
522 $morbid =~ s/\s*\(\d+\)$//; # get rid of trailing parenthetical number
|
|
523 my $omim_id = $fields[2];
|
|
524 for my $genename (split /, /, $fields[1]){
|
|
525 if(not exists $gene2morbids{$genename}){
|
|
526 $gene2morbids{$genename} = "OMIM:$omim_id $morbid";
|
|
527 }
|
|
528 else{
|
|
529 $gene2morbids{$genename} .= "; $morbid";
|
|
530 }
|
|
531 }
|
|
532 }
|
|
533 close(TAB);
|
|
534
|
|
535 print STDERR "Reading in interpro mappings...\n" unless $quiet;
|
|
536 die "Data file $interpro_bed_file does not exist, aborting.\n" if not -e $interpro_bed_file;
|
|
537 my %interpro_ids;
|
|
538 open(TAB, $interpro_bed_file)
|
|
539 or die "Cannot open gene name BED file $interpro_bed_file for reading: $!\n";
|
|
540 while(<TAB>){
|
|
541 chomp;
|
|
542 # format should be "chr start stop interpro desc..."
|
|
543 my @fields = split /\t/, $_;
|
|
544 my $c = $fields[0];
|
|
545 if(not exists $interpro_ids{$c}){
|
|
546 $interpro_ids{$c} = [] unless exists $interpro_ids{$c};
|
|
547 $interpro_ids{"chr".$c} = [] unless $c =~ /^chr/;
|
|
548 $interpro_ids{$1} = [] if $c =~ /^chr(\S+)/;
|
|
549 }
|
|
550 my $dataref = [@fields[1..3]];
|
|
551 push @{$interpro_ids{$c}}, $dataref;
|
|
552 push @{$interpro_ids{"chr".$c}}, $dataref unless $c =~ /^chr/;
|
|
553 push @{$interpro_ids{$1}}, $dataref if $c =~ /^chr(\S+)/;
|
|
554 }
|
|
555 close(TAB);
|
|
556 print STDERR "Sorting interpro matches per contig...\n" unless $quiet;
|
|
557 for my $chr (keys %interpro_ids){
|
|
558 $interpro_ids{$chr} = [sort {$a->[0] <=> $b->[0]} @{$interpro_ids{$chr}}];
|
|
559 }
|
|
560
|
|
561 # {uc(gene_name) => space separated info}
|
|
562 print STDERR "Reading in tissue info...\n" unless $quiet;
|
|
563 my %tissues;
|
|
564 my %tissue_desc;
|
|
565 open(TISSUE, $tissue_file)
|
|
566 or die "Cannot open $tissue_file for reading: $!\n";
|
|
567 while(<TISSUE>){
|
|
568 chomp;
|
|
569 my @fields = split /\t/, $_;
|
|
570 my $gname = uc($fields[1]);
|
|
571 $tissue_desc{$gname} = $fields[2];
|
|
572 next unless $#fields == 4;
|
|
573 my @tis = split /;\s*/, $fields[4];
|
|
574 pop @tis;
|
|
575 if(@tis < 6){
|
|
576 $tissues{$gname} = join(">", @tis);
|
|
577 }
|
|
578 elsif(@tis < 24){
|
|
579 $tissues{$gname} = join(">", @tis[0..5]).">...";
|
|
580 }
|
|
581 else{
|
|
582 $tissues{$gname} = join(">", @tis[0..(int($#tis/5))]).">...";
|
|
583 }
|
|
584 }
|
|
585 close(TISSUE);
|
|
586
|
|
587 print STDERR "Loading pathway info...\n" unless $quiet;
|
|
588 my %pathways;
|
|
589 my %pathway_ids;
|
|
590 open(PATHWAY, $pathway_file)
|
|
591 or die "Cannot open $pathway_file for reading: $!\n";
|
|
592 while(<PATHWAY>){
|
|
593 chomp;
|
|
594 my @fields = split /\t/, $_;
|
|
595 next unless @fields >= 4;
|
|
596 for my $gname (split /\s+/, $fields[1]){
|
|
597 $gname = uc($gname);
|
|
598 if(not exists $pathways{$gname}){
|
|
599 $pathway_ids{$gname} = $fields[2];
|
|
600 $pathways{$gname} = $fields[3];
|
|
601 }
|
|
602 else{
|
|
603 $pathway_ids{$gname} .= "; ".$fields[2];
|
|
604 $pathways{$gname} .= "; ".$fields[3];
|
|
605 }
|
|
606 }
|
|
607 }
|
|
608 close(PATHWAY);
|
|
609
|
|
610 print STDERR "Loading SIFT indices...\n" unless $quiet;
|
|
611 # Read in the database table list
|
|
612 my %chr_tables;
|
|
613 open(DBLIST, "$sift_dir/bins.list")
|
|
614 or die "Cannot open $sift_dir/bins.list for reading: $!\n";
|
|
615 while(<DBLIST>){
|
|
616 chomp;
|
|
617 my @fields = split /\t/, $_;
|
|
618 if(not exists $chr_tables{$fields[1]}){
|
|
619 $chr_tables{$fields[1]} = {};
|
|
620 }
|
|
621 my $chr = $fields[1];
|
|
622 $chr = "chr$chr" unless $chr =~ /^chr/;
|
|
623 $chr_tables{$chr}->{"$chr\_$fields[2]_$fields[3]"} = [$fields[2],$fields[3]];
|
|
624 }
|
|
625 close(DBLIST);
|
|
626
|
|
627 #Connect to database
|
|
628 my $db_gene = DBI->connect("dbi:SQLite:dbname=$sift_dir/Human_Supp.sqlite","","",{RaiseError=>1, AutoCommit=>1});
|
|
629
|
|
630 my $db_chr;
|
|
631 my $fr_stmt; # retrieval for frameshift
|
|
632 my $snp_stmt; # retrieval for SNP
|
|
633 my $cur_chr = "";
|
|
634 my $cur_max_pos;
|
|
635
|
|
636 if(defined $predict_cryptic_splicings){
|
|
637 if(not -e $predict_cryptic_splicings){
|
|
638 die "Reference FastA file for splicing predictions, $predict_cryptic_splicings, does not exist.\n";
|
|
639 }
|
|
640 if(not -e $predict_cryptic_splicings.".fai" and not -w dirname($predict_cryptic_splicings)){
|
|
641 die "Reference FastA file for splicing predictions, $predict_cryptic_splicings, is not indexed, and the directory is not writable.\n";
|
|
642 }
|
|
643 print STDERR "Loading reference FastA index...\n" unless $quiet;
|
|
644 $fasta_index = Bio::DB::Sam::Fai->load($predict_cryptic_splicings);
|
|
645 }
|
|
646
|
|
647 print STDERR "Annotating variants...\n" unless $quiet;
|
|
648 # Assume contigs and positions in order
|
|
649 print STDERR "Processing file $hgvs_table_file...\n" unless $quiet;
|
|
650 open(HGVS, $hgvs_table_file)
|
|
651 or die "Cannot open HGVS file $hgvs_table_file for reading: $!\n";
|
|
652 # Output file for annotated snps and frameshifts
|
|
653 my $header = <HGVS>; # header
|
|
654 chomp $header;
|
|
655 my @headers = split /\t/, $header;
|
|
656 my ($chr_column, $pos_column, $ref_column, $alt_column, $alt_cnt_column, $tot_cnt_column, $ftype_column, $dbid_column, $maf_src_column, $maf_column, $internal_freq_column,
|
|
657 $cdna_hgvs_column, $aa_hgvs_column, $transcript_column, $transcript_length_column, $strand_column, $zygosity_column, $splice_dist_column, $caveats_column, $phase_column,
|
|
658 $srcs_column, $pvalue_column, $to_column, $genename_column, $rares_column, $rares_header);
|
|
659 my %req_columns = (
|
|
660 "Chr" => \$chr_column,
|
|
661 "DNA From" => \$pos_column,
|
|
662 "DNA To" => \$to_column,
|
|
663 "Gene Name" => \$genename_column,
|
|
664 "Ref base" => \$ref_column,
|
|
665 "Obs base" => \$alt_column,
|
|
666 "Variant Reads" => \$alt_cnt_column,
|
|
667 "Total Reads" => \$tot_cnt_column,
|
|
668 "Feature type" => \$ftype_column,
|
|
669 "Variant DB ID" => \$dbid_column,
|
|
670 "Pop. freq. source" => \$maf_src_column,
|
|
671 "Pop. freq." => \$maf_column,
|
|
672 "Transcript HGVS" => \$cdna_hgvs_column,
|
|
673 "Protein HGVS" => \$aa_hgvs_column,
|
|
674 "Selected transcript" => \$transcript_column,
|
|
675 "Transcript length" => \$transcript_length_column,
|
|
676 "Strand" => \$strand_column,
|
|
677 "Zygosity" => \$zygosity_column,
|
|
678 "Closest exon junction (AA coding variants)" => \$splice_dist_column,
|
|
679 "Caveats" => \$caveats_column,
|
|
680 "Phase" => \$phase_column,
|
|
681 "P-value" => \$pvalue_column);
|
|
682 &load_header_columns(\%req_columns, \@headers);
|
|
683 for(my $i = 0; $i <= $#headers; $i++){
|
|
684 # Optional columns go here
|
|
685 if($headers[$i] eq "Sources"){
|
|
686 $srcs_column = $i;
|
|
687 }
|
|
688 elsif($headers[$i] eq "Internal pop. freq."){
|
|
689 $internal_freq_column = $i;
|
|
690 }
|
|
691 elsif($headers[$i] =~ /^Num rare variants/){
|
|
692 $rares_column = $i;
|
|
693 $rares_header = $headers[$i];
|
|
694 }
|
|
695 }
|
|
696 open(OUT, ">$outfile")
|
|
697 or die "Cannot open $outfile for writing: $!\n";
|
|
698 print OUT join("\t", "Chr", "DNA From", "DNA To", "Ref base", "Obs base", "% ref read","% variant read",
|
|
699 "Variant Reads", "Total Reads", "Feature type","Variant type", "Variant context", "Variant DB ID", "Pop. freq. source",
|
|
700 "Pop. freq."), (defined $internal_freq_column ? "\tInternal pop. freq.\t" : "\t"), join("\t", "SIFT Score", "Polyphen2 Score", "Polyphen2 call", "GERP score",
|
|
701 "Gene Name", "Transcript HGVS",
|
|
702 "Protein HGVS", "Zygosity","Closest exon junction (AA coding variants)","Splicing alteration potential","Splicing alteration details",
|
|
703 "OMIM Morbid Map","Tissue expression", "Pathways", "Pathway refs",
|
|
704 "Gene Function", "Spanning InterPro Domains", "P-value", "Caveats", "Phase",
|
|
705 "Selected transcript", "Transcript length", "Strand"),
|
|
706 (defined $rares_column ? "\t$rares_header" : ""),
|
|
707 (defined $srcs_column ? "\tSources" : ""), "\n";
|
|
708 while(<HGVS>){
|
|
709 chomp;
|
|
710 my @fields = split /\t/, $_, -1;
|
|
711 my $mode;
|
|
712 my $protein_hgvs = $fields[$aa_hgvs_column];
|
|
713 my $refSeq = $fields[$ref_column];
|
|
714 my $newBase = $fields[$alt_column];
|
|
715 my $splicing_potential = "NA";
|
|
716 my $splicing_details = "NA";
|
|
717 if(defined $predict_cryptic_splicings){
|
|
718 my $distance = $fields[$splice_dist_column]; # only for coding variants
|
|
719 my $site_type;
|
|
720 if($distance eq "NA"){
|
|
721 }
|
|
722 elsif($distance){
|
|
723 $site_type = $distance < 0 ? "donor" : "acceptor"; # exon context
|
|
724 }
|
|
725 else{
|
|
726 if($fields[$cdna_hgvs_column] =~ /\d+([\-+]\d+)/){ # get from the cDNA HGVS otherwise
|
|
727 $distance = $1;
|
|
728 $distance =~ s/^\+//;
|
|
729 $site_type = $distance > 0 ? "donor" : "acceptor"; # intron context
|
|
730 }
|
|
731 }
|
|
732 # only do splicing predictions for novel or rare variants, since these predictions are expensive
|
|
733 if($distance and $distance ne "NA" and ($fields[$maf_column] eq "NA" or $fields[$maf_column] <= 0.01)){
|
|
734 #print STDERR "Running prediction for rare variant ($fields[3], MAF=$fields[14]): $fields[5], $fields[6], $refSeq, $newBase, $fields[4], $distance, $site_type\n";
|
|
735 ($splicing_potential, $splicing_details) = &predict_splicing($fields[$chr_column], $fields[$pos_column], $refSeq, $newBase, $fields[$strand_column], $distance, $site_type);
|
|
736 }
|
|
737 }
|
|
738 my $lookupBase;
|
|
739 if($fields[$strand_column] eq "-"){
|
|
740 if($refSeq ne "NA"){
|
|
741 $refSeq = reverse($refSeq);
|
|
742 $refSeq =~ tr/ACGTacgt/TGCAtgca/;
|
|
743 }
|
|
744 if($newBase ne "NA"){
|
|
745 $newBase = reverse($newBase);
|
|
746 $newBase =~ tr/ACGTacgt/TGCAtgca/;
|
|
747 $newBase =~ s(TN/)(NA/);
|
|
748 $newBase =~ s(/TN)(/NA);
|
|
749 }
|
|
750 }
|
|
751 if($fields[$cdna_hgvs_column] =~ /^g\..*?(?:ins|delins|>)([ACGTN]+)$/){
|
|
752 $mode = "snps";
|
|
753 }
|
|
754 elsif($fields[$cdna_hgvs_column] =~ /^g\..*?del([ACGTN]+)$/){
|
|
755 $mode = "snps";
|
|
756 }
|
|
757 elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+_\d+(?:[\-+]\d+)?ins([ACGTN]+)$/ or $fields[$cdna_hgvs_column] =~ /^c\.\d+(?:[\-+]\d+)?_\d+ins([ACGTN]+)$/){
|
|
758 if(not length($1)%3){
|
|
759 $mode = "snps";
|
|
760 }
|
|
761 else{
|
|
762 $mode = "frameshifts";
|
|
763 }
|
|
764 }
|
|
765 elsif($fields[$cdna_hgvs_column] =~ /^c\.(\d+)(?:[\-+]\d+)?_(\d+)(?:[\-+]\d+)?delins([ACGTN]+)$/){
|
|
766 if(not (length($3)-$2+$1-1)%3){
|
|
767 $mode = "snps";
|
|
768 }
|
|
769 else{
|
|
770 $mode = "frameshifts";
|
|
771 }
|
|
772 }
|
|
773 elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+[+\-]\d+_\d+[+\-]\d+delins([ACGTN]+)$/ or
|
|
774 $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?_[\-*]\d+(?:[\-+]\d+)?del([ACGTN]*)$/ or
|
|
775 $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?del([ACGTN]*)$/ or
|
|
776 $fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+del([ACGTN]*)$/ or
|
|
777 $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?_[\-*]\d+(?:[\-+]\d+)?(?:ins|delins)?([ACGTN]+)$/){
|
|
778 $mode = "snps";
|
|
779 }
|
|
780 elsif($fields[$cdna_hgvs_column] =~ /^c\.([-*]?\d+)del(\S+)/){
|
|
781 my $dist = $1;
|
|
782 $dist =~ s/^\*//;
|
|
783 if(abs($dist) < 4){
|
|
784 $mode = "snps";
|
|
785 }
|
|
786 else{
|
|
787 $mode = "frameshifts";
|
|
788 }
|
|
789 }
|
|
790 elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+_\d+[\-+]\d+ins(\S*)/ or
|
|
791 $fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+_\d+[\-+]\d+del(\S*)/){
|
|
792 $mode = "snps";
|
|
793 }
|
|
794 elsif($fields[$cdna_hgvs_column] =~ /^c\.(-?\d+)_(\d+)(?:\+\d+)?del(\S+)/ or
|
|
795 $fields[$cdna_hgvs_column] =~ /^c\.(\d+)(?:\-\d+)?_(\d+)del(\S+)/){
|
|
796 if(not ($2-$1+1)%3){
|
|
797 $mode = "snps";
|
|
798 }
|
|
799 else{
|
|
800 $mode = "frameshifts";
|
|
801 }
|
|
802 }
|
|
803 elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+_\d+ins([ACGTN]+)$/){
|
|
804 if(not length($1)%3){
|
|
805 $mode = "snps";
|
|
806 }
|
|
807 else{
|
|
808 $mode = "frameshifts";
|
|
809 }
|
|
810 }
|
|
811 # special case of shifted start codon
|
|
812 elsif($fields[$cdna_hgvs_column] =~ /^c\.-1_1+ins([ACGTN]+)$/ or
|
|
813 $fields[$cdna_hgvs_column] =~ /inv$/){
|
|
814 $mode = "snps";
|
|
815 }
|
|
816 elsif($fields[$cdna_hgvs_column] =~ /[ACGTN]>([ACGTN])$/){
|
|
817 # todo: handle multiple consecutive substitutions, will have to run polyphen separately
|
|
818 $lookupBase = $newBase;
|
|
819 if($fields[$strand_column] eq "-"){ # revert to positive strand variant for SIFT index
|
|
820 $lookupBase =~ tr/ACGTacgt/TGCAtgca/;
|
|
821 }
|
|
822 $mode = "snps";
|
|
823 }
|
|
824 else{
|
|
825 $mode = "snps";
|
|
826 }
|
|
827 if($fields[$chr_column] ne $cur_chr){
|
|
828 #Prepared DB connection on per-chromosome basis
|
|
829 $cur_chr = $fields[$chr_column];
|
|
830 $cur_chr =~ s/^chr//;
|
|
831 $db_chr = DBI->connect("dbi:SQLite:dbname=$sift_dir/Human_CHR$cur_chr.sqlite",
|
|
832 "",
|
|
833 "",
|
|
834 { RaiseError => 1, AutoCommit => 1 }) unless not -e "$sift_dir/Human_CHR$cur_chr.sqlite";
|
|
835
|
|
836 $cur_max_pos = -1;
|
|
837 }
|
|
838 if($fields[$pos_column] =~ /^\d+$/ and $cur_max_pos < $fields[$pos_column]){
|
|
839 undef $fr_stmt;
|
|
840 undef $snp_stmt;
|
|
841 my $c = $fields[$chr_column];
|
|
842 $c = "chr$c" if $c !~ /^chr/;
|
|
843 for my $chr_table_key (keys %{$chr_tables{$c}}){
|
|
844 my $chr_table_range = $chr_tables{$c}->{$chr_table_key};
|
|
845 if($chr_table_range->[0] <= $fields[$pos_column] and $chr_table_range->[1] >= $fields[$pos_column]){
|
|
846 $fr_stmt = $db_chr->prepare("select SCORE".
|
|
847 " from $chr_table_key where COORD1 = ?");
|
|
848 $snp_stmt = $db_chr->prepare("select SCORE".
|
|
849 " from $chr_table_key where COORD1 = ? AND NT2 = ?");
|
|
850 $cur_max_pos = $chr_table_range->[1];
|
|
851 last;
|
|
852 }
|
|
853 }
|
|
854 if(not defined $fr_stmt and not $quiet){
|
|
855 warn "Got request for position not in range of SIFT ($c:$fields[$pos_column])\n";
|
|
856 }
|
|
857 }
|
|
858
|
|
859 my @cols;
|
|
860 if(not $fr_stmt){
|
|
861 @cols = ("NA");
|
|
862 }
|
|
863 elsif($mode eq "frameshifts"){
|
|
864 $fr_stmt->execute($fields[$pos_column]);
|
|
865 @cols = $fr_stmt->fetchrow_array();
|
|
866 }
|
|
867 else{
|
|
868 $snp_stmt->execute($fields[$pos_column]-1, $lookupBase);
|
|
869 @cols = $snp_stmt->fetchrow_array();
|
|
870 }
|
|
871 # See if the change is in a CDS, and if it's non-synonymous
|
|
872 my ($type, $location) = ("silent", "intron");
|
|
873 my $start = $fields[$pos_column];
|
|
874 my $stop = $fields[$to_column];
|
|
875 my $interpro = get_range_info(\%interpro_ids, $fields[$chr_column], $start, $stop);
|
|
876 my $domains = join("; ", sort keys %$interpro);
|
|
877
|
|
878 my $sift_score = "NA";
|
|
879 my $variant_key = "NA";
|
|
880 my $transcript_hgvs = $fields[$cdna_hgvs_column];
|
|
881 if($transcript_hgvs =~ /\.\d+[ACGTN]>/ or $transcript_hgvs =~ /\.\d+(?:_|del|ins)/ or $transcript_hgvs =~ /[+\-]\?/){
|
|
882 $location = "exon";
|
|
883 }
|
|
884 elsif($transcript_hgvs =~ /\.\d+[\-+](\d+)/){
|
|
885 if($1 < 21){
|
|
886 $type = "splice";
|
|
887 }
|
|
888 else{
|
|
889 $type = "intronic";
|
|
890 }
|
|
891 $location = "splice site";
|
|
892 }
|
|
893 elsif($transcript_hgvs =~ /\.\*\d+/){
|
|
894 $location = "3'UTR";
|
|
895 }
|
|
896 elsif($transcript_hgvs =~ /\.-\d+/){
|
|
897 $location = "5'UTR";
|
|
898 }
|
|
899 if($mode eq "frameshifts"){
|
|
900 $type = "frameshift";
|
|
901 }
|
|
902 else{
|
|
903 if(not defined $protein_hgvs){
|
|
904 $type = "unknown";
|
|
905 }
|
|
906 elsif($protein_hgvs eq "NA"){
|
|
907 $type = "non-coding";
|
|
908 }
|
|
909 elsif($protein_hgvs =~ /=$/){
|
|
910 $type = "silent";
|
|
911 }
|
|
912 elsif($protein_hgvs =~ /\d+\*/){ # nonsense
|
|
913 $type = "nonsense";
|
|
914 }
|
|
915 elsif($protein_hgvs =~ /ext/){ # nonsense
|
|
916 $type = "nonstop";
|
|
917 }
|
|
918 elsif($protein_hgvs eq "p.0?" or $protein_hgvs eq "p.?"){
|
|
919 $type = "unknown";
|
|
920 }
|
|
921 else{
|
|
922 $type = "missense";
|
|
923 }
|
|
924
|
|
925 $sift_score = $cols[0] || "NA";
|
|
926
|
|
927 # Record the need to fetch VCF polyphen and gerp data later (en masse, for efficiency)
|
|
928 # Use newBase instead of lookupBase for PolyPhen since the orientation is always relative to the transcript
|
|
929 $variant_key = get_variant_key($fields[$chr_column], $fields[$pos_column], $fields[$ref_column], $newBase);
|
|
930 #print STDERR "Variant key is $variant_key\n";
|
|
931 }
|
|
932 my $transcript_type = $fields[$ftype_column];
|
|
933 my $transcript_length = $fields[$transcript_length_column];
|
|
934 my $transcript_id = $fields[$transcript_column]; # sift score is derived from this transcript
|
|
935 my $transcript_strand = $fields[$strand_column];
|
|
936 my $zygosity = $fields[$zygosity_column];
|
|
937 my $rsID = $fields[$dbid_column];
|
|
938 my $popFreq = $fields[$maf_column];
|
|
939 my $internalFreq;
|
|
940 $internalFreq = $fields[$internal_freq_column] if defined $internal_freq_column;
|
|
941 my $popFreqSrc = $fields[$maf_src_column];
|
|
942
|
|
943 # Get the gene name and omim info
|
|
944 my %seen;
|
|
945 my @gene_names = split /; /, $fields[$genename_column];
|
|
946 my $morbid = join("; ", grep {/\S/ and not $seen{$_}++} map({$gene2morbids{$_} or ""} @gene_names));
|
|
947 my $tissue = join("; ", grep {/\S/ and not $seen{$_}++} map({$tissues{$_} or ""} @gene_names));
|
|
948 my $function = join("; ", grep {/\S/ and not $seen{$_}++} map({$tissue_desc{$_} or ""} @gene_names));
|
|
949 my $pathways = join("; ", grep {/\S/ and not $seen{$_}++} map({$pathways{$_} or ""} @gene_names));
|
|
950 my $pathway_ids = join("; ", grep {/\S/ and not $seen{$_}++} map({$pathway_ids{$_} or ""} @gene_names));
|
|
951
|
|
952 # It may be that the calls and freqs are multiple concatenated values, e.g. "X; Y"
|
|
953 my @tot_bases = split /; /,$fields[$tot_cnt_column];
|
|
954 my @var_bases = split /; /,$fields[$alt_cnt_column];
|
|
955 my (@pct_nonvar, @pct_var);
|
|
956 for (my $i = 0; $i <= $#tot_bases; $i++){
|
|
957 push @pct_nonvar, $tot_bases[$i] =~ /^(NA|0)$/ ? $tot_bases[$i] : int(($tot_bases[$i]-$var_bases[$i])/$tot_bases[$i]*100+0.5);
|
|
958 push @pct_var, $tot_bases[$i] =~ /^(NA|0)$/ ? $tot_bases[$i] : int($var_bases[$i]/$tot_bases[$i]*100+0.5);
|
|
959 }
|
|
960
|
|
961 push @lines, [
|
|
962 $fields[$chr_column], # chr
|
|
963 $fields[$pos_column], # pos
|
|
964 $fields[$to_column],
|
|
965 $fields[$ref_column],
|
|
966 $fields[$alt_column],
|
|
967 join("; ", @pct_nonvar), # pct ref
|
|
968 join("; ", @pct_var), # pct var
|
|
969 $fields[$alt_cnt_column], # num var
|
|
970 $fields[$tot_cnt_column], # num reads
|
|
971 $transcript_type, # protein_coding, pseudogene, etc.
|
|
972 $type,
|
|
973 $location,
|
|
974 $rsID,
|
|
975 $popFreqSrc,
|
|
976 (defined $internal_freq_column ? ($popFreq,$internalFreq) : ($popFreq)),
|
|
977 $sift_score,
|
|
978 $variant_key, # 16th field
|
|
979 join("; ", @gene_names),
|
|
980 $transcript_hgvs,
|
|
981 $protein_hgvs,
|
|
982 $zygosity,
|
|
983 $fields[$splice_dist_column], # distance to exon edge
|
|
984 $splicing_potential,
|
|
985 $splicing_details,
|
|
986 $morbid,
|
|
987 $tissue,
|
|
988 $pathways,
|
|
989 $pathway_ids,
|
|
990 $function,
|
|
991 $domains,
|
|
992 $fields[$pvalue_column],
|
|
993 $fields[$caveats_column], # caveats
|
|
994 (defined $rares_column ? ($fields[$rares_column],$transcript_id) : ($transcript_id)),
|
|
995 $transcript_length,
|
|
996 $transcript_strand,
|
|
997 (defined $srcs_column ? ($fields[$phase_column],$fields[$srcs_column]) : ($fields[$phase_column]))];
|
|
998 }
|
|
999
|
|
1000 print STDERR "Adding Polyphen2 and GERP info...\n" unless $quiet;
|
|
1001 # retrieve the polyphen and gerp info en masse for each chromosome, as this is much more efficient
|
|
1002 my $vkey_col = 16+(defined $internal_freq_column ? 1 : 0);
|
|
1003 for my $line_fields (@lines){
|
|
1004 # expand the key into the relevant MAF values
|
|
1005 $line_fields->[$vkey_col] = join("\t", polyphen_gerp_info($gerp_dir,$polyphen_file,$line_fields->[$vkey_col])); # fields[$vkey_col] is variant key
|
|
1006 print OUT join("\t", @{$line_fields}), "\n";
|
|
1007 }
|
|
1008 close(OUT);
|
|
1009 unlink $tmpfile if defined $tmpfile;
|
|
1010
|
|
1011 sub load_header_columns{
|
|
1012 my ($reqs_hash_ref, $headers_array_ref) = @_;
|
|
1013 my %unfulfilled;
|
|
1014 for my $field_name (keys %$reqs_hash_ref){
|
|
1015 $unfulfilled{$field_name} = 1;
|
|
1016 }
|
|
1017 for(my $i = 0; $i <= $#{$headers_array_ref}; $i++){
|
|
1018 for my $req_header_name (keys %$reqs_hash_ref){
|
|
1019 if($req_header_name eq $headers_array_ref->[$i]){
|
|
1020 ${$reqs_hash_ref->{$req_header_name}} = $i;
|
|
1021 delete $unfulfilled{$req_header_name};
|
|
1022 last;
|
|
1023 }
|
|
1024 }
|
|
1025 }
|
|
1026 if(keys %unfulfilled){
|
|
1027 die "Aborting. Could not find headers in the input file for the following required fields: ", join(", ", sort keys %unfulfilled), "\n";
|
|
1028 }
|
|
1029 }
|