annotate hgvs_table_annotate @ 0:40dd2e7ee63a default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 16:00:12 -0600
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1 #!/usr/bin/env perl
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
3 use Bio::DB::Sam;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
4 use DBI;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
5 use IPC::Open2;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
6 use Set::IntervalTree;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
7 use File::Basename;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
8 use strict;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
9 use warnings;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
10 use vars qw($quiet %chr2variant_locs %polyphen_info %chr2polyphen_vcf_lines %chr2gerp_vcf_lines %range2genes $fasta_index $UNAFFECTED $tmpfile);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
11
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
12 $UNAFFECTED = -299999; # sentinel for splice site predictions
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
13
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
14 if(@ARGV == 1 and $ARGV[0] eq "-v"){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
15 print "Version 1.0\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
16 exit;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
17 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
18
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
19 # configuration file stuff
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
20 my %config;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
21 my $dirname = dirname(__FILE__);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
22 my $tool_dir = shift @ARGV;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
23 if(not -e "$tool_dir/hgvs_annotate.loc"){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
24 system("cp $dirname/tool-data/hgvs_annotate.loc $tool_dir/hgvs_annotate.loc");
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
25 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
26 open CONFIG, '<', "$tool_dir/hgvs_annotate.loc";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
27 while(<CONFIG>){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
28 next if $_ =~ /^#/;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
29 (my $key,my $value) = split(/\s+/, $_);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
30 $config{$key} = $value;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
31 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
32 my $dbs_dir = $config{"dbs_dir"};
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
33 close CONFIG;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
34
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
35 $quiet = 0;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
36 if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
37 $quiet = 1;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
38 shift @ARGV;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
39 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
40
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
41 @ARGV == 9 or @ARGV == 10 or die "Usage: $0 -q <SIFT dir> <POLYPHEN2 file> <GERP dir> <tissuedb file> <pathway file> <interpro domains.bed> <omim morbidmap> <input hgvs table.txt> <output hgvs table.annotated.txt> [reference.fasta]\n Where if the reference fasta is provided, we predict cryptic splicing effects (using MaxEntScan and GeneSplicer)\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
42
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
43 my $sift_dir = $dbs_dir . shift @ARGV;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
44 my $polyphen_file = $dbs_dir . shift @ARGV;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
45 my $gerp_dir = $dbs_dir . shift @ARGV;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
46 my $tissue_file = $dbs_dir . shift @ARGV;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
47 my $pathway_file = $dbs_dir . shift @ARGV;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
48 my $interpro_bed_file = shift @ARGV;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
49 my $morbidmap_file = shift @ARGV;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
50 my $hgvs_table_file = shift @ARGV;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
51 my $outfile = shift @ARGV;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
52 my $predict_cryptic_splicings = @ARGV ? shift @ARGV : undef;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
53 my $flanking_bases = 500; # assume this is ROI around a gene
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
54 my @lines;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
55
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
56 sub polyphen_gerp_info($$$){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
57 my($gerp_dir,$polyphen_file,$info_key) = @_;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
58
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
59 my($chr,$pos,$ref,$variant) = split /:/, $info_key;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
60
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
61 # is this the first call for this chromosome? If so, retrieve the VCF lines for it en masse
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
62 if(not exists $chr2polyphen_vcf_lines{$chr}){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
63 ($chr2polyphen_vcf_lines{$chr}, $chr2gerp_vcf_lines{$chr}) = retrieve_vcf_lines($gerp_dir,$polyphen_file,$chr);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
64 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
65
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
66 my @results;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
67 if(not $ref or not $variant){ # can only get data for SNPs
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
68 @results = qw(NA NA NA);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
69 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
70 #print STDERR "ref = $ref and variant = $variant for $info_key\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
71 elsif(length($ref) == 1 and length($variant) == 1){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
72 #print STDERR "Grabbing poly/gerp for $chr,$pos,$ref,$variant\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
73 @results = polyphen_info($chr,$pos,$variant);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
74 $results[2] = gerp_info($chr,$pos);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
75 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
76
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
77 # It may be that a novel MNP variant is a bunch of known SNPs in a row. In this case,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
78 # report the list of variants
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
79 elsif(length($ref) == length($variant) and $ref ne "NA"){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
80 my (@poly_scores, @poly_calls, @gerp_scores);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
81 for(my $i = 0; $i < length($ref); $i++){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
82 my $refbase = substr($ref, $i, 1);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
83 my $newbase = substr($variant, $i, 1);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
84 if($newbase eq $refbase){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
85 push @poly_scores, "-";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
86 push @poly_calls, "-";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
87 push @gerp_scores, "-";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
88 next;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
89 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
90 my @base_results = polyphen_info($chr,$pos+$i,$refbase,$newbase);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
91 push @poly_scores, $base_results[0];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
92 push @poly_calls, $base_results[1];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
93 $base_results[2] = gerp_info($chr,$pos+$i);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
94 push @gerp_scores, $base_results[2];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
95 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
96 $results[0] = join(",", @poly_scores);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
97 $results[1] = join(",", @poly_calls);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
98 $results[2] = join(",", @gerp_scores);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
99 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
100 else{
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
101 @results = qw(NA NA NA);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
102 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
103
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
104 return @results;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
105 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
106
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
107 sub get_variant_key($$$$){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
108 # params chr pos ref variant
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
109 my $c = $_[0];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
110 $c =~ s/^chr//;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
111 my $prop_info_key = join(":", $c, @_[1..3]);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
112 $chr2variant_locs{$c} = [] unless exists $chr2variant_locs{$c};
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
113 push @{$chr2variant_locs{$c}}, $_[1];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
114 return $prop_info_key;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
115 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
116
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
117 sub retrieve_vcf_lines($$$){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
118 my ($gerp_dir, $polyphen_file, $chr) = @_;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
119
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
120 my (%polyphen_lines, %gerp_lines);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
121
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
122 if(not exists $chr2variant_locs{$chr}){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
123 return ({}, {}); # no data requested for this chromosome
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
124 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
125
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
126 # build up the request
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
127 my @tabix_regions;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
128 my @var_locs = grep /^\d+$/, @{$chr2variant_locs{$chr}}; # grep keeps point mutations only
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
129 if(not @var_locs){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
130 return ({}, {}); # no point mutations
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
131 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
132
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
133 # sort by variant start location
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
134 for my $var_loc (sort {$a <=> $b} @var_locs){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
135 push @tabix_regions, $chr.":".$var_loc."-".$var_loc;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
136 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
137 my $regions = join(" ", @tabix_regions);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
138
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
139 # retrieve the data
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
140 die "Cannot find Polyphen2 VCF file $polyphen_file\n" if not -e $polyphen_file;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
141 open(VCF, "tabix $polyphen_file $regions |")
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
142 or die "Cannot run tabix on $polyphen_file: $!\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
143 while(<VCF>){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
144 if(/^\S+\t(\d+)/ and grep {$_ eq $1} @var_locs){ # take only main columns to save room, if possible
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
145 chomp;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
146 $polyphen_lines{$1} = [] unless exists $polyphen_lines{$1};
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
147 push @{$polyphen_lines{$1}}, $_;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
148 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
149 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
150 close(VCF);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
151
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
152 my $vcf_file = "$gerp_dir/chr$chr.tab.gz";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
153 if(not -e $vcf_file){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
154 warn "Cannot find GERP VCF file $vcf_file\n" if not $quiet;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
155 return ({}, {});
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
156 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
157 open(VCF, "tabix $vcf_file $regions |")
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
158 or die "Cannot run tabix on $vcf_file: $!\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
159 while(<VCF>){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
160 if(/^(\S+\t(\d+)(?:\t\S+){2})/ and grep {$_ eq $2} @var_locs){ # take only main columns to save room, if possible
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
161 $gerp_lines{$2} = [] unless exists $gerp_lines{$2};
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
162 push @{$gerp_lines{$2}}, $1;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
163 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
164 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
165 close(VCF);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
166
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
167 return (\%polyphen_lines, \%gerp_lines);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
168 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
169
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
170 sub polyphen_info($$$){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
171 my ($chr, $pos, $variant_base) = @_;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
172
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
173 my $key = "$chr:$pos:$variant_base";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
174 if(exists $polyphen_info{$key}){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
175 return @{$polyphen_info{$key}};
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
176 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
177
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
178 #print STDERR "Checking for polyphen data for $chr, $pos, $variant_base\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
179 if(exists $chr2polyphen_vcf_lines{$chr}->{$pos}){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
180 for(@{$chr2polyphen_vcf_lines{$chr}->{$pos}}){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
181 #print STDERR "Checking $chr, $pos, $variant_base against $_";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
182 my @fields = split /\t/, $_;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
183 if($fields[4] eq $variant_base){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
184 if($fields[6] eq "D"){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
185 $polyphen_info{$key} = [$fields[5],"deleterious"];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
186 last;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
187 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
188 elsif($fields[6] eq "P"){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
189 $polyphen_info{$key} = [$fields[5],"poss. del."];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
190 last;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
191 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
192 elsif($fields[6] eq "B"){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
193 $polyphen_info{$key} = [$fields[5],"benign"];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
194 last;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
195 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
196 else{
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
197 $polyphen_info{$key} = [$fields[5],$fields[6]];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
198 last;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
199 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
200 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
201 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
202 if(not exists $polyphen_info{$key}){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
203 $polyphen_info{$key} = ["NA", "NA"];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
204 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
205 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
206 else{
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
207 $polyphen_info{$key} = ["NA", "NA"];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
208 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
209 return @{$polyphen_info{$key}};
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
210 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
211
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
212 sub gerp_info($$){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
213 my ($chr,$pos) = @_;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
214
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
215 if(exists $chr2gerp_vcf_lines{$chr}->{$pos}){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
216 for(@{$chr2gerp_vcf_lines{$chr}->{$pos}}){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
217 my @fields = split /\t/, $_;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
218 return $fields[3];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
219 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
220 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
221 return "NA";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
222 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
223
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
224 sub predict_splicing{
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
225 my ($chr, $pos, $ref, $var, $strand, $exon_edge_distance, $splice_type) = @_;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
226
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
227 my $disruption_level = 0;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
228 my @disruption_details;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
229 # The methods being used only look for up to a 30 base context, so only
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
230 # need to check for obliterated acceptors/donors if they are nearby
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
231 $exon_edge_distance-- if $exon_edge_distance > 0; # +1 is actually in the right spot, so decrement
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
232 my $edge_offset = $strand eq "-" ? -1*$exon_edge_distance : $exon_edge_distance; # flip offset sign if the call was on the negative strand (since +/- was relative to the transcript, not the genome coordinates)
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
233 my $annotated_detected = 0;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
234 my ($genesplicer_orig, $genesplicer_orig_pos) = call_genesplicer($splice_type, $chr, $strand, $pos-$edge_offset, $ref);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
235 if($genesplicer_orig != 0){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
236 $annotated_detected = 1;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
237 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
238 my ($maxentscan_orig, $maxentscan_orig_pos) = call_maxentscan($splice_type, $chr, $strand, $pos-$edge_offset, $ref);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
239 if(not $annotated_detected and $maxentscan_orig){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
240 $annotated_detected = 1;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
241 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
242 my ($genesplicer_mod, $genesplicer_mod_pos) = call_genesplicer($splice_type, $chr, $strand, $pos-$edge_offset, $ref, $var, $exon_edge_distance);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
243 if($genesplicer_mod != $UNAFFECTED){ # was the mod within the range where this software could detect up or downstream effects on the annotated splice site?
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
244 if($genesplicer_orig_pos != $genesplicer_mod_pos){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
245 $disruption_level+=0.75;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
246 if($genesplicer_mod_pos == -1){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
247 push @disruption_details, "Variant obliterates GeneSplicer predicted splice site from the reference sequence ($chr:$genesplicer_orig_pos, orig score $genesplicer_orig)";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
248 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
249 elsif($genesplicer_orig_pos != -1){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
250 push @disruption_details, "Variant moves GeneSplicer preferred splice site from annotated location $chr:$genesplicer_orig_pos to $chr:$genesplicer_mod_pos (scores: orig=$genesplicer_orig, new=$genesplicer_mod)";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
251 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
252 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
253 elsif($genesplicer_orig-$genesplicer_mod > 1){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
254 $disruption_level+=0.5;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
255 push @disruption_details, "Variant significantly lowers GeneSplicer score for original $splice_type splice site located at $chr:$genesplicer_mod_pos ($genesplicer_mod vs. $genesplicer_orig)";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
256 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
257 elsif($genesplicer_orig > $genesplicer_mod){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
258 $disruption_level+=0.25;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
259 push @disruption_details, "Variant slightly lowers GeneSplicer score for original $splice_type splice site located at $chr:$genesplicer_mod_pos ($genesplicer_mod vs. $genesplicer_orig)";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
260 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
261 $genesplicer_orig = $genesplicer_mod; # so that splice site comparison vs. cryptic is accurate
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
262 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
263 my ($maxentscan_mod, $maxentscan_mod_pos) = call_maxentscan($splice_type, $chr, $strand, $pos-$edge_offset, $ref, $var, $exon_edge_distance);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
264 if($maxentscan_mod != $UNAFFECTED){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
265 if($maxentscan_mod_pos != $maxentscan_orig_pos){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
266 $disruption_level+=1;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
267 if($maxentscan_mod_pos == -1){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
268 push @disruption_details, "Variant obliterates MaxEntScan predicted splice site from the reference sequence ($chr:$maxentscan_orig_pos, orig score $maxentscan_orig)";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
269 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
270 elsif($maxentscan_orig_pos != -1){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
271 push @disruption_details, "Variant moves MaxEntScan preferred splice site from annotated location $chr:$maxentscan_orig_pos to $chr:$maxentscan_mod_pos (scores: orig=$maxentscan_orig, new=$maxentscan_mod)";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
272 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
273 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
274 elsif($maxentscan_orig-$maxentscan_mod > 1){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
275 $disruption_level+=0.5;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
276 push @disruption_details, "Variant significantly lowers MaxEntScan score for original $splice_type splice site located at $chr:$maxentscan_mod_pos ($maxentscan_mod vs. $maxentscan_orig)";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
277 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
278 elsif($maxentscan_orig>$maxentscan_mod){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
279 $disruption_level+=0.25;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
280 push @disruption_details, "Variant slightly lowers MaxEntScan score for original $splice_type splice site located at $chr:$maxentscan_mod_pos ($maxentscan_mod vs. $maxentscan_orig)";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
281 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
282 $maxentscan_orig = $maxentscan_mod;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
283 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
284
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
285 # Regardless of location, see if there is an increased chance of creating a cryptic splicing site
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
286 my ($genesplicer_control, $genesplicer_control_pos) = call_genesplicer($splice_type, $chr, $strand, $pos, $ref);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
287 my ($genesplicer_cryptic, $genesplicer_cryptic_pos) = call_genesplicer($splice_type, $chr, $strand, $pos, $ref, $var);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
288 if(defined $genesplicer_orig and $genesplicer_control < $genesplicer_orig and $genesplicer_orig < $genesplicer_cryptic){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
289 $disruption_level+=0.75;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
290 push @disruption_details, "Variant causes GeneSplicer score for a potential cryptic $splice_type splice site at $chr:$genesplicer_cryptic_pos to become higher than the annotated splice site $chr:$genesplicer_orig_pos (orig=$genesplicer_orig vs. var=$genesplicer_cryptic)";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
291 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
292 if($genesplicer_control - $genesplicer_cryptic < -1){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
293 $disruption_level+=0.5;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
294 push @disruption_details, "Variant raises GeneSplicer score for a potential cryptic $splice_type splice site (orig=$genesplicer_control vs. var=$genesplicer_cryptic)";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
295 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
296 my ($maxentscan_control, $maxentscan_control_pos) = call_maxentscan($splice_type, $chr, $strand, $pos, $ref);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
297 my ($maxentscan_cryptic, $maxentscan_cryptic_pos) = call_maxentscan($splice_type, $chr, $strand, $pos, $ref, $var);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
298 if(defined $maxentscan_orig and $maxentscan_control < $maxentscan_orig and $maxentscan_orig < $maxentscan_cryptic){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
299 $disruption_level++;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
300 push @disruption_details, "Variant causes MaxEntScan score for a potential cryptic $splice_type splice site at $chr:$maxentscan_cryptic_pos to become higher than the annotated splice site $chr:$maxentscan_orig_pos (orig=$maxentscan_orig vs. var=$maxentscan_cryptic)";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
301 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
302 if($maxentscan_control - $maxentscan_cryptic < -1 and $maxentscan_cryptic > 0){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
303 $disruption_level+=0.5;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
304 push @disruption_details, "Variant raises MaxEntScan score for a potential cryptic $splice_type splice site (orig=$maxentscan_control vs. var=$maxentscan_cryptic)";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
305 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
306
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
307 # If neither method predicted the annotated donor site, and the alternate site isn't predicted either, note that so the user doesn't think the mod is definite okay
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
308 if(not @disruption_details and $maxentscan_orig < 0 and $genesplicer_orig < 0){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
309 $disruption_level+=0.5;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
310 push @disruption_details, "Neither the annotated splice site nor any potential cryptic site introduced by the variant are predicted by either GeneSplicer or MaxEntScan, therefore the alt splicing potential for this variant should be evaluated manually";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
311 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
312
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
313 return ($disruption_level, join("; ", @disruption_details));
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
314 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
315
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
316 sub call_genesplicer{
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
317 my($splice_type, $chr, $strand, $pos, $ref, $var, $var_offset) = @_;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
318
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
319 if($splice_type eq "acceptor"){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
320 $pos += $strand eq "-" ? 2: -2;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
321 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
322 my $num_flanking = 110; # seems to croak if less than 160bp provided
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
323 my $indel_size = defined $var ? length($var) - length($ref) : 0; # - is del, + is ins
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
324 my $site_window = defined $var_offset ? 2 : ($indel_size > 0 ? 29+$indel_size : 29); # exact position if looking for exisitng site effect, otherwise 10 or 10 + the insert size
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
325 my $cmd = "genesplicer";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
326 my $start = $pos - $num_flanking;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
327 $start = 1 if $start < 1;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
328 my $stop = $pos + $num_flanking + ($indel_size < 0 ? -1*$indel_size:0); # add extra bases if a del so we still have 2*$num_flanking after the edit
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
329 $stop++; # end coordinate is exclusive
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
330 return ($UNAFFECTED, -1) if defined $var_offset and ($pos+$var_offset < $start or $pos+$var_offset > $stop); # variant is outside of range we can detect affect on original splice site
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
331 my $seq = $fasta_index->fetch("$chr:$start-$stop");
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
332 if(defined $var){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
333 if(defined $var_offset){ # substitution away from the site of interest
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
334 substr($seq, $num_flanking+$var_offset, length($ref)) = $var;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
335 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
336 else{ # substitution at the site of interest
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
337 substr($seq, $num_flanking, length($ref)) = $var;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
338 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
339 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
340 if($strand eq "-"){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
341 $seq = reverse($seq);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
342 $seq =~ tr/ACGTacgt/TGCAtgca/;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
343 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
344 $tmpfile = "$$.fasta";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
345 open(TMP, ">$tmpfile") or die "Could not open $tmpfile for writing: $!\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
346 print TMP ">$chr $strand $pos $ref",($var?" $var":""), ($var_offset?" $var_offset":""),"\n$seq\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
347 substr($seq, $num_flanking, 2) = lc(substr($seq, $num_flanking, 2));
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
348 #print STDERR "$chr $strand $pos $ref $var $var_offset: $seq\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
349 close(TMP);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
350 #print STDERR "$cmd $tmpfile /export/achri_data/programs/GeneSplicer/human\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
351 #<STDIN>;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
352 open(GS, "$cmd $tmpfile human 2> /dev/null|")
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
353 or die "Could not run $cmd: $!\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
354 my $highest_score = 0;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
355 my $highest_position = -1;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
356 while(<GS>){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
357 # End5 End3 Score "confidence" splice_site_type
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
358 # E.g. :
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
359 # 202 203 2.994010 Medium donor
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
360 chomp;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
361 my @F = split /\s+/, $_;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
362 next if $F[1] < $F[0]; # wrong strand
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
363 if($splice_type eq $F[4]){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
364 if(abs($F[0]-$num_flanking-1) <= $site_window){ # right type and approximate location
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
365 #print STDERR "*$_\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
366 if($F[2] > $highest_score){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
367 $highest_score = $F[2];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
368 # last bit accounts for indels in the position offset
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
369 $highest_position = $pos + ($strand eq "-" ? -1: 1)*(-$num_flanking + $F[0] - 1) + (($F[0] <= $num_flanking && $strand eq "+" or $F[0] >= $num_flanking && $strand eq "-") ? -$indel_size : 0);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
370 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
371 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
372 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
373 #else{print STDERR $_,"\n";}
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
374 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
375 close(GS);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
376 return ($highest_score, $highest_position);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
377 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
378
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
379 sub call_maxentscan{
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
380 my($splice_type, $chr, $strand, $pos, $ref, $var, $var_offset) = @_;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
381
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
382 my $indel_size = defined $var ? length($var) - length($ref) : 0; # - is del, + is ins
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
383 my $cmd;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
384 my ($num_flanking5, $num_flanking3);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
385 my ($region5, $region3);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
386 my ($start, $stop);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
387 my $highest_score = -9999;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
388 my $highest_position = -1;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
389 # note that donor and acceptor are very different beasts for MaxEntScan
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
390 if($splice_type eq "acceptor"){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
391 if($strand eq "-"){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
392 $pos += 2;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
393 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
394 else{
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
395 $pos -= 2;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
396 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
397 $cmd = "score3.pl";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
398 $num_flanking5 = 18;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
399 $num_flanking3 = 4;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
400 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
401 else{ # donor
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
402 $cmd = "score5.pl";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
403 $num_flanking5 = 3;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
404 $num_flanking3 = 5;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
405 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
406 # when var_offset is defined, we are looking only at the original splice site so no need to look in the general region
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
407 $region5 = defined $var_offset ? 0 : $num_flanking5; # when var_offset is defined, we are looking only at the original splice site so no need to look in the general region
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
408 $region3 = defined $var_offset ? 0 : $num_flanking3;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
409 if($strand eq "-"){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
410 $start = $pos - $num_flanking3 - $region3;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
411 $stop = $pos + $num_flanking5 + $region5 + ($indel_size < 0 ? -1*$indel_size:0);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
412 #print STDERR "Fetching $chr:$start-$stop ($pos - $num_flanking3 - $region3)\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
413 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
414 else{
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
415 $start = $pos - $num_flanking5 - $region5;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
416 $stop = $pos + $num_flanking3 + $region3 + ($indel_size < 0 ? -1*$indel_size:0); # add extra bases if a del so we still have 23 or 9 bases as the case may be
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
417 #print STDERR "Fetching $chr:$start-$stop ($pos - $num_flanking5 - $region5)\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
418 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
419 if(defined $var_offset and (
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
420 $strand eq "+" and ($pos+$var_offset < $start or $pos+$var_offset > $stop) or
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
421 $strand eq "-" and ($pos-$var_offset < $start or $pos-$var_offset > $stop))){ # variant is outside of range we can detect affect on original splice site
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
422 return ($UNAFFECTED, -1)
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
423 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
424
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
425 my $seq = $fasta_index->fetch("$chr:$start-$stop");
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
426 my @subseqs;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
427 if(defined $var){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
428 if(defined $var_offset){ # substitution away from the site of interest
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
429 #print STDERR "Subbing in string of length ", length($seq), "near splice site $ref -> $var at ", ($strand eq "-" ? "$region3+$num_flanking3-$var_offset":"$region5+$num_flanking5+$var_offset"), "\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
430 substr($seq, ($strand eq "-" ? $region3+$num_flanking3-$var_offset:$region5+$num_flanking5+$var_offset), length($ref)) = $var;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
431 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
432 else{ # substitution at the site of interest
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
433 substr($seq, ($strand eq "-" ? $region3+$num_flanking3:$region5+$num_flanking5), length($ref)) = $var;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
434 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
435 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
436 if($strand eq "-"){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
437 $seq = reverse($seq);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
438 $seq =~ tr/ACGTacgt/TGCAtgca/;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
439 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
440 # get all of the 23 or 9 base pair subsequences, depending on whether it's donors or acceptors we are looking for
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
441 for(my $i = 0; $i+$num_flanking5+$num_flanking3+1 <= length($seq); $i++){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
442 push @subseqs, substr($seq, $i, $num_flanking5+$num_flanking3+1);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
443 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
444 my $pid = open2(\*CHILD_OUT, \*CHILD_IN, "$cmd -");
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
445 $pid or die "Could not run $cmd: $!\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
446 print CHILD_IN join("\n",@subseqs);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
447 close(CHILD_IN);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
448 while(<CHILD_OUT>){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
449 chomp;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
450 # out is just "seq[tab]score"
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
451 my @F = split /\s+/, $_;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
452 if($F[1] > $highest_score){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
453 #print STDERR ">";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
454 $highest_score = $F[1];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
455 # since we printed the subseqs in order, $. (containing the line number of the prediction output) gives us the offset in the original seq
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
456 $highest_position = $start + ($strand eq "-" ? length($seq) - $num_flanking5 - $. : $num_flanking5 + $. -1)+(($. <= $num_flanking5+$region5 && $strand eq "+" or $. >= $num_flanking5+$region5 && $strand eq "-") ? -$indel_size : 0);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
457 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
458 #print STDERR $_,"\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
459 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
460 close(CHILD_OUT);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
461 waitpid($pid, 0);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
462
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
463 return ($highest_score, $highest_position);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
464 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
465
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
466 sub get_range_info($$$$){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
467 my ($chr2ranges, $chr, $start, $stop) = @_;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
468
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
469 if(not exists $chr2ranges->{$chr}){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
470 return {};
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
471 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
472 my $range_key = "$chr:$start:$stop";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
473
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
474 # Use a binary search to find the leftmost range of interest
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
475 my $left_index = 0;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
476 my $right_index = $#{$chr2ranges->{$chr}};
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
477 my $cursor = int($right_index/2);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
478 while($left_index != $right_index and $left_index != $cursor and $right_index != $cursor){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
479 # need to go left
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
480 if($chr2ranges->{$chr}->[$cursor]->[0] >= $start){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
481 $right_index = $cursor;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
482 $cursor = int(($left_index+$cursor)/2);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
483 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
484 # need to go to the right
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
485 elsif($chr2ranges->{$chr}->[$cursor]->[1] <= $stop){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
486 $left_index = $cursor;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
487 $cursor = int(($right_index+$cursor)/2);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
488 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
489 else{
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
490 $right_index = $cursor;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
491 $cursor--; # in the range, go left until not in the range any more
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
492 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
493 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
494
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
495 my %names;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
496 for (; $cursor <= $#{$chr2ranges->{$chr}}; $cursor++){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
497 my $range_ref = $chr2ranges->{$chr}->[$cursor];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
498 if($range_ref->[0] > $stop){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
499 last;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
500 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
501 elsif($range_ref->[0] <= $stop and $range_ref->[1] >= $start){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
502 $names{$range_ref->[2]}++;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
503 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
504 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
505
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
506 #print STDERR "Names for range $chr:$start-$stop are ", join("/", keys %names), "\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
507 return \%names;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
508 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
509
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
510 if(not -d $sift_dir){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
511 die "The specified SIFT directory $sift_dir does not exist\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
512 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
513
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
514 print STDERR "Reading in OMIM morbid map...\n" unless $quiet;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
515 die "Data file $morbidmap_file does not exist, aborting.\n" if not -e $morbidmap_file;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
516 my %gene2morbids;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
517 open(TAB, $morbidmap_file)
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
518 or die "Cannot open OMIM morbid map file $morbidmap_file for reading: $!\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
519 while(<TAB>){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
520 my @fields = split /\|/, $_;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
521 #print STDERR "got ", ($#fields+1), " fields in $_\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
522 #my @fields = split /\|/, $_;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
523 my $morbid = $fields[0];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
524 $morbid =~ s/\s*\(\d+\)$//; # get rid of trailing parenthetical number
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
525 my $omim_id = $fields[2];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
526 for my $genename (split /, /, $fields[1]){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
527 if(not exists $gene2morbids{$genename}){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
528 $gene2morbids{$genename} = "OMIM:$omim_id $morbid";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
529 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
530 else{
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
531 $gene2morbids{$genename} .= "; $morbid";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
532 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
533 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
534 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
535 close(TAB);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
536
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
537 print STDERR "Reading in interpro mappings...\n" unless $quiet;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
538 die "Data file $interpro_bed_file does not exist, aborting.\n" if not -e $interpro_bed_file;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
539 my %interpro_ids;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
540 open(TAB, $interpro_bed_file)
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
541 or die "Cannot open gene name BED file $interpro_bed_file for reading: $!\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
542 while(<TAB>){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
543 chomp;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
544 # format should be "chr start stop interpro desc..."
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
545 my @fields = split /\t/, $_;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
546 my $c = $fields[0];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
547 if(not exists $interpro_ids{$c}){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
548 $interpro_ids{$c} = [] unless exists $interpro_ids{$c};
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
549 $interpro_ids{"chr".$c} = [] unless $c =~ /^chr/;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
550 $interpro_ids{$1} = [] if $c =~ /^chr(\S+)/;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
551 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
552 my $dataref = [@fields[1..3]];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
553 push @{$interpro_ids{$c}}, $dataref;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
554 push @{$interpro_ids{"chr".$c}}, $dataref unless $c =~ /^chr/;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
555 push @{$interpro_ids{$1}}, $dataref if $c =~ /^chr(\S+)/;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
556 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
557 close(TAB);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
558 print STDERR "Sorting interpro matches per contig...\n" unless $quiet;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
559 for my $chr (keys %interpro_ids){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
560 $interpro_ids{$chr} = [sort {$a->[0] <=> $b->[0]} @{$interpro_ids{$chr}}];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
561 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
562
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
563 # {uc(gene_name) => space separated info}
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
564 print STDERR "Reading in tissue info...\n" unless $quiet;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
565 my %tissues;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
566 my %tissue_desc;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
567 open(TISSUE, $tissue_file)
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
568 or die "Cannot open $tissue_file for reading: $!\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
569 while(<TISSUE>){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
570 chomp;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
571 my @fields = split /\t/, $_;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
572 my $gname = uc($fields[1]);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
573 $tissue_desc{$gname} = $fields[2];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
574 next unless $#fields == 4;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
575 my @tis = split /;\s*/, $fields[4];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
576 pop @tis;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
577 if(@tis < 6){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
578 $tissues{$gname} = join(">", @tis);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
579 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
580 elsif(@tis < 24){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
581 $tissues{$gname} = join(">", @tis[0..5]).">...";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
582 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
583 else{
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
584 $tissues{$gname} = join(">", @tis[0..(int($#tis/5))]).">...";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
585 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
586 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
587 close(TISSUE);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
588
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
589 print STDERR "Loading pathway info...\n" unless $quiet;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
590 my %pathways;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
591 my %pathway_ids;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
592 open(PATHWAY, $pathway_file)
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
593 or die "Cannot open $pathway_file for reading: $!\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
594 while(<PATHWAY>){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
595 chomp;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
596 my @fields = split /\t/, $_;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
597 next unless @fields >= 4;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
598 for my $gname (split /\s+/, $fields[1]){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
599 $gname = uc($gname);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
600 if(not exists $pathways{$gname}){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
601 $pathway_ids{$gname} = $fields[2];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
602 $pathways{$gname} = $fields[3];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
603 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
604 else{
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
605 $pathway_ids{$gname} .= "; ".$fields[2];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
606 $pathways{$gname} .= "; ".$fields[3];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
607 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
608 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
609 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
610 close(PATHWAY);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
611
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
612 print STDERR "Loading SIFT indices...\n" unless $quiet;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
613 # Read in the database table list
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
614 my %chr_tables;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
615 open(DBLIST, "$sift_dir/bins.list")
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
616 or die "Cannot open $sift_dir/bins.list for reading: $!\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
617 while(<DBLIST>){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
618 chomp;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
619 my @fields = split /\t/, $_;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
620 if(not exists $chr_tables{$fields[1]}){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
621 $chr_tables{$fields[1]} = {};
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
622 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
623 my $chr = $fields[1];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
624 $chr = "chr$chr" unless $chr =~ /^chr/;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
625 $chr_tables{$chr}->{"$chr\_$fields[2]_$fields[3]"} = [$fields[2],$fields[3]];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
626 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
627 close(DBLIST);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
628
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
629 #Connect to database
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
630 my $db_gene = DBI->connect("dbi:SQLite:dbname=$sift_dir/Human_Supp.sqlite","","",{RaiseError=>1, AutoCommit=>1});
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
631
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
632 my $db_chr;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
633 my $fr_stmt; # retrieval for frameshift
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
634 my $snp_stmt; # retrieval for SNP
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
635 my $cur_chr = "";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
636 my $cur_max_pos;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
637
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
638 if(defined $predict_cryptic_splicings){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
639 if(not -e $predict_cryptic_splicings){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
640 die "Reference FastA file for splicing predictions, $predict_cryptic_splicings, does not exist.\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
641 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
642 if(not -e $predict_cryptic_splicings.".fai" and not -w dirname($predict_cryptic_splicings)){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
643 die "Reference FastA file for splicing predictions, $predict_cryptic_splicings, is not indexed, and the directory is not writable.\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
644 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
645 print STDERR "Loading reference FastA index...\n" unless $quiet;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
646 $fasta_index = Bio::DB::Sam::Fai->load($predict_cryptic_splicings);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
647 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
648
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
649 print STDERR "Annotating variants...\n" unless $quiet;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
650 # Assume contigs and positions in order
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
651 print STDERR "Processing file $hgvs_table_file...\n" unless $quiet;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
652 open(HGVS, $hgvs_table_file)
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
653 or die "Cannot open HGVS file $hgvs_table_file for reading: $!\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
654 # Output file for annotated snps and frameshifts
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
655 my $header = <HGVS>; # header
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
656 chomp $header;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
657 my @headers = split /\t/, $header;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
658 my ($chr_column, $pos_column, $ref_column, $alt_column, $alt_cnt_column, $tot_cnt_column, $ftype_column, $dbid_column, $maf_src_column, $maf_column, $internal_freq_column,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
659 $cdna_hgvs_column, $aa_hgvs_column, $transcript_column, $transcript_length_column, $strand_column, $zygosity_column, $splice_dist_column, $caveats_column, $phase_column,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
660 $srcs_column, $pvalue_column, $to_column, $genename_column, $rares_column, $rares_header);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
661 my %req_columns = (
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
662 "Chr" => \$chr_column,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
663 "DNA From" => \$pos_column,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
664 "DNA To" => \$to_column,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
665 "Gene Name" => \$genename_column,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
666 "Ref base" => \$ref_column,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
667 "Obs base" => \$alt_column,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
668 "Variant Reads" => \$alt_cnt_column,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
669 "Total Reads" => \$tot_cnt_column,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
670 "Feature type" => \$ftype_column,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
671 "Variant DB ID" => \$dbid_column,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
672 "Pop. freq. source" => \$maf_src_column,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
673 "Pop. freq." => \$maf_column,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
674 "Transcript HGVS" => \$cdna_hgvs_column,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
675 "Protein HGVS" => \$aa_hgvs_column,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
676 "Selected transcript" => \$transcript_column,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
677 "Transcript length" => \$transcript_length_column,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
678 "Strand" => \$strand_column,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
679 "Zygosity" => \$zygosity_column,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
680 "Closest exon junction (AA coding variants)" => \$splice_dist_column,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
681 "Caveats" => \$caveats_column,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
682 "Phase" => \$phase_column,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
683 "P-value" => \$pvalue_column);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
684 &load_header_columns(\%req_columns, \@headers);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
685 for(my $i = 0; $i <= $#headers; $i++){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
686 # Optional columns go here
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
687 if($headers[$i] eq "Sources"){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
688 $srcs_column = $i;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
689 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
690 elsif($headers[$i] eq "Internal pop. freq."){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
691 $internal_freq_column = $i;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
692 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
693 elsif($headers[$i] =~ /^Num rare variants/){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
694 $rares_column = $i;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
695 $rares_header = $headers[$i];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
696 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
697 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
698 open(OUT, ">$outfile")
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
699 or die "Cannot open $outfile for writing: $!\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
700 print OUT join("\t", "Chr", "DNA From", "DNA To", "Ref base", "Obs base", "% ref read","% variant read",
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
701 "Variant Reads", "Total Reads", "Feature type","Variant type", "Variant context", "Variant DB ID", "Pop. freq. source",
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
702 "Pop. freq."), (defined $internal_freq_column ? "\tInternal pop. freq.\t" : "\t"), join("\t", "SIFT Score", "Polyphen2 Score", "Polyphen2 call", "GERP score",
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
703 "Gene Name", "Transcript HGVS",
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
704 "Protein HGVS", "Zygosity","Closest exon junction (AA coding variants)","Splicing alteration potential","Splicing alteration details",
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
705 "OMIM Morbid Map","Tissue expression", "Pathways", "Pathway refs",
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
706 "Gene Function", "Spanning InterPro Domains", "P-value", "Caveats", "Phase",
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
707 "Selected transcript", "Transcript length", "Strand"),
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
708 (defined $rares_column ? "\t$rares_header" : ""),
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
709 (defined $srcs_column ? "\tSources" : ""), "\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
710 while(<HGVS>){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
711 chomp;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
712 my @fields = split /\t/, $_, -1;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
713 my $mode;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
714 my $protein_hgvs = $fields[$aa_hgvs_column];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
715 my $refSeq = $fields[$ref_column];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
716 my $newBase = $fields[$alt_column];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
717 my $splicing_potential = "NA";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
718 my $splicing_details = "NA";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
719 if(defined $predict_cryptic_splicings){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
720 my $distance = $fields[$splice_dist_column]; # only for coding variants
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
721 my $site_type;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
722 if($distance eq "NA"){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
723 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
724 elsif($distance){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
725 $site_type = $distance < 0 ? "donor" : "acceptor"; # exon context
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
726 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
727 else{
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
728 if($fields[$cdna_hgvs_column] =~ /\d+([\-+]\d+)/){ # get from the cDNA HGVS otherwise
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
729 $distance = $1;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
730 $distance =~ s/^\+//;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
731 $site_type = $distance > 0 ? "donor" : "acceptor"; # intron context
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
732 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
733 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
734 # only do splicing predictions for novel or rare variants, since these predictions are expensive
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
735 if($distance and $distance ne "NA" and ($fields[$maf_column] eq "NA" or $fields[$maf_column] <= 0.01)){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
736 #print STDERR "Running prediction for rare variant ($fields[3], MAF=$fields[14]): $fields[5], $fields[6], $refSeq, $newBase, $fields[4], $distance, $site_type\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
737 ($splicing_potential, $splicing_details) = &predict_splicing($fields[$chr_column], $fields[$pos_column], $refSeq, $newBase, $fields[$strand_column], $distance, $site_type);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
738 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
739 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
740 my $lookupBase;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
741 if($fields[$strand_column] eq "-"){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
742 if($refSeq ne "NA"){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
743 $refSeq = reverse($refSeq);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
744 $refSeq =~ tr/ACGTacgt/TGCAtgca/;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
745 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
746 if($newBase ne "NA"){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
747 $newBase = reverse($newBase);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
748 $newBase =~ tr/ACGTacgt/TGCAtgca/;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
749 $newBase =~ s(TN/)(NA/);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
750 $newBase =~ s(/TN)(/NA);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
751 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
752 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
753 if($fields[$cdna_hgvs_column] =~ /^g\..*?(?:ins|delins|>)([ACGTN]+)$/){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
754 $mode = "snps";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
755 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
756 elsif($fields[$cdna_hgvs_column] =~ /^g\..*?del([ACGTN]+)$/){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
757 $mode = "snps";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
758 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
759 elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+_\d+(?:[\-+]\d+)?ins([ACGTN]+)$/ or $fields[$cdna_hgvs_column] =~ /^c\.\d+(?:[\-+]\d+)?_\d+ins([ACGTN]+)$/){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
760 if(not length($1)%3){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
761 $mode = "snps";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
762 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
763 else{
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
764 $mode = "frameshifts";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
765 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
766 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
767 elsif($fields[$cdna_hgvs_column] =~ /^c\.(\d+)(?:[\-+]\d+)?_(\d+)(?:[\-+]\d+)?delins([ACGTN]+)$/){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
768 if(not (length($3)-$2+$1-1)%3){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
769 $mode = "snps";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
770 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
771 else{
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
772 $mode = "frameshifts";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
773 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
774 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
775 elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+[+\-]\d+_\d+[+\-]\d+delins([ACGTN]+)$/ or
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
776 $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?_[\-*]\d+(?:[\-+]\d+)?del([ACGTN]*)$/ or
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
777 $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?del([ACGTN]*)$/ or
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
778 $fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+del([ACGTN]*)$/ or
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
779 $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?_[\-*]\d+(?:[\-+]\d+)?(?:ins|delins)?([ACGTN]+)$/){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
780 $mode = "snps";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
781 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
782 elsif($fields[$cdna_hgvs_column] =~ /^c\.([-*]?\d+)del(\S+)/){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
783 my $dist = $1;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
784 $dist =~ s/^\*//;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
785 if(abs($dist) < 4){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
786 $mode = "snps";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
787 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
788 else{
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
789 $mode = "frameshifts";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
790 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
791 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
792 elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+_\d+[\-+]\d+ins(\S*)/ or
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
793 $fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+_\d+[\-+]\d+del(\S*)/){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
794 $mode = "snps";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
795 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
796 elsif($fields[$cdna_hgvs_column] =~ /^c\.(-?\d+)_(\d+)(?:\+\d+)?del(\S+)/ or
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
797 $fields[$cdna_hgvs_column] =~ /^c\.(\d+)(?:\-\d+)?_(\d+)del(\S+)/){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
798 if(not ($2-$1+1)%3){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
799 $mode = "snps";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
800 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
801 else{
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
802 $mode = "frameshifts";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
803 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
804 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
805 elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+_\d+ins([ACGTN]+)$/){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
806 if(not length($1)%3){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
807 $mode = "snps";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
808 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
809 else{
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
810 $mode = "frameshifts";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
811 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
812 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
813 # special case of shifted start codon
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
814 elsif($fields[$cdna_hgvs_column] =~ /^c\.-1_1+ins([ACGTN]+)$/ or
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
815 $fields[$cdna_hgvs_column] =~ /inv$/){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
816 $mode = "snps";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
817 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
818 elsif($fields[$cdna_hgvs_column] =~ /[ACGTN]>([ACGTN])$/){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
819 # todo: handle multiple consecutive substitutions, will have to run polyphen separately
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
820 $lookupBase = $newBase;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
821 if($fields[$strand_column] eq "-"){ # revert to positive strand variant for SIFT index
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
822 $lookupBase =~ tr/ACGTacgt/TGCAtgca/;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
823 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
824 $mode = "snps";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
825 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
826 else{
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
827 $mode = "snps";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
828 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
829 if($fields[$chr_column] ne $cur_chr){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
830 #Prepared DB connection on per-chromosome basis
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
831 $cur_chr = $fields[$chr_column];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
832 $cur_chr =~ s/^chr//;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
833 $db_chr = DBI->connect("dbi:SQLite:dbname=$sift_dir/Human_CHR$cur_chr.sqlite",
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
834 "",
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
835 "",
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
836 { RaiseError => 1, AutoCommit => 1 }) unless not -e "$sift_dir/Human_CHR$cur_chr.sqlite";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
837
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
838 $cur_max_pos = -1;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
839 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
840 if($fields[$pos_column] =~ /^\d+$/ and $cur_max_pos < $fields[$pos_column]){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
841 undef $fr_stmt;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
842 undef $snp_stmt;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
843 my $c = $fields[$chr_column];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
844 $c = "chr$c" if $c !~ /^chr/;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
845 for my $chr_table_key (keys %{$chr_tables{$c}}){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
846 my $chr_table_range = $chr_tables{$c}->{$chr_table_key};
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
847 if($chr_table_range->[0] <= $fields[$pos_column] and $chr_table_range->[1] >= $fields[$pos_column]){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
848 $fr_stmt = $db_chr->prepare("select SCORE".
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
849 " from $chr_table_key where COORD1 = ?");
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
850 $snp_stmt = $db_chr->prepare("select SCORE".
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
851 " from $chr_table_key where COORD1 = ? AND NT2 = ?");
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
852 $cur_max_pos = $chr_table_range->[1];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
853 last;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
854 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
855 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
856 if(not defined $fr_stmt and not $quiet){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
857 warn "Got request for position not in range of SIFT ($c:$fields[$pos_column])\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
858 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
859 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
860
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
861 my @cols;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
862 if(not $fr_stmt){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
863 @cols = ("NA");
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
864 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
865 elsif($mode eq "frameshifts"){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
866 $fr_stmt->execute($fields[$pos_column]);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
867 @cols = $fr_stmt->fetchrow_array();
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
868 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
869 else{
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
870 $snp_stmt->execute($fields[$pos_column]-1, $lookupBase);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
871 @cols = $snp_stmt->fetchrow_array();
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
872 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
873 # See if the change is in a CDS, and if it's non-synonymous
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
874 my ($type, $location) = ("silent", "intron");
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
875 my $start = $fields[$pos_column];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
876 my $stop = $fields[$to_column];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
877 my $interpro = get_range_info(\%interpro_ids, $fields[$chr_column], $start, $stop);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
878 my $domains = join("; ", sort keys %$interpro);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
879
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
880 my $sift_score = "NA";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
881 my $variant_key = "NA";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
882 my $transcript_hgvs = $fields[$cdna_hgvs_column];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
883 if($transcript_hgvs =~ /\.\d+[ACGTN]>/ or $transcript_hgvs =~ /\.\d+(?:_|del|ins)/ or $transcript_hgvs =~ /[+\-]\?/){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
884 $location = "exon";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
885 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
886 elsif($transcript_hgvs =~ /\.\d+[\-+](\d+)/){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
887 if($1 < 21){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
888 $type = "splice";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
889 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
890 else{
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
891 $type = "intronic";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
892 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
893 $location = "splice site";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
894 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
895 elsif($transcript_hgvs =~ /\.\*\d+/){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
896 $location = "3'UTR";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
897 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
898 elsif($transcript_hgvs =~ /\.-\d+/){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
899 $location = "5'UTR";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
900 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
901 if($mode eq "frameshifts"){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
902 $type = "frameshift";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
903 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
904 else{
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
905 if(not defined $protein_hgvs){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
906 $type = "unknown";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
907 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
908 elsif($protein_hgvs eq "NA"){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
909 $type = "non-coding";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
910 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
911 elsif($protein_hgvs =~ /=$/){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
912 $type = "silent";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
913 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
914 elsif($protein_hgvs =~ /\d+\*/){ # nonsense
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
915 $type = "nonsense";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
916 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
917 elsif($protein_hgvs =~ /ext/){ # nonsense
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
918 $type = "nonstop";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
919 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
920 elsif($protein_hgvs eq "p.0?" or $protein_hgvs eq "p.?"){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
921 $type = "unknown";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
922 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
923 else{
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
924 $type = "missense";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
925 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
926
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
927 $sift_score = $cols[0] || "NA";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
928
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
929 # Record the need to fetch VCF polyphen and gerp data later (en masse, for efficiency)
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
930 # Use newBase instead of lookupBase for PolyPhen since the orientation is always relative to the transcript
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
931 $variant_key = get_variant_key($fields[$chr_column], $fields[$pos_column], $fields[$ref_column], $newBase);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
932 #print STDERR "Variant key is $variant_key\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
933 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
934 my $transcript_type = $fields[$ftype_column];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
935 my $transcript_length = $fields[$transcript_length_column];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
936 my $transcript_id = $fields[$transcript_column]; # sift score is derived from this transcript
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
937 my $transcript_strand = $fields[$strand_column];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
938 my $zygosity = $fields[$zygosity_column];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
939 my $rsID = $fields[$dbid_column];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
940 my $popFreq = $fields[$maf_column];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
941 my $internalFreq;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
942 $internalFreq = $fields[$internal_freq_column] if defined $internal_freq_column;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
943 my $popFreqSrc = $fields[$maf_src_column];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
944
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
945 # Get the gene name and omim info
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
946 my %seen;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
947 my @gene_names = split /; /, $fields[$genename_column];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
948 my $morbid = join("; ", grep {/\S/ and not $seen{$_}++} map({$gene2morbids{$_} or ""} @gene_names));
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
949 my $tissue = join("; ", grep {/\S/ and not $seen{$_}++} map({$tissues{$_} or ""} @gene_names));
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
950 my $function = join("; ", grep {/\S/ and not $seen{$_}++} map({$tissue_desc{$_} or ""} @gene_names));
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
951 my $pathways = join("; ", grep {/\S/ and not $seen{$_}++} map({$pathways{$_} or ""} @gene_names));
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
952 my $pathway_ids = join("; ", grep {/\S/ and not $seen{$_}++} map({$pathway_ids{$_} or ""} @gene_names));
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
953
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
954 # It may be that the calls and freqs are multiple concatenated values, e.g. "X; Y"
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
955 my @tot_bases = split /; /,$fields[$tot_cnt_column];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
956 my @var_bases = split /; /,$fields[$alt_cnt_column];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
957 my (@pct_nonvar, @pct_var);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
958 for (my $i = 0; $i <= $#tot_bases; $i++){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
959 push @pct_nonvar, $tot_bases[$i] =~ /^(NA|0)$/ ? $tot_bases[$i] : int(($tot_bases[$i]-$var_bases[$i])/$tot_bases[$i]*100+0.5);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
960 push @pct_var, $tot_bases[$i] =~ /^(NA|0)$/ ? $tot_bases[$i] : int($var_bases[$i]/$tot_bases[$i]*100+0.5);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
961 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
962
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
963 push @lines, [
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
964 $fields[$chr_column], # chr
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
965 $fields[$pos_column], # pos
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
966 $fields[$to_column],
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
967 $fields[$ref_column],
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
968 $fields[$alt_column],
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
969 join("; ", @pct_nonvar), # pct ref
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
970 join("; ", @pct_var), # pct var
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
971 $fields[$alt_cnt_column], # num var
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
972 $fields[$tot_cnt_column], # num reads
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
973 $transcript_type, # protein_coding, pseudogene, etc.
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
974 $type,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
975 $location,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
976 $rsID,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
977 $popFreqSrc,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
978 (defined $internal_freq_column ? ($popFreq,$internalFreq) : ($popFreq)),
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
979 $sift_score,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
980 $variant_key, # 16th field
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
981 join("; ", @gene_names),
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
982 $transcript_hgvs,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
983 $protein_hgvs,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
984 $zygosity,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
985 $fields[$splice_dist_column], # distance to exon edge
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
986 $splicing_potential,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
987 $splicing_details,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
988 $morbid,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
989 $tissue,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
990 $pathways,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
991 $pathway_ids,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
992 $function,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
993 $domains,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
994 $fields[$pvalue_column],
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
995 $fields[$caveats_column], # caveats
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
996 (defined $rares_column ? ($fields[$rares_column],$transcript_id) : ($transcript_id)),
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
997 $transcript_length,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
998 $transcript_strand,
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
999 (defined $srcs_column ? ($fields[$phase_column],$fields[$srcs_column]) : ($fields[$phase_column]))];
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1000 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1001
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1002 print STDERR "Adding Polyphen2 and GERP info...\n" unless $quiet;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1003 # retrieve the polyphen and gerp info en masse for each chromosome, as this is much more efficient
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1004 my $vkey_col = 16+(defined $internal_freq_column ? 1 : 0);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1005 for my $line_fields (@lines){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1006 # expand the key into the relevant MAF values
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1007 $line_fields->[$vkey_col] = join("\t", polyphen_gerp_info($gerp_dir,$polyphen_file,$line_fields->[$vkey_col])); # fields[$vkey_col] is variant key
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1008 print OUT join("\t", @{$line_fields}), "\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1009 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1010 close(OUT);
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1011 unlink $tmpfile if defined $tmpfile;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1012
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1013 sub load_header_columns{
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1014 my ($reqs_hash_ref, $headers_array_ref) = @_;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1015 my %unfulfilled;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1016 for my $field_name (keys %$reqs_hash_ref){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1017 $unfulfilled{$field_name} = 1;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1018 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1019 for(my $i = 0; $i <= $#{$headers_array_ref}; $i++){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1020 for my $req_header_name (keys %$reqs_hash_ref){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1021 if($req_header_name eq $headers_array_ref->[$i]){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1022 ${$reqs_hash_ref->{$req_header_name}} = $i;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1023 delete $unfulfilled{$req_header_name};
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1024 last;
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1025 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1026 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1027 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1028 if(keys %unfulfilled){
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1029 die "Aborting. Could not find headers in the input file for the following required fields: ", join(", ", sort keys %unfulfilled), "\n";
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1030 }
40dd2e7ee63a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1031 }