annotate hgvs_table_annotate @ 0:9977d1935a07 default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:10:47 -0600
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1 #!/usr/bin/env perl
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
3 use Bio::DB::Sam;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
4 use DBI;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
5 use IPC::Open2;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
6 use Set::IntervalTree;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
7 use strict;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
8 use warnings;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
9 use vars qw($quiet %chr2variant_locs %polyphen_info %chr2polyphen_vcf_lines %chr2gerp_vcf_lines %range2genes $fasta_index $UNAFFECTED $tmpfile);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
10 use File::Basename;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
11 $UNAFFECTED = -299999; # sentinel for splice site predictions
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
12 my $current_directory = dirname(__FILE__);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
13 if(@ARGV == 1 and $ARGV[0] eq "-v"){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
14 print "Version 1.0\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
15 exit;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
16 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
17
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
18 #configuration file stuff
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
19 my %config;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
20 my $tool_dir = shift @ARGV;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
21 if(not -e "$tool_dir/annotate_hgvs.loc"){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
22 system("cp $current_directory/tool-data/annotate_hgvs.loc $tool_dir/annotate_hgvs.loc");
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
23 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
24 open CONFIG, '<', "$tool_dir/annotate_hgvs.loc";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
25 while(<CONFIG>){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
26 next if $_ =~ /^#/;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
27 (my $key, my $value) = split(/\s+/,$_);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
28 $config{$key} = $value;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
29 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
30 close CONFIG;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
31 my $dbs_dir = $config{"dbs_dir"};
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
32
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
33 $quiet = 0;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
34 if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
35 $quiet = 1;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
36 shift @ARGV;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
37 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
38
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
39 @ARGV == 9 or @ARGV == 10 or die "Usage: $0 -q <SIFT dir> <POLYPHEN2 file> <GERP dir> <tissuedb file> <pathway file> <interpro domains.bed> <omim morbidmap> <input hgvs table.txt> <output hgvs table.annotated.txt> [reference.fasta]\n Where if the reference fasta is provided, we predict cryptic splicing effects (using MaxEntScan and GeneSplicer)\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
40
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
41 my $sift_dir = $dbs_dir . shift @ARGV;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
42 my $polyphen_file = $dbs_dir . shift @ARGV;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
43 my $gerp_dir = $dbs_dir . shift @ARGV;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
44 my $tissue_file = $dbs_dir . shift @ARGV;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
45 my $pathway_file = $dbs_dir. shift @ARGV;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
46 my $interpro_bed_file = $dbs_dir . shift @ARGV;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
47 my $morbidmap_file = $dbs_dir . shift @ARGV;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
48 my $hgvs_table_file = shift @ARGV;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
49 my $outfile = shift @ARGV;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
50 my $predict_cryptic_splicings = @ARGV ? ($dbs_dir . shift @ARGV) : undef;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
51 my $flanking_bases = 500; # assume this is ROI around a gene
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
52 my @lines;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
53
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
54 sub polyphen_gerp_info($$$){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
55 my($gerp_dir,$polyphen_file,$info_key) = @_;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
56
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
57 my($chr,$pos,$ref,$variant) = split /:/, $info_key;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
58
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
59 # is this the first call for this chromosome? If so, retrieve the VCF lines for it en masse
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
60 if(not exists $chr2polyphen_vcf_lines{$chr}){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
61 ($chr2polyphen_vcf_lines{$chr}, $chr2gerp_vcf_lines{$chr}) = retrieve_vcf_lines($gerp_dir,$polyphen_file,$chr);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
62 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
63
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
64 my @results;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
65 if(not $ref or not $variant){ # can only get data for SNPs
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
66 @results = qw(NA NA NA);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
67 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
68 #print STDERR "ref = $ref and variant = $variant for $info_key\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
69 elsif(length($ref) == 1 and length($variant) == 1){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
70 #print STDERR "Grabbing poly/gerp for $chr,$pos,$ref,$variant\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
71 @results = polyphen_info($chr,$pos,$variant);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
72 $results[2] = gerp_info($chr,$pos);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
73 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
74
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
75 # It may be that a novel MNP variant is a bunch of known SNPs in a row. In this case,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
76 # report the list of variants
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
77 elsif(length($ref) == length($variant) and $ref ne "NA"){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
78 my (@poly_scores, @poly_calls, @gerp_scores);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
79 for(my $i = 0; $i < length($ref); $i++){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
80 my $refbase = substr($ref, $i, 1);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
81 my $newbase = substr($variant, $i, 1);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
82 if($newbase eq $refbase){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
83 push @poly_scores, "-";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
84 push @poly_calls, "-";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
85 push @gerp_scores, "-";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
86 next;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
87 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
88 my @base_results = polyphen_info($chr,$pos+$i,$refbase,$newbase);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
89 push @poly_scores, $base_results[0];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
90 push @poly_calls, $base_results[1];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
91 $base_results[2] = gerp_info($chr,$pos+$i);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
92 push @gerp_scores, $base_results[2];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
93 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
94 $results[0] = join(",", @poly_scores);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
95 $results[1] = join(",", @poly_calls);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
96 $results[2] = join(",", @gerp_scores);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
97 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
98 else{
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
99 @results = qw(NA NA NA);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
100 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
101
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
102 return @results;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
103 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
104
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
105 sub get_variant_key($$$$){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
106 # params chr pos ref variant
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
107 my $c = $_[0];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
108 $c =~ s/^chr//;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
109 my $prop_info_key = join(":", $c, @_[1..3]);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
110 $chr2variant_locs{$c} = [] unless exists $chr2variant_locs{$c};
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
111 push @{$chr2variant_locs{$c}}, $_[1];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
112 return $prop_info_key;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
113 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
114
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
115 sub retrieve_vcf_lines($$$){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
116 my ($gerp_dir, $polyphen_file, $chr) = @_;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
117
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
118 my (%polyphen_lines, %gerp_lines);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
119
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
120 if(not exists $chr2variant_locs{$chr}){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
121 return ({}, {}); # no data requested for this chromosome
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
122 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
123
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
124 # build up the request
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
125 my @tabix_regions;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
126 my @var_locs = grep /^\d+$/, @{$chr2variant_locs{$chr}}; # grep keeps point mutations only
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
127 if(not @var_locs){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
128 return ({}, {}); # no point mutations
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
129 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
130
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
131 # sort by variant start location
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
132 for my $var_loc (sort {$a <=> $b} @var_locs){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
133 push @tabix_regions, $chr.":".$var_loc."-".$var_loc;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
134 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
135 my $regions = join(" ", @tabix_regions);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
136
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
137 # retrieve the data
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
138 die "Cannot find Polyphen2 VCF file $polyphen_file\n" if not -e $polyphen_file;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
139 open(VCF, "tabix $polyphen_file $regions |")
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
140 or die "Cannot run tabix on $polyphen_file: $!\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
141 while(<VCF>){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
142 if(/^\S+\t(\d+)/ and grep {$_ eq $1} @var_locs){ # take only main columns to save room, if possible
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
143 chomp;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
144 $polyphen_lines{$1} = [] unless exists $polyphen_lines{$1};
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
145 push @{$polyphen_lines{$1}}, $_;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
146 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
147 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
148 close(VCF);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
149
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
150 my $vcf_file = "$gerp_dir/chr$chr.tab.gz";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
151 if(not -e $vcf_file){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
152 warn "Cannot find GERP VCF file $vcf_file\n" if not $quiet;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
153 return ({}, {});
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
154 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
155 open(VCF, "tabix $vcf_file $regions |")
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
156 or die "Cannot run tabix on $vcf_file: $!\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
157 while(<VCF>){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
158 if(/^(\S+\t(\d+)(?:\t\S+){2})/ and grep {$_ eq $2} @var_locs){ # take only main columns to save room, if possible
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
159 $gerp_lines{$2} = [] unless exists $gerp_lines{$2};
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
160 push @{$gerp_lines{$2}}, $1;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
161 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
162 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
163 close(VCF);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
164
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
165 return (\%polyphen_lines, \%gerp_lines);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
166 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
167
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
168 sub polyphen_info($$$){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
169 my ($chr, $pos, $variant_base) = @_;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
170
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
171 my $key = "$chr:$pos:$variant_base";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
172 if(exists $polyphen_info{$key}){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
173 return @{$polyphen_info{$key}};
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
174 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
175
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
176 #print STDERR "Checking for polyphen data for $chr, $pos, $variant_base\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
177 if(exists $chr2polyphen_vcf_lines{$chr}->{$pos}){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
178 for(@{$chr2polyphen_vcf_lines{$chr}->{$pos}}){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
179 #print STDERR "Checking $chr, $pos, $variant_base against $_";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
180 my @fields = split /\t/, $_;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
181 if($fields[4] eq $variant_base){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
182 if($fields[6] eq "D"){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
183 $polyphen_info{$key} = [$fields[5],"deleterious"];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
184 last;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
185 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
186 elsif($fields[6] eq "P"){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
187 $polyphen_info{$key} = [$fields[5],"poss. del."];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
188 last;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
189 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
190 elsif($fields[6] eq "B"){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
191 $polyphen_info{$key} = [$fields[5],"benign"];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
192 last;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
193 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
194 else{
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
195 $polyphen_info{$key} = [$fields[5],$fields[6]];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
196 last;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
197 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
198 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
199 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
200 if(not exists $polyphen_info{$key}){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
201 $polyphen_info{$key} = ["NA", "NA"];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
202 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
203 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
204 else{
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
205 $polyphen_info{$key} = ["NA", "NA"];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
206 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
207 return @{$polyphen_info{$key}};
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
208 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
209
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
210 sub gerp_info($$){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
211 my ($chr,$pos) = @_;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
212
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
213 if(exists $chr2gerp_vcf_lines{$chr}->{$pos}){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
214 for(@{$chr2gerp_vcf_lines{$chr}->{$pos}}){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
215 my @fields = split /\t/, $_;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
216 return $fields[3];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
217 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
218 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
219 return "NA";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
220 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
221
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
222 sub predict_splicing{
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
223 my ($chr, $pos, $ref, $var, $strand, $exon_edge_distance, $splice_type) = @_;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
224
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
225 my $disruption_level = 0;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
226 my @disruption_details;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
227 # The methods being used only look for up to a 30 base context, so only
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
228 # need to check for obliterated acceptors/donors if they are nearby
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
229 $exon_edge_distance-- if $exon_edge_distance > 0; # +1 is actually in the right spot, so decrement
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
230 my $edge_offset = $strand eq "-" ? -1*$exon_edge_distance : $exon_edge_distance; # flip offset sign if the call was on the negative strand (since +/- was relative to the transcript, not the genome coordinates)
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
231 my $annotated_detected = 0;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
232 my ($genesplicer_orig, $genesplicer_orig_pos) = call_genesplicer($splice_type, $chr, $strand, $pos-$edge_offset, $ref);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
233 if($genesplicer_orig != 0){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
234 $annotated_detected = 1;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
235 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
236 my ($maxentscan_orig, $maxentscan_orig_pos) = call_maxentscan($splice_type, $chr, $strand, $pos-$edge_offset, $ref);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
237 if(not $annotated_detected and $maxentscan_orig){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
238 $annotated_detected = 1;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
239 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
240 my ($genesplicer_mod, $genesplicer_mod_pos) = call_genesplicer($splice_type, $chr, $strand, $pos-$edge_offset, $ref, $var, $exon_edge_distance);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
241 if($genesplicer_mod != $UNAFFECTED){ # was the mod within the range where this software could detect up or downstream effects on the annotated splice site?
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
242 if($genesplicer_orig_pos != $genesplicer_mod_pos){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
243 $disruption_level+=0.75;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
244 if($genesplicer_mod_pos == -1){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
245 push @disruption_details, "Variant obliterates GeneSplicer predicted splice site from the reference sequence ($chr:$genesplicer_orig_pos, orig score $genesplicer_orig)";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
246 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
247 elsif($genesplicer_orig_pos != -1){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
248 push @disruption_details, "Variant moves GeneSplicer preferred splice site from annotated location $chr:$genesplicer_orig_pos to $chr:$genesplicer_mod_pos (scores: orig=$genesplicer_orig, new=$genesplicer_mod)";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
249 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
250 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
251 elsif($genesplicer_orig-$genesplicer_mod > 1){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
252 $disruption_level+=0.5;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
253 push @disruption_details, "Variant significantly lowers GeneSplicer score for original $splice_type splice site located at $chr:$genesplicer_mod_pos ($genesplicer_mod vs. $genesplicer_orig)";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
254 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
255 elsif($genesplicer_orig > $genesplicer_mod){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
256 $disruption_level+=0.25;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
257 push @disruption_details, "Variant slightly lowers GeneSplicer score for original $splice_type splice site located at $chr:$genesplicer_mod_pos ($genesplicer_mod vs. $genesplicer_orig)";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
258 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
259 $genesplicer_orig = $genesplicer_mod; # so that splice site comparison vs. cryptic is accurate
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
260 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
261 my ($maxentscan_mod, $maxentscan_mod_pos) = call_maxentscan($splice_type, $chr, $strand, $pos-$edge_offset, $ref, $var, $exon_edge_distance);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
262 if($maxentscan_mod != $UNAFFECTED){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
263 if($maxentscan_mod_pos != $maxentscan_orig_pos){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
264 $disruption_level+=1;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
265 if($maxentscan_mod_pos == -1){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
266 push @disruption_details, "Variant obliterates MaxEntScan predicted splice site from the reference sequence ($chr:$maxentscan_orig_pos, orig score $maxentscan_orig)";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
267 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
268 elsif($maxentscan_orig_pos != -1){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
269 push @disruption_details, "Variant moves MaxEntScan preferred splice site from annotated location $chr:$maxentscan_orig_pos to $chr:$maxentscan_mod_pos (scores: orig=$maxentscan_orig, new=$maxentscan_mod)";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
270 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
271 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
272 elsif($maxentscan_orig-$maxentscan_mod > 1){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
273 $disruption_level+=0.5;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
274 push @disruption_details, "Variant significantly lowers MaxEntScan score for original $splice_type splice site located at $chr:$maxentscan_mod_pos ($maxentscan_mod vs. $maxentscan_orig)";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
275 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
276 elsif($maxentscan_orig>$maxentscan_mod){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
277 $disruption_level+=0.25;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
278 push @disruption_details, "Variant slightly lowers MaxEntScan score for original $splice_type splice site located at $chr:$maxentscan_mod_pos ($maxentscan_mod vs. $maxentscan_orig)";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
279 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
280 $maxentscan_orig = $maxentscan_mod;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
281 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
282
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
283 # Regardless of location, see if there is an increased chance of creating a cryptic splicing site
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
284 my ($genesplicer_control, $genesplicer_control_pos) = call_genesplicer($splice_type, $chr, $strand, $pos, $ref);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
285 my ($genesplicer_cryptic, $genesplicer_cryptic_pos) = call_genesplicer($splice_type, $chr, $strand, $pos, $ref, $var);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
286 if(defined $genesplicer_orig and $genesplicer_control < $genesplicer_orig and $genesplicer_orig < $genesplicer_cryptic){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
287 $disruption_level+=0.75;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
288 push @disruption_details, "Variant causes GeneSplicer score for a potential cryptic $splice_type splice site at $chr:$genesplicer_cryptic_pos to become higher than the annotated splice site $chr:$genesplicer_orig_pos (orig=$genesplicer_orig vs. var=$genesplicer_cryptic)";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
289 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
290 if($genesplicer_control - $genesplicer_cryptic < -1){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
291 $disruption_level+=0.5;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
292 push @disruption_details, "Variant raises GeneSplicer score for a potential cryptic $splice_type splice site (orig=$genesplicer_control vs. var=$genesplicer_cryptic)";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
293 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
294 my ($maxentscan_control, $maxentscan_control_pos) = call_maxentscan($splice_type, $chr, $strand, $pos, $ref);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
295 my ($maxentscan_cryptic, $maxentscan_cryptic_pos) = call_maxentscan($splice_type, $chr, $strand, $pos, $ref, $var);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
296 if(defined $maxentscan_orig and $maxentscan_control < $maxentscan_orig and $maxentscan_orig < $maxentscan_cryptic){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
297 $disruption_level++;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
298 push @disruption_details, "Variant causes MaxEntScan score for a potential cryptic $splice_type splice site at $chr:$maxentscan_cryptic_pos to become higher than the annotated splice site $chr:$maxentscan_orig_pos (orig=$maxentscan_orig vs. var=$maxentscan_cryptic)";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
299 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
300 if($maxentscan_control - $maxentscan_cryptic < -1 and $maxentscan_cryptic > 0){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
301 $disruption_level+=0.5;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
302 push @disruption_details, "Variant raises MaxEntScan score for a potential cryptic $splice_type splice site (orig=$maxentscan_control vs. var=$maxentscan_cryptic)";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
303 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
304
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
305 # If neither method predicted the annotated donor site, and the alternate site isn't predicted either, note that so the user doesn't think the mod is definite okay
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
306 if(not @disruption_details and $maxentscan_orig < 0 and $genesplicer_orig < 0){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
307 $disruption_level+=0.5;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
308 push @disruption_details, "Neither the annotated splice site nor any potential cryptic site introduced by the variant are predicted by either GeneSplicer or MaxEntScan, therefore the alt splicing potential for this variant should be evaluated manually";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
309 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
310
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
311 return ($disruption_level, join("; ", @disruption_details));
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
312 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
313
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
314 sub call_genesplicer{
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
315 my($splice_type, $chr, $strand, $pos, $ref, $var, $var_offset) = @_;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
316
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
317 if($splice_type eq "acceptor"){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
318 $pos += $strand eq "-" ? 2: -2;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
319 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
320 my $num_flanking = 110; # seems to croak if less than 160bp provided
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
321 my $indel_size = defined $var ? length($var) - length($ref) : 0; # - is del, + is ins
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
322 my $site_window = defined $var_offset ? 2 : ($indel_size > 0 ? 29+$indel_size : 29); # exact position if looking for exisitng site effect, otherwise 10 or 10 + the insert size
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
323 my $cmd = "genesplicer";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
324 my $start = $pos - $num_flanking;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
325 $start = 1 if $start < 1;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
326 my $stop = $pos + $num_flanking + ($indel_size < 0 ? -1*$indel_size:0); # add extra bases if a del so we still have 2*$num_flanking after the edit
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
327 $stop++; # end coordinate is exclusive
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
328 return ($UNAFFECTED, -1) if defined $var_offset and ($pos+$var_offset < $start or $pos+$var_offset > $stop); # variant is outside of range we can detect affect on original splice site
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
329 my $seq = $fasta_index->fetch("$chr:$start-$stop");
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
330 if(defined $var){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
331 if(defined $var_offset){ # substitution away from the site of interest
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
332 substr($seq, $num_flanking+$var_offset, length($ref)) = $var;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
333 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
334 else{ # substitution at the site of interest
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
335 substr($seq, $num_flanking, length($ref)) = $var;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
336 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
337 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
338 if($strand eq "-"){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
339 $seq = reverse($seq);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
340 $seq =~ tr/ACGTacgt/TGCAtgca/;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
341 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
342 $tmpfile = "$$.fasta";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
343 open(TMP, ">$tmpfile") or die "Could not open $tmpfile for writing: $!\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
344 print TMP ">$chr $strand $pos $ref",($var?" $var":""), ($var_offset?" $var_offset":""),"\n$seq\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
345 substr($seq, $num_flanking, 2) = lc(substr($seq, $num_flanking, 2));
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
346 #print STDERR "$chr $strand $pos $ref $var $var_offset: $seq\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
347 close(TMP);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
348 #print STDERR "$cmd $tmpfile /export/achri_data/programs/GeneSplicer/human\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
349 #<STDIN>;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
350 open(GS, "$cmd $tmpfile human 2> /dev/null|")
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
351 or die "Could not run $cmd: $!\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
352 my $highest_score = 0;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
353 my $highest_position = -1;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
354 while(<GS>){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
355 # End5 End3 Score "confidence" splice_site_type
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
356 # E.g. :
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
357 # 202 203 2.994010 Medium donor
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
358 chomp;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
359 my @F = split /\s+/, $_;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
360 next if $F[1] < $F[0]; # wrong strand
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
361 if($splice_type eq $F[4]){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
362 if(abs($F[0]-$num_flanking-1) <= $site_window){ # right type and approximate location
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
363 #print STDERR "*$_\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
364 if($F[2] > $highest_score){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
365 $highest_score = $F[2];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
366 # last bit accounts for indels in the position offset
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
367 $highest_position = $pos + ($strand eq "-" ? -1: 1)*(-$num_flanking + $F[0] - 1) + (($F[0] <= $num_flanking && $strand eq "+" or $F[0] >= $num_flanking && $strand eq "-") ? -$indel_size : 0);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
368 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
369 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
370 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
371 #else{print STDERR $_,"\n";}
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
372 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
373 close(GS);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
374 return ($highest_score, $highest_position);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
375 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
376
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
377 sub call_maxentscan{
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
378 my($splice_type, $chr, $strand, $pos, $ref, $var, $var_offset) = @_;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
379
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
380 my $indel_size = defined $var ? length($var) - length($ref) : 0; # - is del, + is ins
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
381 my $cmd;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
382 my ($num_flanking5, $num_flanking3);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
383 my ($region5, $region3);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
384 my ($start, $stop);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
385 my $highest_score = -9999;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
386 my $highest_position = -1;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
387 # note that donor and acceptor are very different beasts for MaxEntScan
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
388 if($splice_type eq "acceptor"){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
389 if($strand eq "-"){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
390 $pos += 2;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
391 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
392 else{
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
393 $pos -= 2;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
394 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
395 $cmd = "score3.pl";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
396 $num_flanking5 = 18;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
397 $num_flanking3 = 4;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
398 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
399 else{ # donor
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
400 $cmd = "score5.pl";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
401 $num_flanking5 = 3;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
402 $num_flanking3 = 5;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
403 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
404 # when var_offset is defined, we are looking only at the original splice site so no need to look in the general region
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
405 $region5 = defined $var_offset ? 0 : $num_flanking5; # when var_offset is defined, we are looking only at the original splice site so no need to look in the general region
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
406 $region3 = defined $var_offset ? 0 : $num_flanking3;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
407 if($strand eq "-"){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
408 $start = $pos - $num_flanking3 - $region3;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
409 $stop = $pos + $num_flanking5 + $region5 + ($indel_size < 0 ? -1*$indel_size:0);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
410 #print STDERR "Fetching $chr:$start-$stop ($pos - $num_flanking3 - $region3)\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
411 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
412 else{
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
413 $start = $pos - $num_flanking5 - $region5;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
414 $stop = $pos + $num_flanking3 + $region3 + ($indel_size < 0 ? -1*$indel_size:0); # add extra bases if a del so we still have 23 or 9 bases as the case may be
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
415 #print STDERR "Fetching $chr:$start-$stop ($pos - $num_flanking5 - $region5)\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
416 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
417 if(defined $var_offset and (
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
418 $strand eq "+" and ($pos+$var_offset < $start or $pos+$var_offset > $stop) or
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
419 $strand eq "-" and ($pos-$var_offset < $start or $pos-$var_offset > $stop))){ # variant is outside of range we can detect affect on original splice site
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
420 return ($UNAFFECTED, -1)
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
421 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
422
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
423 my $seq = $fasta_index->fetch("$chr:$start-$stop");
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
424 my @subseqs;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
425 if(defined $var){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
426 if(defined $var_offset){ # substitution away from the site of interest
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
427 #print STDERR "Subbing in string of length ", length($seq), "near splice site $ref -> $var at ", ($strand eq "-" ? "$region3+$num_flanking3-$var_offset":"$region5+$num_flanking5+$var_offset"), "\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
428 substr($seq, ($strand eq "-" ? $region3+$num_flanking3-$var_offset:$region5+$num_flanking5+$var_offset), length($ref)) = $var;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
429 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
430 else{ # substitution at the site of interest
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
431 substr($seq, ($strand eq "-" ? $region3+$num_flanking3:$region5+$num_flanking5), length($ref)) = $var;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
432 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
433 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
434 if($strand eq "-"){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
435 $seq = reverse($seq);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
436 $seq =~ tr/ACGTacgt/TGCAtgca/;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
437 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
438 # get all of the 23 or 9 base pair subsequences, depending on whether it's donors or acceptors we are looking for
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
439 for(my $i = 0; $i+$num_flanking5+$num_flanking3+1 <= length($seq); $i++){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
440 push @subseqs, substr($seq, $i, $num_flanking5+$num_flanking3+1);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
441 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
442 my $pid = open2(\*CHILD_OUT, \*CHILD_IN, "$cmd -");
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
443 $pid or die "Could not run $cmd: $!\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
444 print CHILD_IN join("\n",@subseqs);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
445 close(CHILD_IN);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
446 while(<CHILD_OUT>){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
447 chomp;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
448 # out is just "seq[tab]score"
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
449 my @F = split /\s+/, $_;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
450 if($F[1] > $highest_score){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
451 #print STDERR ">";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
452 $highest_score = $F[1];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
453 # since we printed the subseqs in order, $. (containing the line number of the prediction output) gives us the offset in the original seq
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
454 $highest_position = $start + ($strand eq "-" ? length($seq) - $num_flanking5 - $. : $num_flanking5 + $. -1)+(($. <= $num_flanking5+$region5 && $strand eq "+" or $. >= $num_flanking5+$region5 && $strand eq "-") ? -$indel_size : 0);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
455 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
456 #print STDERR $_,"\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
457 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
458 close(CHILD_OUT);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
459 waitpid($pid, 0);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
460
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
461 return ($highest_score, $highest_position);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
462 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
463
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
464 sub get_range_info($$$$){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
465 my ($chr2ranges, $chr, $start, $stop) = @_;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
466
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
467 if(not exists $chr2ranges->{$chr}){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
468 return {};
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
469 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
470 my $range_key = "$chr:$start:$stop";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
471
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
472 # Use a binary search to find the leftmost range of interest
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
473 my $left_index = 0;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
474 my $right_index = $#{$chr2ranges->{$chr}};
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
475 my $cursor = int($right_index/2);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
476 while($left_index != $right_index and $left_index != $cursor and $right_index != $cursor){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
477 # need to go left
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
478 if($chr2ranges->{$chr}->[$cursor]->[0] >= $start){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
479 $right_index = $cursor;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
480 $cursor = int(($left_index+$cursor)/2);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
481 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
482 # need to go to the right
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
483 elsif($chr2ranges->{$chr}->[$cursor]->[1] <= $stop){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
484 $left_index = $cursor;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
485 $cursor = int(($right_index+$cursor)/2);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
486 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
487 else{
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
488 $right_index = $cursor;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
489 $cursor--; # in the range, go left until not in the range any more
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
490 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
491 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
492
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
493 my %names;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
494 for (; $cursor <= $#{$chr2ranges->{$chr}}; $cursor++){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
495 my $range_ref = $chr2ranges->{$chr}->[$cursor];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
496 if($range_ref->[0] > $stop){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
497 last;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
498 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
499 elsif($range_ref->[0] <= $stop and $range_ref->[1] >= $start){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
500 $names{$range_ref->[2]}++;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
501 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
502 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
503
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
504 #print STDERR "Names for range $chr:$start-$stop are ", join("/", keys %names), "\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
505 return \%names;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
506 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
507
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
508 if(not -d $sift_dir){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
509 die "The specified SIFT directory $sift_dir does not exist\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
510 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
511
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
512 print STDERR "Reading in OMIM morbid map...\n" unless $quiet;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
513 die "Data file $morbidmap_file does not exist, aborting.\n" if not -e $morbidmap_file;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
514 my %gene2morbids;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
515 open(TAB, $morbidmap_file)
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
516 or die "Cannot open OMIM morbid map file $morbidmap_file for reading: $!\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
517 while(<TAB>){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
518 my @fields = split /\|/, $_;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
519 #print STDERR "got ", ($#fields+1), " fields in $_\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
520 #my @fields = split /\|/, $_;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
521 my $morbid = $fields[0];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
522 $morbid =~ s/\s*\(\d+\)$//; # get rid of trailing parenthetical number
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
523 my $omim_id = $fields[2];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
524 for my $genename (split /, /, $fields[1]){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
525 if(not exists $gene2morbids{$genename}){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
526 $gene2morbids{$genename} = "OMIM:$omim_id $morbid";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
527 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
528 else{
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
529 $gene2morbids{$genename} .= "; $morbid";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
530 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
531 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
532 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
533 close(TAB);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
534
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
535 print STDERR "Reading in interpro mappings...\n" unless $quiet;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
536 die "Data file $interpro_bed_file does not exist, aborting.\n" if not -e $interpro_bed_file;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
537 my %interpro_ids;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
538 open(TAB, $interpro_bed_file)
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
539 or die "Cannot open gene name BED file $interpro_bed_file for reading: $!\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
540 while(<TAB>){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
541 chomp;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
542 # format should be "chr start stop interpro desc..."
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
543 my @fields = split /\t/, $_;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
544 my $c = $fields[0];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
545 if(not exists $interpro_ids{$c}){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
546 $interpro_ids{$c} = [] unless exists $interpro_ids{$c};
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
547 $interpro_ids{"chr".$c} = [] unless $c =~ /^chr/;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
548 $interpro_ids{$1} = [] if $c =~ /^chr(\S+)/;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
549 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
550 my $dataref = [@fields[1..3]];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
551 push @{$interpro_ids{$c}}, $dataref;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
552 push @{$interpro_ids{"chr".$c}}, $dataref unless $c =~ /^chr/;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
553 push @{$interpro_ids{$1}}, $dataref if $c =~ /^chr(\S+)/;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
554 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
555 close(TAB);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
556 print STDERR "Sorting interpro matches per contig...\n" unless $quiet;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
557 for my $chr (keys %interpro_ids){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
558 $interpro_ids{$chr} = [sort {$a->[0] <=> $b->[0]} @{$interpro_ids{$chr}}];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
559 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
560
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
561 # {uc(gene_name) => space separated info}
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
562 print STDERR "Reading in tissue info...\n" unless $quiet;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
563 my %tissues;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
564 my %tissue_desc;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
565 open(TISSUE, $tissue_file)
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
566 or die "Cannot open $tissue_file for reading: $!\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
567 while(<TISSUE>){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
568 chomp;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
569 my @fields = split /\t/, $_;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
570 my $gname = uc($fields[1]);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
571 $tissue_desc{$gname} = $fields[2];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
572 next unless $#fields == 4;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
573 my @tis = split /;\s*/, $fields[4];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
574 pop @tis;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
575 if(@tis < 6){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
576 $tissues{$gname} = join(">", @tis);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
577 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
578 elsif(@tis < 24){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
579 $tissues{$gname} = join(">", @tis[0..5]).">...";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
580 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
581 else{
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
582 $tissues{$gname} = join(">", @tis[0..(int($#tis/5))]).">...";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
583 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
584 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
585 close(TISSUE);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
586
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
587 print STDERR "Loading pathway info...\n" unless $quiet;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
588 my %pathways;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
589 my %pathway_ids;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
590 open(PATHWAY, $pathway_file)
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
591 or die "Cannot open $pathway_file for reading: $!\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
592 while(<PATHWAY>){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
593 chomp;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
594 my @fields = split /\t/, $_;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
595 next unless @fields >= 4;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
596 for my $gname (split /\s+/, $fields[1]){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
597 $gname = uc($gname);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
598 if(not exists $pathways{$gname}){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
599 $pathway_ids{$gname} = $fields[2];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
600 $pathways{$gname} = $fields[3];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
601 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
602 else{
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
603 $pathway_ids{$gname} .= "; ".$fields[2];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
604 $pathways{$gname} .= "; ".$fields[3];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
605 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
606 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
607 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
608 close(PATHWAY);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
609
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
610 print STDERR "Loading SIFT indices...\n" unless $quiet;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
611 # Read in the database table list
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
612 my %chr_tables;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
613 open(DBLIST, "$sift_dir/bins.list")
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
614 or die "Cannot open $sift_dir/bins.list for reading: $!\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
615 while(<DBLIST>){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
616 chomp;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
617 my @fields = split /\t/, $_;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
618 if(not exists $chr_tables{$fields[1]}){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
619 $chr_tables{$fields[1]} = {};
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
620 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
621 my $chr = $fields[1];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
622 $chr = "chr$chr" unless $chr =~ /^chr/;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
623 $chr_tables{$chr}->{"$chr\_$fields[2]_$fields[3]"} = [$fields[2],$fields[3]];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
624 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
625 close(DBLIST);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
626
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
627 #Connect to database
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
628 my $db_gene = DBI->connect("dbi:SQLite:dbname=$sift_dir/Human_Supp.sqlite","","",{RaiseError=>1, AutoCommit=>1});
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
629
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
630 my $db_chr;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
631 my $fr_stmt; # retrieval for frameshift
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
632 my $snp_stmt; # retrieval for SNP
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
633 my $cur_chr = "";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
634 my $cur_max_pos;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
635
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
636 if(defined $predict_cryptic_splicings){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
637 if(not -e $predict_cryptic_splicings){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
638 die "Reference FastA file for splicing predictions, $predict_cryptic_splicings, does not exist.\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
639 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
640 if(not -e $predict_cryptic_splicings.".fai" and not -w dirname($predict_cryptic_splicings)){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
641 die "Reference FastA file for splicing predictions, $predict_cryptic_splicings, is not indexed, and the directory is not writable.\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
642 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
643 print STDERR "Loading reference FastA index...\n" unless $quiet;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
644 $fasta_index = Bio::DB::Sam::Fai->load($predict_cryptic_splicings);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
645 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
646
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
647 print STDERR "Annotating variants...\n" unless $quiet;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
648 # Assume contigs and positions in order
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
649 print STDERR "Processing file $hgvs_table_file...\n" unless $quiet;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
650 open(HGVS, $hgvs_table_file)
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
651 or die "Cannot open HGVS file $hgvs_table_file for reading: $!\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
652 # Output file for annotated snps and frameshifts
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
653 my $header = <HGVS>; # header
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
654 chomp $header;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
655 my @headers = split /\t/, $header;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
656 my ($chr_column, $pos_column, $ref_column, $alt_column, $alt_cnt_column, $tot_cnt_column, $ftype_column, $dbid_column, $maf_src_column, $maf_column, $internal_freq_column,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
657 $cdna_hgvs_column, $aa_hgvs_column, $transcript_column, $transcript_length_column, $strand_column, $zygosity_column, $splice_dist_column, $caveats_column, $phase_column,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
658 $srcs_column, $pvalue_column, $to_column, $genename_column, $rares_column, $rares_header);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
659 my %req_columns = (
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
660 "Chr" => \$chr_column,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
661 "DNA From" => \$pos_column,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
662 "DNA To" => \$to_column,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
663 "Gene Name" => \$genename_column,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
664 "Ref base" => \$ref_column,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
665 "Obs base" => \$alt_column,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
666 "Variant Reads" => \$alt_cnt_column,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
667 "Total Reads" => \$tot_cnt_column,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
668 "Feature type" => \$ftype_column,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
669 "Variant DB ID" => \$dbid_column,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
670 "Pop. freq. source" => \$maf_src_column,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
671 "Pop. freq." => \$maf_column,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
672 "Transcript HGVS" => \$cdna_hgvs_column,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
673 "Protein HGVS" => \$aa_hgvs_column,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
674 "Selected transcript" => \$transcript_column,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
675 "Transcript length" => \$transcript_length_column,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
676 "Strand" => \$strand_column,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
677 "Zygosity" => \$zygosity_column,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
678 "Closest exon junction (AA coding variants)" => \$splice_dist_column,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
679 "Caveats" => \$caveats_column,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
680 "Phase" => \$phase_column,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
681 "P-value" => \$pvalue_column);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
682 &load_header_columns(\%req_columns, \@headers);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
683 for(my $i = 0; $i <= $#headers; $i++){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
684 # Optional columns go here
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
685 if($headers[$i] eq "Sources"){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
686 $srcs_column = $i;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
687 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
688 elsif($headers[$i] eq "Internal pop. freq."){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
689 $internal_freq_column = $i;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
690 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
691 elsif($headers[$i] =~ /^Num rare variants/){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
692 $rares_column = $i;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
693 $rares_header = $headers[$i];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
694 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
695 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
696 open(OUT, ">$outfile")
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
697 or die "Cannot open $outfile for writing: $!\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
698 print OUT join("\t", "Chr", "DNA From", "DNA To", "Ref base", "Obs base", "% ref read","% variant read",
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
699 "Variant Reads", "Total Reads", "Feature type","Variant type", "Variant context", "Variant DB ID", "Pop. freq. source",
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
700 "Pop. freq."), (defined $internal_freq_column ? "\tInternal pop. freq.\t" : "\t"), join("\t", "SIFT Score", "Polyphen2 Score", "Polyphen2 call", "GERP score",
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
701 "Gene Name", "Transcript HGVS",
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
702 "Protein HGVS", "Zygosity","Closest exon junction (AA coding variants)","Splicing alteration potential","Splicing alteration details",
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
703 "OMIM Morbid Map","Tissue expression", "Pathways", "Pathway refs",
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
704 "Gene Function", "Spanning InterPro Domains", "P-value", "Caveats", "Phase",
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
705 "Selected transcript", "Transcript length", "Strand"),
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
706 (defined $rares_column ? "\t$rares_header" : ""),
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
707 (defined $srcs_column ? "\tSources" : ""), "\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
708 while(<HGVS>){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
709 chomp;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
710 my @fields = split /\t/, $_, -1;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
711 my $mode;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
712 my $protein_hgvs = $fields[$aa_hgvs_column];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
713 my $refSeq = $fields[$ref_column];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
714 my $newBase = $fields[$alt_column];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
715 my $splicing_potential = "NA";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
716 my $splicing_details = "NA";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
717 if(defined $predict_cryptic_splicings){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
718 my $distance = $fields[$splice_dist_column]; # only for coding variants
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
719 my $site_type;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
720 if($distance eq "NA"){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
721 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
722 elsif($distance){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
723 $site_type = $distance < 0 ? "donor" : "acceptor"; # exon context
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
724 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
725 else{
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
726 if($fields[$cdna_hgvs_column] =~ /\d+([\-+]\d+)/){ # get from the cDNA HGVS otherwise
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
727 $distance = $1;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
728 $distance =~ s/^\+//;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
729 $site_type = $distance > 0 ? "donor" : "acceptor"; # intron context
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
730 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
731 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
732 # only do splicing predictions for novel or rare variants, since these predictions are expensive
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
733 if($distance and $distance ne "NA" and ($fields[$maf_column] eq "NA" or $fields[$maf_column] <= 0.01)){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
734 #print STDERR "Running prediction for rare variant ($fields[3], MAF=$fields[14]): $fields[5], $fields[6], $refSeq, $newBase, $fields[4], $distance, $site_type\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
735 ($splicing_potential, $splicing_details) = &predict_splicing($fields[$chr_column], $fields[$pos_column], $refSeq, $newBase, $fields[$strand_column], $distance, $site_type);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
736 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
737 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
738 my $lookupBase;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
739 if($fields[$strand_column] eq "-"){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
740 if($refSeq ne "NA"){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
741 $refSeq = reverse($refSeq);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
742 $refSeq =~ tr/ACGTacgt/TGCAtgca/;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
743 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
744 if($newBase ne "NA"){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
745 $newBase = reverse($newBase);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
746 $newBase =~ tr/ACGTacgt/TGCAtgca/;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
747 $newBase =~ s(TN/)(NA/);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
748 $newBase =~ s(/TN)(/NA);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
749 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
750 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
751 if($fields[$cdna_hgvs_column] =~ /^g\..*?(?:ins|delins|>)([ACGTN]+)$/){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
752 $mode = "snps";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
753 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
754 elsif($fields[$cdna_hgvs_column] =~ /^g\..*?del([ACGTN]+)$/){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
755 $mode = "snps";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
756 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
757 elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+_\d+(?:[\-+]\d+)?ins([ACGTN]+)$/ or $fields[$cdna_hgvs_column] =~ /^c\.\d+(?:[\-+]\d+)?_\d+ins([ACGTN]+)$/){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
758 if(not length($1)%3){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
759 $mode = "snps";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
760 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
761 else{
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
762 $mode = "frameshifts";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
763 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
764 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
765 elsif($fields[$cdna_hgvs_column] =~ /^c\.(\d+)(?:[\-+]\d+)?_(\d+)(?:[\-+]\d+)?delins([ACGTN]+)$/){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
766 if(not (length($3)-$2+$1-1)%3){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
767 $mode = "snps";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
768 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
769 else{
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
770 $mode = "frameshifts";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
771 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
772 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
773 elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+[+\-]\d+_\d+[+\-]\d+delins([ACGTN]+)$/ or
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
774 $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?_[\-*]\d+(?:[\-+]\d+)?del([ACGTN]*)$/ or
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
775 $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?del([ACGTN]*)$/ or
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
776 $fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+del([ACGTN]*)$/ or
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
777 $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?_[\-*]\d+(?:[\-+]\d+)?(?:ins|delins)?([ACGTN]+)$/){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
778 $mode = "snps";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
779 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
780 elsif($fields[$cdna_hgvs_column] =~ /^c\.([-*]?\d+)del(\S+)/){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
781 my $dist = $1;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
782 $dist =~ s/^\*//;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
783 if(abs($dist) < 4){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
784 $mode = "snps";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
785 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
786 else{
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
787 $mode = "frameshifts";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
788 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
789 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
790 elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+_\d+[\-+]\d+ins(\S*)/ or
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
791 $fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+_\d+[\-+]\d+del(\S*)/){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
792 $mode = "snps";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
793 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
794 elsif($fields[$cdna_hgvs_column] =~ /^c\.(-?\d+)_(\d+)(?:\+\d+)?del(\S+)/ or
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
795 $fields[$cdna_hgvs_column] =~ /^c\.(\d+)(?:\-\d+)?_(\d+)del(\S+)/){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
796 if(not ($2-$1+1)%3){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
797 $mode = "snps";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
798 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
799 else{
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
800 $mode = "frameshifts";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
801 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
802 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
803 elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+_\d+ins([ACGTN]+)$/){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
804 if(not length($1)%3){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
805 $mode = "snps";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
806 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
807 else{
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
808 $mode = "frameshifts";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
809 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
810 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
811 # special case of shifted start codon
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
812 elsif($fields[$cdna_hgvs_column] =~ /^c\.-1_1+ins([ACGTN]+)$/ or
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
813 $fields[$cdna_hgvs_column] =~ /inv$/){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
814 $mode = "snps";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
815 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
816 elsif($fields[$cdna_hgvs_column] =~ /[ACGTN]>([ACGTN])$/){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
817 # todo: handle multiple consecutive substitutions, will have to run polyphen separately
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
818 $lookupBase = $newBase;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
819 if($fields[$strand_column] eq "-"){ # revert to positive strand variant for SIFT index
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
820 $lookupBase =~ tr/ACGTacgt/TGCAtgca/;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
821 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
822 $mode = "snps";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
823 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
824 else{
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
825 $mode = "snps";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
826 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
827 if($fields[$chr_column] ne $cur_chr){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
828 #Prepared DB connection on per-chromosome basis
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
829 $cur_chr = $fields[$chr_column];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
830 $cur_chr =~ s/^chr//;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
831 $db_chr = DBI->connect("dbi:SQLite:dbname=$sift_dir/Human_CHR$cur_chr.sqlite",
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
832 "",
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
833 "",
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
834 { RaiseError => 1, AutoCommit => 1 }) unless not -e "$sift_dir/Human_CHR$cur_chr.sqlite";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
835
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
836 $cur_max_pos = -1;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
837 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
838 if($fields[$pos_column] =~ /^\d+$/ and $cur_max_pos < $fields[$pos_column]){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
839 undef $fr_stmt;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
840 undef $snp_stmt;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
841 my $c = $fields[$chr_column];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
842 $c = "chr$c" if $c !~ /^chr/;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
843 for my $chr_table_key (keys %{$chr_tables{$c}}){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
844 my $chr_table_range = $chr_tables{$c}->{$chr_table_key};
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
845 if($chr_table_range->[0] <= $fields[$pos_column] and $chr_table_range->[1] >= $fields[$pos_column]){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
846 $fr_stmt = $db_chr->prepare("select SCORE".
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
847 " from $chr_table_key where COORD1 = ?");
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
848 $snp_stmt = $db_chr->prepare("select SCORE".
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
849 " from $chr_table_key where COORD1 = ? AND NT2 = ?");
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
850 $cur_max_pos = $chr_table_range->[1];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
851 last;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
852 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
853 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
854 if(not defined $fr_stmt and not $quiet){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
855 warn "Got request for position not in range of SIFT ($c:$fields[$pos_column])\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
856 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
857 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
858
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
859 my @cols;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
860 if(not $fr_stmt){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
861 @cols = ("NA");
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
862 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
863 elsif($mode eq "frameshifts"){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
864 $fr_stmt->execute($fields[$pos_column]);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
865 @cols = $fr_stmt->fetchrow_array();
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
866 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
867 else{
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
868 $snp_stmt->execute($fields[$pos_column]-1, $lookupBase);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
869 @cols = $snp_stmt->fetchrow_array();
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
870 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
871 # See if the change is in a CDS, and if it's non-synonymous
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
872 my ($type, $location) = ("silent", "intron");
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
873 my $start = $fields[$pos_column];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
874 my $stop = $fields[$to_column];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
875 my $interpro = get_range_info(\%interpro_ids, $fields[$chr_column], $start, $stop);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
876 my $domains = join("; ", sort keys %$interpro);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
877
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
878 my $sift_score = "NA";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
879 my $variant_key = "NA";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
880 my $transcript_hgvs = $fields[$cdna_hgvs_column];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
881 if($transcript_hgvs =~ /\.\d+[ACGTN]>/ or $transcript_hgvs =~ /\.\d+(?:_|del|ins)/ or $transcript_hgvs =~ /[+\-]\?/){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
882 $location = "exon";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
883 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
884 elsif($transcript_hgvs =~ /\.\d+[\-+](\d+)/){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
885 if($1 < 21){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
886 $type = "splice";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
887 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
888 else{
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
889 $type = "intronic";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
890 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
891 $location = "splice site";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
892 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
893 elsif($transcript_hgvs =~ /\.\*\d+/){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
894 $location = "3'UTR";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
895 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
896 elsif($transcript_hgvs =~ /\.-\d+/){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
897 $location = "5'UTR";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
898 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
899 if($mode eq "frameshifts"){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
900 $type = "frameshift";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
901 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
902 else{
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
903 if(not defined $protein_hgvs){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
904 $type = "unknown";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
905 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
906 elsif($protein_hgvs eq "NA"){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
907 $type = "non-coding";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
908 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
909 elsif($protein_hgvs =~ /=$/){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
910 $type = "silent";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
911 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
912 elsif($protein_hgvs =~ /\d+\*/){ # nonsense
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
913 $type = "nonsense";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
914 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
915 elsif($protein_hgvs =~ /ext/){ # nonsense
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
916 $type = "nonstop";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
917 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
918 elsif($protein_hgvs eq "p.0?" or $protein_hgvs eq "p.?"){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
919 $type = "unknown";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
920 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
921 else{
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
922 $type = "missense";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
923 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
924
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
925 $sift_score = $cols[0] || "NA";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
926
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
927 # Record the need to fetch VCF polyphen and gerp data later (en masse, for efficiency)
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
928 # Use newBase instead of lookupBase for PolyPhen since the orientation is always relative to the transcript
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
929 $variant_key = get_variant_key($fields[$chr_column], $fields[$pos_column], $fields[$ref_column], $newBase);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
930 #print STDERR "Variant key is $variant_key\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
931 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
932 my $transcript_type = $fields[$ftype_column];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
933 my $transcript_length = $fields[$transcript_length_column];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
934 my $transcript_id = $fields[$transcript_column]; # sift score is derived from this transcript
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
935 my $transcript_strand = $fields[$strand_column];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
936 my $zygosity = $fields[$zygosity_column];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
937 my $rsID = $fields[$dbid_column];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
938 my $popFreq = $fields[$maf_column];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
939 my $internalFreq;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
940 $internalFreq = $fields[$internal_freq_column] if defined $internal_freq_column;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
941 my $popFreqSrc = $fields[$maf_src_column];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
942
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
943 # Get the gene name and omim info
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
944 my %seen;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
945 my @gene_names = split /; /, $fields[$genename_column];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
946 my $morbid = join("; ", grep {/\S/ and not $seen{$_}++} map({$gene2morbids{$_} or ""} @gene_names));
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
947 my $tissue = join("; ", grep {/\S/ and not $seen{$_}++} map({$tissues{$_} or ""} @gene_names));
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
948 my $function = join("; ", grep {/\S/ and not $seen{$_}++} map({$tissue_desc{$_} or ""} @gene_names));
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
949 my $pathways = join("; ", grep {/\S/ and not $seen{$_}++} map({$pathways{$_} or ""} @gene_names));
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
950 my $pathway_ids = join("; ", grep {/\S/ and not $seen{$_}++} map({$pathway_ids{$_} or ""} @gene_names));
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
951
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
952 # It may be that the calls and freqs are multiple concatenated values, e.g. "X; Y"
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
953 my @tot_bases = split /; /,$fields[$tot_cnt_column];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
954 my @var_bases = split /; /,$fields[$alt_cnt_column];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
955 my (@pct_nonvar, @pct_var);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
956 for (my $i = 0; $i <= $#tot_bases; $i++){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
957 push @pct_nonvar, $tot_bases[$i] =~ /^(NA|0)$/ ? $tot_bases[$i] : int(($tot_bases[$i]-$var_bases[$i])/$tot_bases[$i]*100+0.5);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
958 push @pct_var, $tot_bases[$i] =~ /^(NA|0)$/ ? $tot_bases[$i] : int($var_bases[$i]/$tot_bases[$i]*100+0.5);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
959 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
960
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
961 push @lines, [
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
962 $fields[$chr_column], # chr
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
963 $fields[$pos_column], # pos
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
964 $fields[$to_column],
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
965 $fields[$ref_column],
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
966 $fields[$alt_column],
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
967 join("; ", @pct_nonvar), # pct ref
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
968 join("; ", @pct_var), # pct var
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
969 $fields[$alt_cnt_column], # num var
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
970 $fields[$tot_cnt_column], # num reads
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
971 $transcript_type, # protein_coding, pseudogene, etc.
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
972 $type,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
973 $location,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
974 $rsID,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
975 $popFreqSrc,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
976 (defined $internal_freq_column ? ($popFreq,$internalFreq) : ($popFreq)),
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
977 $sift_score,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
978 $variant_key, # 16th field
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
979 join("; ", @gene_names),
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
980 $transcript_hgvs,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
981 $protein_hgvs,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
982 $zygosity,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
983 $fields[$splice_dist_column], # distance to exon edge
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
984 $splicing_potential,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
985 $splicing_details,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
986 $morbid,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
987 $tissue,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
988 $pathways,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
989 $pathway_ids,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
990 $function,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
991 $domains,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
992 $fields[$pvalue_column],
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
993 $fields[$caveats_column], # caveats
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
994 (defined $rares_column ? ($fields[$rares_column],$transcript_id) : ($transcript_id)),
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
995 $transcript_length,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
996 $transcript_strand,
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
997 (defined $srcs_column ? ($fields[$phase_column],$fields[$srcs_column]) : ($fields[$phase_column]))];
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
998 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
999
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1000 print STDERR "Adding Polyphen2 and GERP info...\n" unless $quiet;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1001 # retrieve the polyphen and gerp info en masse for each chromosome, as this is much more efficient
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1002 my $vkey_col = 16+(defined $internal_freq_column ? 1 : 0);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1003 for my $line_fields (@lines){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1004 # expand the key into the relevant MAF values
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1005 $line_fields->[$vkey_col] = join("\t", polyphen_gerp_info($gerp_dir,$polyphen_file,$line_fields->[$vkey_col])); # fields[$vkey_col] is variant key
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1006 print OUT join("\t", @{$line_fields}), "\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1007 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1008 close(OUT);
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1009 unlink $tmpfile if defined $tmpfile;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1010
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1011 sub load_header_columns{
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1012 my ($reqs_hash_ref, $headers_array_ref) = @_;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1013 my %unfulfilled;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1014 for my $field_name (keys %$reqs_hash_ref){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1015 $unfulfilled{$field_name} = 1;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1016 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1017 for(my $i = 0; $i <= $#{$headers_array_ref}; $i++){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1018 for my $req_header_name (keys %$reqs_hash_ref){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1019 if($req_header_name eq $headers_array_ref->[$i]){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1020 ${$reqs_hash_ref->{$req_header_name}} = $i;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1021 delete $unfulfilled{$req_header_name};
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1022 last;
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1023 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1024 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1025 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1026 if(keys %unfulfilled){
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1027 die "Aborting. Could not find headers in the input file for the following required fields: ", join(", ", sort keys %unfulfilled), "\n";
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1028 }
9977d1935a07 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1029 }