annotate vcf2hgvs_table @ 0:7cdd13ff182a default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 15:49:28 -0600
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1 #!/usr/bin/env perl
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
3 BEGIN{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
4 my $prog_dir = `dirname $0`;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
5 chomp $prog_dir;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
6 push @INC, $prog_dir; # so DisjointSets.pm can be found no matter the working directory
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
7 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
8
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
9 use DisjointSets; # homebrew module
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
10 use Bio::DB::Sam; # for FastA reference pulls
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
11 use Bio::SeqUtils;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
12 use Bio::Tools::CodonTable;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
13 use Statistics::Zed;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
14 use Getopt::Long;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
15 use Set::IntervalTree;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
16 use strict;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
17 use warnings;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
18 use vars qw($min_prop $zed $codonTable $default_transl_table %transl_except %internal_prop %dbsnp_info %chr2variant_locs %chr2dbsnp_vcf_lines %chr2internal_vcf_lines %chr2caveats %chr2phase @snvs $fasta_index $max_args $quiet);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
19
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
20 if(@ARGV == 1 and $ARGV[0] eq "-v"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
21 print "Version 1.0\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
22 exit;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
23 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
24
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
25 #$max_args = `getconf ARG_MAX`; # largest number of args you can send to a system command (enviroment included, see limits.h)
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
26 #chomp $max_args;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
27 $max_args = 4096; # if not defined $max_args or $max_args < 1; # the minimum since System V
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
28 $max_args -= 50;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
29
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
30 # find out if a variant appears in the user provided data
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
31 sub internal_prop($$$$){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
32 my ($chr,$pos,$ref,$variant) = @_;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
33
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
34 my $key = "$chr:$pos:$ref:$variant";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
35 if(exists $internal_prop{$key}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
36 return $internal_prop{$key};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
37 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
38
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
39 #print STDERR "Checking if internal_prop for $key exists: ";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
40 if(exists $chr2internal_vcf_lines{$chr}->{$pos}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
41 for(@{$chr2internal_vcf_lines{$chr}->{$pos}}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
42 my @fields = split /\t/, $_;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
43 if($pos == $fields[1] and length($fields[3]) == length($ref) and $fields[4] eq $variant){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
44 #print STDERR "yes\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
45 if(/MAF=(\d\.\d+)/){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
46 $internal_prop{$key} = $1; # change from percent to proportion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
47 return $1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
48 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
49 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
50 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
51 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
52 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
53 #print STDERR "no\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
54 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
55
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
56 $internal_prop{$key} = "NA";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
57 return "NA";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
58 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
59
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
60 # find out if a variant appears in the NCBI's dbSNP
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
61 sub dbsnp_info($$$$){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
62 my ($chr,$pos,$ref,$variant) = @_;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
63
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
64 my $key = "$chr:$pos:$ref:$variant";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
65 if(exists $dbsnp_info{$key}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
66 return @{$dbsnp_info{$key}};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
67 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
68
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
69 if(exists $chr2dbsnp_vcf_lines{$chr}->{$pos}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
70 #print STDERR "Checking existing SNP data for $chr:$pos -> ", join("\n", @{$chr2dbsnp_vcf_lines{$chr}->{$pos}}), "\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
71 for(@{$chr2dbsnp_vcf_lines{$chr}->{$pos}}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
72 my @fields = split /\t/, $_;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
73 for my $var (split /,/, $fields[4]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
74 # Allows for different reference seqs between dbSNP and input, assuming patches only
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
75 if(length($fields[3]) == length($ref) and ($var eq $variant or $ref eq $var and $variant eq $fields[3])){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
76 my ($freq, $subpop) = ("","");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
77 $freq = $1 if $fields[7] =~ /(?:\A|;)MMAF=(0\.\d+)(?:;|\Z)/;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
78 $subpop = $1 if $fields[7] =~ /(?:\A|;)MMAF_SRC=(\S+?)(?:;|\Z)/;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
79 $dbsnp_info{$key} = [$subpop, $freq || "NA", $fields[2]];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
80 return @{$dbsnp_info{$key}};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
81 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
82 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
83 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
84 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
85 $dbsnp_info{$key} = ["novel", "NA", "NA"];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
86 return @{$dbsnp_info{$key}};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
87 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
88
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
89 sub record_snv{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
90 my $line = join("", @_);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
91 push @snvs, $line;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
92
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
93 my @fields = split /\t/, $line;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
94 my $prop_info_key = $fields[9];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
95 my ($chr,$pos,$ref,$variant) = split /:/, $prop_info_key;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
96 $chr2variant_locs{$chr} = {} unless exists $chr2variant_locs{$chr};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
97 return unless $ref; # ref not defined for CNVs
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
98 # Need to grab whole range for MNPs
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
99 for(my $i = 0; $i < length($ref); $i++){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
100 $chr2variant_locs{$chr}->{$pos+$i} = 1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
101 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
102 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
103
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
104 sub retrieve_vcf_lines($$$){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
105 my ($dbsnp_file, $internal_snp_file, $chr) = @_;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
106
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
107 my (%dbsnp_lines, %internal_snp_lines);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
108
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
109 if(not defined $dbsnp_file or not exists $chr2variant_locs{$chr}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
110 return ({}, {}, {}, {}); # no data requested for this chromosome
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
111 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
112
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
113 # build up the request
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
114 my @tabix_regions;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
115 my @var_locs = keys %{$chr2variant_locs{$chr}};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
116 # sort by variant start location
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
117 for my $var_loc (sort {$a <=> $b} @var_locs){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
118 push @tabix_regions, $chr.":".$var_loc."-".$var_loc;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
119 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
120 for(my $i = 0; $i <= $#tabix_regions; $i += $max_args){ # chunkify tabix request if too many for the system to handle
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
121 my $end = $i + $max_args > $#tabix_regions ? $#tabix_regions : $i + $max_args;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
122 my $regions = "'".join("' '", @tabix_regions[$i..$end])."'";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
123 # From file is very slow for some reason
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
124 #my $regions_file = "/tmp/vcf2hgvs_$$.bed";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
125 #open(REQ_BED, ">$regions_file")
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
126 # or die "Cannot open $regions_file for writing: $!\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
127 #print REQ_BED join("\n", @tabix_regions), "\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
128 #close(REQ_BED);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
129
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
130 # retrieve the data
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
131 die "Cannot find dbSNP VCF file $dbsnp_file\n" if not -e $dbsnp_file;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
132
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
133 open(VCF, "tabix $dbsnp_file $regions |")
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
134 or die "Cannot run tabix on $dbsnp_file (args ".substr($regions, 0, length($regions)>100? 100 : length($regions))."): $!\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
135 while(<VCF>){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
136 #if(/^(\S+\t(\d+)(?:\t\S+){6})/ and grep {$_ eq $2} @var_locs){ # take only main columns to save room, if possible
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
137 if(/^(\S+\t(\d+)(?:\t\S+){6})/ and exists $chr2variant_locs{$chr}->{$2}){ # take only main columns to save room, if possible
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
138 $dbsnp_lines{$2} = [] unless exists $dbsnp_lines{$2};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
139 push @{$dbsnp_lines{$2}}, $1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
140 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
141 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
142 close(VCF);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
143
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
144 if($internal_snp_file){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
145 die "Cannot find internal VCF file $internal_snp_file\n" if not -e $internal_snp_file;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
146 open(VCF, "tabix $internal_snp_file $regions |")
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
147 or die "Cannot run tabix on $internal_snp_file: $!\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
148 while(<VCF>){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
149 #if(/^(\S+\t(\d+)(?:\t\S+){6})/ and grep {$_ eq $2} @var_locs){ # take only main columns to save room, if possible
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
150 if(/^(\S+\t(\d+)(?:\t\S+){5})/ and exists $chr2variant_locs{$chr}->{$2}){ # take only main columns to save room, if possible
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
151 $internal_snp_lines{$2} = [] unless exists $internal_snp_lines{$2};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
152 push @{$internal_snp_lines{$2}}, $1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
153 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
154 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
155 close(VCF);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
156 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
157 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
158
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
159 #unlink $regions_file;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
160
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
161 return (\%dbsnp_lines, \%internal_snp_lines);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
162 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
163
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
164 sub prop_info_key{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
165 my($chr,$pos,$ref,$variant,$exon_edge_dist) = @_;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
166
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
167 $chr =~ s/^chr//;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
168 if($chr eq "M"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
169 $chr = "MT"; # NCBI uses different name for mitochondrial chromosome
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
170 $pos-- if $pos >= 3107; # also, doesn't keep the old positioning (historical)
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
171 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
172 return join(":", $chr,$pos,$ref,$variant, ($exon_edge_dist ? $exon_edge_dist : ""));
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
173 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
174
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
175 sub prop_info($$$){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
176 my($snpfile,$internal_snps_file,$prop_info_key) = @_;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
177
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
178 my($chr,$pos,$ref,$variant) = split /:/, $prop_info_key;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
179
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
180 # is this the first call for this chromosome? If so, retrieve the VCF lines for it en masse
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
181 if(not exists $chr2dbsnp_vcf_lines{$chr}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
182 ($chr2dbsnp_vcf_lines{$chr}, $chr2internal_vcf_lines{$chr}) = retrieve_vcf_lines($snpfile,$internal_snps_file,$chr);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
183 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
184 my $internal_maf = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
185 if($internal_snps_file){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
186 $internal_maf = internal_prop($chr,$pos,$ref,$variant);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
187 $internal_maf = 0 if $internal_maf eq "NA";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
188 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
189
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
190 my @results = dbsnp_info($chr,$pos,$ref,$variant);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
191
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
192 # Not all entries have a proportion in dbSNP
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
193 return $internal_snps_file ? ($ref, $variant, @results, $internal_maf) : ($ref, $variant, @results);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
194 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
195
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
196 #offset a given HGVS nomenclature position (single position only) by a given number of bases
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
197 sub hgvs_plus($$){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
198 my ($hgvs, $offset) = @_;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
199 if($hgvs =~ /^(\S+)(-\d+)(.*)/){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
200 # all negative
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
201 if($2+$offset<0){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
202 return $1.($2+$offset).$3;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
203 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
204 # switches to positive, need to mod
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
205 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
206 return $1+($2+$offset);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
207 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
208 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
209 elsif($hgvs =~ /^(\S+)\+(\d+)(.*)/){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
210 # all positive
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
211 if($2+$offset>0){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
212 return $1."+".($2+$offset).$3;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
213 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
214 # switches to negative, need to mod
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
215 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
216 return $1+($2+$offset);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
217 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
218 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
219 elsif($hgvs =~ /^(-?\d+)(.*)/){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
220 # special case if offset spans -/+ since there is no position 0
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
221 if($1 < 0 and $1+$offset >= 0){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
222 $offset++;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
223 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
224 elsif($1 > 0 and $1+$offset <= 0){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
225 $offset--;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
226 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
227 return ($1+$offset).$2;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
228 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
229 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
230 die "Cannot convert $hgvs to a new offset ($offset), only single base position nomenclature is currently supported\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
231 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
232 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
233
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
234 # offset a given position by a given number of bases,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
235 # taking into account that if the new offset crosses the threshold in the last argument,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
236 # HGVS boundary nomenclature has to be introduced
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
237 sub hgvs_plus_exon($$$){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
238 my ($pos, $offset, $boundary) = @_;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
239
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
240 # special case if offset spans -/+ since there is no position 0
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
241 if($pos =~ /^(-?\d+)(.*)/){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
242 if($1 < 0 and $1+$offset >= 0){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
243 $offset++;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
244 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
245 elsif($1 > 0 and $1+$offset <= 0){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
246 $offset--;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
247 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
248 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
249 my $new_pos = $pos + $offset;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
250 if($new_pos > $boundary and $pos <= $boundary){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
251 # just moved into an intron 3'
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
252 $new_pos = $boundary."+".($new_pos-$boundary);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
253 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
254 elsif($new_pos < $boundary and $pos >= $boundary){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
255 # just moved into an intron 5'
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
256 $new_pos = $boundary.($new_pos-$boundary);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
257 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
258 return $new_pos;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
259 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
260
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
261 # given a nucleotide position, calculates the AA there (assumes coding region)
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
262 sub getCodonFromSeq($$$$){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
263 my ($chr_ref, $location, $frame_offset, $strand) = @_;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
264
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
265 my $codon;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
266 if($strand eq "+"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
267 $codon = substr($$chr_ref, $location-1-$frame_offset, 3);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
268 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
269 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
270 $codon = substr($$chr_ref, $location-3+$frame_offset, 3);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
271 $codon = reverse($codon);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
272 $codon =~ tr/ACGTacgt/TGCAtgca/;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
273 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
274 return $codon;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
275 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
276
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
277 sub getCodonFromSeqIndex($$$$){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
278 my ($chr, $location, $frame_offset, $strand) = @_;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
279
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
280 my $codon;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
281 if($strand eq "+"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
282 $codon = $fasta_index->fetch($chr.":".($location-$frame_offset)."-".($location-$frame_offset+2));
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
283 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
284 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
285 $codon = $fasta_index->fetch($chr.":".($location-2+$frame_offset)."-".($location+$frame_offset));
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
286 $codon = reverse($codon);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
287 $codon =~ tr/ACGTacgt/TGCAtgca/;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
288 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
289 return $codon;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
290 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
291
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
292 sub getAAFromSeq($$$$$){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
293 return $_[4]->translate(getCodonFromSeq($_[0], $_[1], $_[2], $_[3]));
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
294 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
295
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
296 sub getAAFromSeqIndex($$$$$){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
297 # convert codon to AA
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
298 if(exists $transl_except{"$_[0]:$_[1]"}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
299 return $transl_except{"$_[0]:$_[1]"};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
300 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
301 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
302 return $_[4]->translate(getCodonFromSeqIndex($_[0], $_[1], $_[2], $_[3]));
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
303 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
304 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
305
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
306 sub hgvs_protein{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
307 my ($chr, $location, $ref, $variant, $cdna_pos, $strand, $transl_table) = @_;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
308
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
309 if(substr($ref,0,1) eq substr($variant,0,1)){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
310 substr($ref,0,1) = "";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
311 substr($variant,0,1) = "";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
312 $location++;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
313 if($strand eq "-"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
314 $cdna_pos--;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
315 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
316 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
317 $cdna_pos++;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
318 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
319 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
320
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
321 if($cdna_pos !~ /^\d+/){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
322 die "Aborting: got illegal cDNA position ($cdna_pos) for protein HGVS conversion of position ",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
323 "$location, ref $ref, variant $variant. Please correct the program code.\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
324 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
325 # Get the correct frame for the protein translation, to know what codons are affected
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
326 my $aapos = int(($cdna_pos-1)/3)+1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
327
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
328 # does it destroy the start codon?
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
329 if($cdna_pos < 4){ # assumes animal codon usage
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
330 return "p.0?"; # indicates start codon missing, unsure of effect
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
331 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
332
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
333 my $table = $transl_table ne $default_transl_table ? # non standard translation table requested
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
334 Bio::Tools::CodonTable->new(-id=>$transl_table) : $codonTable;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
335
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
336 my $frame_offset = ($cdna_pos-1)%3;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
337 my $origAA = getAAFromSeqIndex($chr, $location, $frame_offset, $strand, $table);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
338 # take 100000 bp on either side for translation context of variant seq
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
339 my $five_prime_buffer = $location < 10000 ? $location-1 : 10000;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
340 my $mutSeq = $fasta_index->fetch($chr.":".($location-$five_prime_buffer)."-".($location+10000));
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
341
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
342 # substitute all of the immediately adjacent variants in phase with this one to get the correct local effect
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
343 substr($mutSeq, $five_prime_buffer, length($ref)) = $variant;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
344
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
345 # does it cause a frameshift?
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
346 my $length_diff = length($variant)-length($ref);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
347 if($length_diff%3){ # insertion or deletion not a multiple of three
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
348 my $fs_codon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1, $frame_offset, $strand);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
349 my $ext = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
350 my $newAA;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
351 do{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
352 $ext++;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
353 # The "NA"s below make it so that we don't pick up any translation exceptions from the original reference annotation
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
354 if($strand eq "+"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
355 $newAA = getAAFromSeq(\$mutSeq, $five_prime_buffer+1+$ext*3, $frame_offset, $strand, $table);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
356 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
357 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
358 $newAA = getAAFromSeq(\$mutSeq, $five_prime_buffer+1-$ext*3, $frame_offset, $strand, $table);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
359 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
360 } while($newAA ne "*");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
361
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
362 return "p.".$origAA.$aapos.$table->translate($fs_codon)."fs*$ext";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
363 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
364
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
365 # does it cause a stop codon to be lost?
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
366 if($origAA eq "*"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
367 my $stopChangeCodon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1, $frame_offset, $strand);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
368 # still a stop after the mutation (ignore translation exceptions)
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
369 if($table->is_ter_codon($stopChangeCodon)){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
370 return "p.*$aapos=";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
371 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
372 # calculate the new stop, assuming there aren't mutations downstream in candidate stop codons
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
373 my $ext = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
374 my $newCodon;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
375 do{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
376 if($strand eq "+"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
377 $newCodon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1+(++$ext*3), $frame_offset, $strand);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
378 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
379 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
380 $newCodon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1-(++$ext*3), $frame_offset, $strand);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
381 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
382 } while(not $table->is_ter_codon($newCodon));
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
383
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
384 return "p.*".$aapos.$table->translate($stopChangeCodon)."ext*".$ext;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
385 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
386
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
387 # if we get this far, it's a "regular" AA level change
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
388 my $origAAs = "";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
389 for(my $i = 0; $i < length($ref)+$frame_offset; $i+=3){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
390 my $oldAA = getAAFromSeqIndex($chr, $location+$i, $frame_offset, $strand, $table);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
391 if($strand eq "+"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
392 $origAAs .= $oldAA;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
393 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
394 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
395 $origAAs = $oldAA . $origAAs;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
396 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
397 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
398 my $newAAs = "";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
399 for(my $i = 0; $i < length($variant)+$frame_offset; $i+=3){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
400 # NA means we don't take translation exceptions from the original
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
401 my $newAA = getAAFromSeq(\$mutSeq, $five_prime_buffer+1+$i, $frame_offset, $strand, $table);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
402 if($strand eq "+"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
403 $newAAs .= $newAA;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
404 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
405 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
406 $newAAs = $newAA . $newAAs;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
407 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
408 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
409
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
410 # silent
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
411 if($origAAs eq $newAAs){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
412 return "p.".$origAAs.$aapos."=";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
413 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
414
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
415 # minimize the difference if there are leading or trailing AAs the same
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
416 my $delLength = length($ref);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
417 while(substr($newAAs, 0, 1) eq substr($origAAs, 0, 1)){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
418 $newAAs = substr($newAAs, 1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
419 $origAAs = substr($origAAs, 1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
420 $location+=3;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
421 $delLength-=3;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
422 $aapos++;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
423 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
424 while(substr($newAAs, -1) eq substr($origAAs, -1)){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
425 $newAAs = substr($newAAs, 0, length($newAAs)-1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
426 $origAAs = substr($origAAs, 0, length($origAAs)-1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
427 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
428
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
429 # insertion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
430 if(length($origAAs) == 0){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
431 my $insAAs = getAAFromSeqIndex($chr,$location-3,$frame_offset,$strand,$table).($aapos-1)."_".
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
432 getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
433 return "p.".$insAAs.$aapos."ins".$newAAs;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
434 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
435 # deletion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
436 elsif(length($newAAs) == 0){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
437 my $delAAs;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
438 if(length($origAAs) == 1){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
439 $delAAs = getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table).$aapos; # single AA deletion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
440 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
441 else{ # deleting a stretch
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
442 if($strand eq "+"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
443 my $endPoint = $location+$delLength-1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
444 $delAAs = getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table).$aapos."_".
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
445 getAAFromSeqIndex($chr,$endPoint,$frame_offset,$strand,$table).($aapos+int(($delLength-1)/3));
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
446 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
447 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
448 my $endPoint = $location-$delLength+1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
449 $delAAs = getAAFromSeqIndex($chr,$endPoint,$frame_offset,$strand,$table).($aapos-int(($delLength-1)/3))."_".
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
450 getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table).$aapos;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
451 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
452 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
453 return "p.".$delAAs."del";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
454 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
455 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
456 # substitution
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
457 if(length($origAAs) == 1 and length($newAAs) == 1){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
458 return "p.".$origAAs.$aapos.$newAAs;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
459 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
460 # indel
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
461 elsif(length($origAAs) != 1){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
462 # convert ref stretch into range syntax
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
463 if($strand eq "+"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
464 $origAAs = substr($origAAs, 0, 1).$aapos."_".substr($origAAs, -1).($aapos+length($origAAs)-1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
465 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
466 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
467 $origAAs = substr($origAAs, 0, 1).($aapos-length($origAAs)+1)."_".substr($origAAs, -1).$aapos;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
468 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
469 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
470 return "p.".$origAAs."delins".$newAAs;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
471 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
472 return ("NA", "");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
473 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
474
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
475 sub z2p{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
476 if(not defined $zed){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
477 $zed = new Statistics::Zed;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
478 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
479 my $p = $zed->z2p(value => $_[0]);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
480 return $p < 0.0000000001 ? 0 : $p;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
481 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
482 sub gq2p{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
483 return $_[0] > 200 ? 0 : 10**($_[0]/-10);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
484 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
485
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
486 my ($multi_phased, $min_depth, $flanking_bases, $dbsnp, $internal_snp, $genename_bed_file, $dir_1000G, $dir_esp6500, $min_pvalue, $mappability_file, $reference_file, $samtools_phasing_file, $exons_file, $input_file, $output_file, $cnv_file, $dgv_file, $which_chr, $enrichment_regions_file, $rare_variant_prop);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
487 $multi_phased = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
488 $min_depth = 2;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
489 $flanking_bases = 30;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
490 $min_pvalue = 0.01;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
491 $min_prop = 0.14;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
492 $rare_variant_prop = 0.05;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
493 $input_file = "-"; # STDIN by default
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
494 $output_file = "-"; # STDOUT by default
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
495 $default_transl_table = "1"; # assumes NCBI 'Standard' table, unless it is an argument to the script...
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
496 &GetOptions("d=i" => \$min_depth,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
497 "f=i" => \$flanking_bases,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
498 "s=s" => \$dbsnp,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
499 "t=s" => \$dir_1000G,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
500 "n=s" => \$dir_esp6500,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
501 "u=s" => \$internal_snp,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
502 "q" => \$quiet,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
503 "p=f" => \$min_pvalue,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
504 "h=f" => \$min_prop,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
505 "m=s" => \$mappability_file,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
506 "r=s" => \$reference_file,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
507 "z=s" => \$samtools_phasing_file,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
508 "e=s" => \$exons_file,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
509 "i=s" => \$input_file,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
510 "c=s" => \$cnv_file,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
511 "g=s" => \$dgv_file,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
512 "b=s" => \$genename_bed_file,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
513 "w=s" => \$which_chr,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
514 "o=s" => \$output_file,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
515 "a=i" => \$default_transl_table,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
516 "v=f" => \$rare_variant_prop,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
517 "x=s" => \$enrichment_regions_file); # if enrichment regions are specified, variants without a transcript model but in these ranges will be reported
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
518
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
519 if(($input_file ne "/dev/null" and not defined $reference_file) or
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
520 not defined $exons_file or
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
521 (defined $cnv_file and not defined $dgv_file)){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
522 die "Usage: $0 [-v(ersion)] [-q(uiet)] [-w(hich) contig_to_report (default is all)] [-d(epth of variant reads req'd) #] [-v(ariant max freq to count as rare)] [-f(lanking exon bases to report) #] [-p(robability of random genotype, maximum to report) 0.#]\n",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
523 " [-h(et proportion of variant reads, minimum to report) 0.#] [-c(opy number) variants_file.bed -g(enomic structural) variants_control_db.txt.gz] [-z file_containing_samtools_phase_output.txt]\n",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
524 " [-t(housand) genomes_integrated_vcfs_gz_dir] [-n ESP6500_dir] [-u(ser) specified_population.vcf.gz] [-m(appability) crg_file.bed]\n",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
525 " [-x enrichment_regions_file.bed] [-a(mino) acid translation table number from NCBI]\n",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
526 " [-i(nput) genotypes.vcf <-r(eference) sequence_file.fasta>] [-o(utput) hgvs_file.tsv] [-s(np) database_from_ncbi.vcf.gz]\n",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
527 " <-b(ed) file of named gene regions.bed> <-e(xons) file.gtf>\n\n",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
528 "Input gz files must be indexed with Tabix.\nDefault input is STDIN, default output is STDOUT. Note: if -c is specified, polyploidies are are assume to be proximal. Other defaults: -d 2, -v 0.05, -f 30, -p 0.01, -h 0.14 -a 1\nReference sequence is not strictly necessary if only CNV are being annotated.\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
529 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
530
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
531 print STDERR "Considering $flanking_bases flanking bases for variants as well\n" unless $quiet;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
532
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
533 $codonTable = new Bio::Tools::CodonTable(id => $default_transl_table);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
534
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
535 my %enrichment_regions;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
536 # Note, we assume the regions are non-overlapping
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
537 if(defined $enrichment_regions_file){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
538 print STDERR "Loading enrichment regions...\n" unless $quiet;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
539 open(BED, $enrichment_regions_file)
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
540 or die "Cannot open $enrichment_regions_file for reading: $!\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
541 while(<BED>){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
542 chomp;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
543 my @F = split /\t/, $_;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
544 $enrichment_regions{$F[0]} = [] if not exists $enrichment_regions{$F[0]};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
545 push @{$enrichment_regions{$F[0]}}, [$F[1], $F[2]];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
546 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
547 close(BED);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
548 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
549 for my $chr (keys %enrichment_regions){ # sort by start
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
550 $enrichment_regions{$chr} = [sort {$a->[0] <=> $b->[0]} @{$enrichment_regions{$chr}}];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
551 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
552
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
553 if(defined $reference_file){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
554 print STDERR "Scanning reference FastA info\n" unless $quiet;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
555 if(not -e $reference_file){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
556 die "Reference FastA file ($reference_file) does not exist.\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
557 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
558 if(not -e $reference_file.".fai" and not -w dirname($reference_file)){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
559 die "Reference FastA file ($reference_file) is not indexed, and the directory is not writable.\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
560 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
561 $fasta_index = Bio::DB::Sam::Fai->load($reference_file);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
562 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
563
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
564 my %chr2mappability;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
565 if(defined $mappability_file){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
566 print STDERR "Reading in mappability data\n" unless $quiet;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
567 my ($nmer) = $mappability_file =~ /(\d+).*?$/;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
568 die "Cannot determine nmer from nmer file name $mappability_file, aborting\n" unless $nmer;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
569 open(MAP, $mappability_file)
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
570 or die "Cannot open mappability data file $mappability_file for reading: $!\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
571 <MAP>; # header
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
572 while(<MAP>){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
573 next if /^#/;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
574 chomp;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
575 my @F = split /\t/, $_;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
576 my $x = int(1/$F[3]+0.5);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
577 $chr2mappability{$F[0]} = Set::IntervalTree->new() if not exists $chr2mappability{$F[0]};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
578 $chr2mappability{$F[0]}->insert("non-unique mapping region (x$x)", $F[1], $F[2]+$nmer-1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
579 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
580 close(MAP);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
581 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
582
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
583 # Is phasing data provided?
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
584 if(defined $samtools_phasing_file){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
585 print STDERR "Reading in phasing data\n" unless $quiet;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
586 open(PHASE, $samtools_phasing_file)
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
587 or die "Cannot open phasing data file $samtools_phasing_file for reading: $!\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
588 my $phase_range;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
589 while(<PHASE>){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
590 if(/^PS/){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
591 chomp;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
592 my @F = split /\t/, $_;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
593 $phase_range = "$F[2]-$F[3]";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
594 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
595 if(/^M[12]/){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
596 chomp;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
597 my @F = split /\t/, $_;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
598 #ignore strange cases where haplotype reference has no cases (weird samtools call)
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
599 next if $F[9] == 0 or $F[7] == 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
600 my $chr = $F[1];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
601 next if defined $which_chr and not $chr eq $which_chr;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
602 my $pos = $F[3];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
603 #print STDERR "Recording phase for $chr:$pos:$F[4] , $chr:$pos:$F[5] as A-$chr:$phase_range and B-$chr:$phase_range\n" if $pos == 12907379;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
604 if(($F[10]+$F[8])/($F[9]+$F[7]) >= $min_prop){ # error meets reporting threshold
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
605 $chr2caveats{"$chr:$pos"} .= "; " if exists $chr2caveats{"$chr:$pos"};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
606 $chr2caveats{"$chr:$pos"} .= "inconsistent haplotype phasing";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
607 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
608 else{ # appears to be a genuine phasing
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
609 $chr2phase{"$chr:$pos:$F[4]"} = "A-$chr:$phase_range"; # grouping for haplotype
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
610 $chr2phase{"$chr:$pos:$F[5]"} = "B-$chr:$phase_range"; # grouping for haplotype
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
611 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
612 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
613 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
614 close(PHASE);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
615 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
616
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
617 # Check the VCF file to see if contains phase data
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
618 open(VCFIN, $input_file)
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
619 or die "Cannot open $input_file for reading: $!\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
620 my $phase_chr = "";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
621 my @phase_dataA;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
622 my @phase_dataB;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
623 while(<VCFIN>){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
624 if(/^\s*(?:#|$)/){ # blank or hash comment
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
625 next;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
626 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
627 my @F = split /\t/, $_;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
628 next if exists $chr2caveats{"$F[0]:$F[1]"} and $chr2caveats{"$F[0]:$F[1]"} =~ /inconsistent haplotype phasing/;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
629 # | indicates phased
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
630 if($F[8] =~ m(^(\d+)\|(\d+):)){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
631 next if $1 eq $2; # not useful to us (actually would mess up phase combining later on), but is provided sometimes
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
632 # start of a phasing block
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
633 if($phase_chr eq ""){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
634 $phase_chr = $F[0];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
635 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
636 my @vars = split /,/, $F[4];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
637 if($1 > @vars){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
638 die "Invalid VCF file (line #$.): First haplotype listed as $1, but only ", scalar(@vars), " variants were provided (", join(",", @vars), "\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
639 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
640 if($2 > @vars){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
641 die "Invalid VCF file (line #$.): Second haplotype listed as $1, but only ", scalar(@vars), " variants were provided (", join(",", @vars), "\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
642 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
643 unshift @vars, $F[3];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
644 push @phase_dataA, [$F[1], $vars[$1]];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
645 push @phase_dataB, [$F[1], $vars[$2]];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
646 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
647 # non phased het call, ends any phasing block there might be
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
648 elsif($F[8] =~ m(^0/1)){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
649 # Did we just finish a phased block? If so, output it.
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
650 if(@phase_dataA > 1){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
651 my $phase_def = "G-$phase_chr:".$phase_dataA[0]->[0]."-".$phase_dataA[$#phase_dataA]->[0];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
652 for my $d (@phase_dataA){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
653 my ($p, $v) = @$d;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
654 if(exists $chr2phase{"$phase_chr:$p:$v"}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
655 $chr2phase{"$phase_chr:$p:$v"} .= ",$phase_def";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
656 $multi_phased ||= 1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
657 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
658 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
659 $chr2phase{"$phase_chr:$p:$v"} = $phase_def;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
660 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
661 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
662 $phase_def = "H-$phase_chr:".$phase_dataB[0]->[0]."-".$phase_dataB[$#phase_dataB]->[0];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
663 for my $d (@phase_dataB){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
664 my ($p, $v) = @$d;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
665 if(exists $chr2phase{"$phase_chr:$p:$v"}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
666 $chr2phase{"$phase_chr:$p:$v"} = ",$phase_def";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
667 $multi_phased ||= 1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
668 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
669 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
670 $chr2phase{"$phase_chr:$p:$v"} = $phase_def;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
671 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
672 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
673 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
674 if($phase_chr ne ""){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
675 $phase_chr = "";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
676 @phase_dataA = ();
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
677 @phase_dataB = ();
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
678 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
679 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
680 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
681
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
682 print STDERR "Reading in feature GTF data..." unless $quiet;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
683 my %feature_range; # chr => transcript_id => [[genomic_exon_start,genomic_exon_end,cdna_start_pos],...]
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
684 my %feature_intervaltree; # chr => transcript_id => [[genomic_exon_start,genomic_exon_end,cdna_start_pos],...]
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
685 my %feature_strand; # transcript_id => +|-
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
686 my $feature_count = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
687 my %feature_min;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
688 my %feature_max;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
689 my %feature_cds_min;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
690 my %feature_cds_max;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
691 my %feature_contig;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
692 my %feature_length;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
693 my %feature_type;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
694 my %feature_transl_table; # note alternate translation table usage
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
695 my %chr_read;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
696 open(GTF, $exons_file)
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
697 or die "Cannot open $exons_file for reading: $!\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
698 while(<GTF>){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
699 next if /^\s*#/;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
700 my @fields = split /\t/, $_;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
701 next if defined $which_chr and $fields[0] ne $which_chr and "chr$fields[0]" ne $which_chr and $fields[0] ne "chr$which_chr";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
702
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
703 if($fields[2] eq "exon" or $fields[2] eq "CDS"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
704 next unless $fields[$#fields] =~ /transcript_id \"(.*?)\"/o;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
705 my $parent = $1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
706 if(not $quiet and not exists $chr_read{$fields[0]}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
707 print STDERR " $fields[0]";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
708 $chr_read{$fields[0]} = 1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
709 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
710 if(not exists $feature_strand{$parent}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
711 $feature_strand{$parent} = $fields[6];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
712 $feature_contig{$parent} = $fields[0];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
713 if($fields[$#fields] =~ /transcript_type \"(.*?)\"/){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
714 $feature_type{$parent} = $1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
715 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
716 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
717 $feature_type{$parent} = "NA";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
718 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
719 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
720 if($fields[2] eq "CDS"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
721 #print STDERR "CDS value for $parent is $fields[2]..$fields[3]\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
722 if(not exists $feature_cds_min{$parent} or $fields[3] < $feature_cds_min{$parent}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
723 $feature_cds_min{$parent} = $fields[3];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
724 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
725 if(not exists $feature_cds_max{$parent} or $fields[4] > $feature_cds_max{$parent}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
726 $feature_cds_max{$parent} = $fields[4];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
727 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
728 if($fields[$#fields] =~ /transl_table \"(\d+)\"/){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
729 $feature_transl_table{$parent} = $1; #assume one translation table per CDS, which should be reasonable
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
730 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
731 while($fields[$#fields] =~ /transl_except \"pos:(\S+?),aa:(\S+?)\"/g){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
732 my $pos = $1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
733 my $new_aa = $2; # needs to change from three letter code to 1
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
734 if($new_aa =~ /^ter/i){ # can be funny so have special case (allows TERM, etc.)
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
735 $new_aa = "*";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
736 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
737 elsif(length($new_aa) == 3){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
738 $new_aa = Bio::SeqUtils->new()->seq3in($new_aa);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
739 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
740 if($pos =~ /^(\d+)\.\.(\d+)/){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
741 for my $p ($1..$2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
742 $transl_except{"$fields[0]:$p"} = $new_aa;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
743 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
744 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
745 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
746 $transl_except{"$fields[0]:$pos"} = $new_aa;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
747 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
748 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
749 next;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
750 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
751 if(not exists $feature_min{$parent} or $fields[3] < $feature_min{$parent}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
752 $feature_min{$parent} = $fields[3];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
753 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
754 if(not exists $feature_max{$parent} or $fields[4] > $feature_max{$parent}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
755 $feature_max{$parent} = $fields[4];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
756 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
757
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
758 $feature_count++;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
759 if(not exists $feature_range{$fields[0]}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
760 $feature_range{$fields[0]} = {}; # Chr => {parentID => [start,stop]}
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
761 $feature_intervaltree{$fields[0]} = Set::IntervalTree->new();
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
762 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
763 if(not exists $feature_range{$fields[0]}->{$parent}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
764 $feature_range{$fields[0]}->{$parent} = [];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
765 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
766 push @{$feature_range{$fields[0]}->{$parent}}, [$fields[3],$fields[4]];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
767 $feature_intervaltree{$fields[0]}->insert($parent, $fields[3], $fields[4]+1); # ranges need to have positive length for module to work properly
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
768 $feature_length{$parent} += $fields[4]-$fields[3]+1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
769 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
770 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
771 close(GTF);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
772 print STDERR "\nFound $feature_count exons on ", scalar(keys %feature_range), " contigs in the GTF file\n" unless $quiet;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
773
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
774 for my $contig (keys %feature_range){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
775 for my $parent (keys %{$feature_range{$contig}}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
776 # sort by subrange start
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
777 my @feature_ranges = sort {$a->[0] <=> $b->[0]} @{$feature_range{$contig}->{$parent}};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
778 $feature_range{$contig}->{$parent} = \@feature_ranges;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
779 $feature_range{"chr".$contig}->{$parent} = \@feature_ranges if not $contig =~ /^chr/;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
780 $feature_range{$1}->{$parent} = \@feature_ranges if $contig =~ /^chr(\S+)/;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
781 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
782 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
783
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
784 # Calculate the cDNA position of the leftmost (reference strand) base for each exon
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
785 for my $contig (keys %feature_range){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
786 for my $parent (keys %{$feature_range{$contig}}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
787 my @feature_ranges = @{$feature_range{$contig}->{$parent}};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
788 if($feature_strand{$parent} eq "-"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
789 # set up utr offset for correct CDS coordinates
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
790 my $feature_offset = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
791 for(my $i = $#feature_ranges; $i >= 0; $i--){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
792 last if not $feature_cds_max{$parent};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
793 # exon is completely 5' of the start
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
794 if($feature_ranges[$i]->[0] > $feature_cds_max{$parent}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
795 $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
796 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
797 # exon with the cds start
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
798 elsif($feature_ranges[$i]->[1] >= $feature_cds_max{$parent} and
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
799 $feature_ranges[$i]->[0] <= $feature_cds_max{$parent}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
800 $feature_offset += $feature_cds_max{$parent} - $feature_ranges[$i]->[1];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
801 last;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
802 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
803 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
804 die "The CDS for $parent (on negative strand) ends downstream ",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
805 "($feature_cds_max{$parent}) of the an exon",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
806 " (", $feature_ranges[$i]->[0], "), which is illogical. Please revise the GFF file provided.\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
807 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
808 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
809 for(my $i = $#feature_ranges; $i >= 0; $i--){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
810 $feature_offset += $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
811 $feature_ranges[$i]->[2] = $feature_offset-1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
812 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
813 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
814 else{ # positive strand
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
815 # set up utr offset for correct CDS coordinates
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
816 my $feature_offset = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
817 for(my $i = 0; $i <= $#feature_ranges; $i++){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
818 last if not $feature_cds_min{$parent};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
819 # All 5' utr exon
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
820 if($feature_ranges[$i]->[1] < $feature_cds_min{$parent}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
821 $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
822 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
823 # exon with the cds start
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
824 elsif($feature_ranges[$i]->[1] >= $feature_cds_min{$parent} and
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
825 $feature_ranges[$i]->[0] <= $feature_cds_min{$parent}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
826 $feature_offset -= $feature_cds_min{$parent} - $feature_ranges[$i]->[0];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
827 last;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
828 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
829 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
830 die "The CDS for $parent starts upstream ($feature_cds_min{$parent}) of the first exon",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
831 " (", $feature_ranges[$i]->[0], "), which is illogical. Please revise the GFF file provided.\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
832 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
833 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
834 # assign cDNA coords for each exon to the third array element
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
835 for(my $i = 0; $i <= $#feature_ranges; $i++){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
836 $feature_ranges[$i]->[2] = $feature_offset;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
837 $feature_offset += $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
838 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
839 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
840 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
841 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
842
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
843 print STDERR "Reading in gene name definitions...\n" unless $quiet;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
844 die "Data file $genename_bed_file does not exist, aborting.\n" if not -e $genename_bed_file;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
845 my %gene_ids;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
846 open(TAB, $genename_bed_file)
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
847 or die "Cannot open gene name BED file $genename_bed_file for reading: $!\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
848 while(<TAB>){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
849 chomp;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
850 # format should be "chr start stop gene_name ..."
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
851 my @fields = split /\t/, $_;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
852 next if $#fields < 3;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
853 my $c = $fields[0];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
854 if(not exists $gene_ids{$c}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
855 $gene_ids{$c} = Set::IntervalTree->new();
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
856 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
857 $gene_ids{$c}->insert($fields[3], $fields[1], $fields[2]);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
858 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
859
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
860 # Print output header
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
861 open(OUT, ">$output_file")
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
862 or die "Cannot open $output_file for writing: $!\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
863
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
864 print OUT join("\t", "Feature type", "Transcript length", "Selected transcript", "Transcript HGVS", "Strand", "Chr", "DNA From", "DNA To", "Zygosity", "P-value", "Variant Reads", "Total Reads",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
865 "Ref base", "Obs base", "Pop. freq. source", "Pop. freq.", "Variant DB ID"), "\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
866 ($internal_snp ? "Internal pop. freq.\t" : ""),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
867 join("\t", "Protein HGVS", "Closest exon junction (AA coding variants)", "Gene Name", "Caveats", "Phase", "Num rare variants in gene (MAF <= $rare_variant_prop)", "Num rare coding and splice site variants in gene (MAF <= $rare_variant_prop)"),"\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
868
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
869 # If there is CNV data, load it.
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
870 # BED columns should be chr start stop caveats ploidy . ignored ignored r,g,b
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
871 # The dot means the strand doesn't matter.
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
872 # where the first five fields are required, others optional
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
873 # where r,g,b is overloaded with father,mother ploidies and "b" is integer representing affected status logical AND (father bit mask 1, mother bit mask 2)
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
874 if(defined $cnv_file){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
875 print STDERR "Reading in CNV data...\n" unless $quiet;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
876 open(CNV, $cnv_file)
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
877 or die "Cannot open $cnv_file for reading: $!\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
878 while(<CNV>){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
879 chomp;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
880 my @F = split /\t/, $_, -1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
881 if(@F < 5){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
882 print STDERR "Skipping unparseable line ($cnv_file #$.): $_\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
883 next;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
884 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
885 my $ploidy = $F[4];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
886 my $cnv_chr = $F[0];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
887 next if defined $which_chr and $cnv_chr ne $which_chr and "chr$cnv_chr" ne $which_chr and $cnv_chr ne "chr$which_chr";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
888 my $cnv_start = $F[1];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
889 my $cnv_end = $F[2];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
890 my $p_value = "NA";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
891 if($F[3] =~ s/p-value=(\S+?)(?:;|$)//){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
892 $p_value = $1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
893 next if $min_pvalue < $p_value;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
894 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
895
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
896 # Report a variant line for each gene that is found in this CNV
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
897 my $target_parents = $feature_intervaltree{$cnv_chr}->fetch($cnv_start, $cnv_end+1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
898
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
899 my $caveats = "";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
900 if(@F == 9){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
901 my @parents_ploidy = split /,/, $F[8];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
902 if($parents_ploidy[2] == 0){ # neither parent affected
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
903 if($ploidy < $parents_ploidy[0] and $ploidy < $parents_ploidy[1]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
904 if($ploidy > 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
905 $caveats = "Polyploidy is less severe than in either unaffected parents";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
906 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
907 # else: no caveats, this offspring has fewer copies than normally observed, or in unaffected parents
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
908 elsif($ploidy < 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
909 if($parents_ploidy[0] == 2 and $parents_ploidy[1] == 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
910 $caveats = "De novo copy loss, unaffected parents are diploid";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
911 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
912 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
913 $caveats = "Copy loss is greater than in either unaffected parent";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
914 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
915 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
916 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
917 elsif($ploidy >= $parents_ploidy[0] and $ploidy <= $parents_ploidy[1] or
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
918 $ploidy >= $parents_ploidy[1] and $ploidy <= $parents_ploidy[0]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
919 $caveats = "Aneuploidy likely inherited from an unaffected parent";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
920 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
921 elsif($ploidy > $parents_ploidy[0] and $ploidy > $parents_ploidy[1]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
922 if($parents_ploidy[0] > 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
923 if($parents_ploidy[1] > 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
924 $caveats = "Lower polyploidy already exists in both unaffected parents";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
925 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
926 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
927 $caveats = "Lower polyploidy already exists in unaffected father";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
928 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
929 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
930 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
931 if($parents_ploidy[1] > 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
932 $caveats = "Lower polyploidy already exists in unaffected mother";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
933 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
934 # else no caveats, because both parents are "normal", yet we have polyploidy in the offspring
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
935 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
936 $caveats = "De novo polyploidy, unaffected parents are diploid";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
937 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
938 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
939 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
940 # else
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
941 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
942 die "Oops! Error in program logic...how did we get here (unaffected parents)? $_";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
943 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
944 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
945 elsif($parents_ploidy[2] == 1){ # father affected
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
946 if($ploidy == $parents_ploidy[1]){ # just like unaffected Mom
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
947 if($ploidy > 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
948 if($ploidy == $parents_ploidy[0]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
949 $caveats = "Same polyploidy present in both affected and unaffected parents";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
950 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
951 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
952 $caveats = "Polyploidy inherited from unaffected mother";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
953 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
954 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
955 elsif($ploidy < 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
956 if($ploidy == $parents_ploidy[0]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
957 $caveats = "Same copy loss in both affected and unaffected parents";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
958 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
959 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
960 $caveats = "Copy loss is shared with unaffected mother";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
961 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
962 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
963 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
964 if($ploidy == $parents_ploidy[0]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
965 # Why was this even reported? parents and child have diploid status...
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
966 next;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
967 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
968 $caveats = "Diploidy is shared with unaffected mother";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
969 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
970 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
971 elsif($ploidy > 2){ # polyploidy
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
972 if($parents_ploidy[0] == 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
973 if($parents_ploidy[1] > 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
974 $caveats = "Unaffected mother has polyploidy (".$parents_ploidy[1]."x), but affected father is diploid";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
975 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
976 elsif($parents_ploidy[1] == 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
977 $caveats = "Both unaffected mother and affected father are diploid";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
978 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
979 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
980 $caveats = "Affected father is diploid, unaffected mother has copy loss (".$parents_ploidy[1]."x)";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
981 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
982 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
983 elsif($parents_ploidy[0] < 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
984 $caveats = "Polyploidy found, but affected father had copy loss (".$parents_ploidy[0]."x)";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
985 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
986 elsif($ploidy < $parents_ploidy[1]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
987 $caveats = "Polyploidy is less severe than in unaffected mother (".$parents_ploidy[1]."x), or affected father (".$parents_ploidy[0]."x)";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
988 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
989 # past here the ploidy is great than in the unaffected mother
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
990 elsif($parents_ploidy[1] < 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
991 $caveats = "Polyploidy is also severe in affected father (".$parents_ploidy[0]."x), but unaffected mother actually had copy loss (". $parents_ploidy[1]. "x)";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
992 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
993 elsif($parents_ploidy[1] == 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
994 $caveats = "Polyploidy is also severe in affected father (".$parents_ploidy[0]."x), and mother is diploid";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
995 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
996 elsif($ploidy < $parents_ploidy[0]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
997 $caveats = "Polyploidy is less severe than in affected father (".$parents_ploidy[0]."x), but more severe than unaffected mother (". $parents_ploidy[1]. "x)";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
998 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
999 elsif($ploidy > $parents_ploidy[0]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1000 $caveats = "Polyploidy is more severe than in affected father (".$parents_ploidy[0]."x)";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1001 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1002 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1003 $caveats = "Polyploidy is as severe as in affected father";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1004 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1005 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1006 elsif($ploidy == 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1007 # Don't report diploid status, any funny recombination should show up in large indel analysis
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1008 next;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1009 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1010 else{ # copies < 2
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1011 if($ploidy == $parents_ploidy[0]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1012 if($ploidy > $parents_ploidy[1]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1013 $caveats = "Copy loss is the same as affected father, but less than unaffected mother (". $parents_ploidy[1]. "x)";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1014 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1015 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1016 $caveats = "Copy loss is as severe as in affected father";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1017 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1018 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1019 elsif($ploidy > $parents_ploidy[0]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1020 if($ploidy > $parents_ploidy[1]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1021 if($parents_ploidy[1] == 0 and $parents_ploidy[0] == 0){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1022 $caveats = "Poor mapping, or Mendelian inheritence violation is severe: no copies of region in either parent, but present in offspring";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1023 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1024 elsif($ploidy == 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1025 next; # child got best of both parents, ignore from CNV standpoint (may still have SNPs of course, or translocation, etc.)
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1026 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1027 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1028 $caveats = "Copy loss is less severe than in unaffected mother (".$parents_ploidy[1]."x), or affected father (".$parents_ploidy[0]."x)";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1029 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1030 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1031 # else: child has less copies than unaffected mom, but more than affected Dad
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1032 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1033 if($parents_ploidy[1] > 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1034 $caveats = "Copy loss was more severe in affected father (".$parents_ploidy[0]."x), but unaffected mother had polyploidy (".$parents_ploidy[1]."x)";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1035 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1036 elsif($parents_ploidy[1] == 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1037 $caveats = "Copy loss was more severe in affected father (".$parents_ploidy[0]."x), but unaffected mother was diploid";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1038 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1039 else{ # unaffected has loss
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1040 $caveats = "Copy loss is more severe than unaffect mother (".$parents_ploidy[1]."x), but less severe than affected father (".$parents_ploidy[0]."x)";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1041 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1042 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1043 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1044 # past here, ploidy is less than affected father
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1045 elsif($parents_ploidy[1] > 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1046 $caveats = "Copy loss is more severe than affected father (".$parents_ploidy[0]."x), and unaffected mother had polyploidy (".$parents_ploidy[1]."x)";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1047 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1048 elsif($parents_ploidy[1] == 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1049 $caveats = "Copy loss is more severe than in affected father (".$parents_ploidy[0]."x)";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1050 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1051 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1052 $caveats = "Copy loss is more severe than in both unaffect mother (".$parents_ploidy[1]."x), and affected father (".$parents_ploidy[0]."x)";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1053 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1054 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1055 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1056 elsif($parents_ploidy[2] == 2){ # mother affected
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1057 if($ploidy == $parents_ploidy[0]){ # just like unaffected Dad
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1058 if($ploidy > 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1059 if($ploidy == $parents_ploidy[1]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1060 $caveats = "Same polyploidy present in both affected and unaffected parents";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1061 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1062 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1063 $caveats = "Polyploidy inherited from unaffected father";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1064 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1065 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1066 elsif($ploidy < 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1067 if($ploidy == $parents_ploidy[1]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1068 $caveats = "Same copy loss in both affected and unaffected parents";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1069 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1070 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1071 $caveats = "Copy loss is shared with unaffected father";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1072 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1073 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1074 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1075 if($ploidy == $parents_ploidy[1]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1076 # Why was this even reported? parents and child have diploid status...
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1077 next;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1078 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1079 $caveats = "Diploidy is shared with unaffected father";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1080 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1081 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1082 elsif($ploidy > 2){ # polyploidy
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1083 if($parents_ploidy[1] == 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1084 if($parents_ploidy[0] > 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1085 $caveats = "Unaffected father has polyploidy (".$parents_ploidy[0]."x), but affected mother is diploid";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1086 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1087 elsif($parents_ploidy[0] == 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1088 $caveats = "Both unaffected father and affected mother are diploid";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1089 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1090 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1091 $caveats = "Affected mother is diploid, unaffected father has copy loss (".$parents_ploidy[1]."x)";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1092 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1093 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1094 elsif($parents_ploidy[1] < 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1095 $caveats = "Polyploidy found, but affected mother had copy loss (".$parents_ploidy[1]."x)";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1096 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1097 elsif($ploidy < $parents_ploidy[0]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1098 $caveats = "Polyploidy is less severe than in unaffected father (".$parents_ploidy[0]."x), or affected mother (".$parents_ploidy[1]."x)";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1099 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1100 # past here the ploidy is great than in the unaffected father
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1101 elsif($parents_ploidy[0] < 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1102 $caveats = "Polyploidy is also severe in affected mother (".$parents_ploidy[1]."x), but unaffected father actually had copy loss (". $parents_ploidy[0]. "x)";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1103 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1104 elsif($parents_ploidy[0] == 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1105 $caveats = "Polyploidy is also severe in affected mother (".$parents_ploidy[1]."x), and unaffected father is diploid";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1106 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1107 elsif($ploidy < $parents_ploidy[1]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1108 $caveats = "Polyploidy is less severe than in affected mother (".$parents_ploidy[1]."x), but more severe than unaffected father (". $parents_ploidy[0]. "x)";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1109 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1110 elsif($ploidy > $parents_ploidy[1]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1111 $caveats = "Polyploidy is more severe than in affected mother (".$parents_ploidy[1]."x)";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1112 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1113 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1114 $caveats = "Polyploidy is as severe as in affected mother";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1115 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1116 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1117 elsif($ploidy == 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1118 # Don't report diploid status, any funny recombination should show up in large indel analysis
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1119 next;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1120 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1121 else{ # copies < 2
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1122 if($ploidy == $parents_ploidy[1]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1123 if($ploidy > $parents_ploidy[0]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1124 $caveats = "Copy loss is the same as affected mother, but less than unaffected father (". $parents_ploidy[0]. "x)";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1125 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1126 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1127 $caveats = "Copy loss is as severe as in affected mother";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1128 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1129 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1130 elsif($ploidy > $parents_ploidy[1]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1131 if($ploidy > $parents_ploidy[0]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1132 if($parents_ploidy[1] == 0 and $parents_ploidy[0] == 0){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1133 $caveats = "Poor mapping, or Mendelian inheritence violation is severe: no copies of region in either parent, but present in offspring";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1134 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1135 elsif($ploidy == 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1136 next; # child got best of both parents, ignore from CNV standpoint (may still have SNPs of course, or translocation, etc.)
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1137 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1138 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1139 $caveats = "Copy loss is less severe than in unaffected father (".$parents_ploidy[0]."x), or affected mother (".$parents_ploidy[1]."x)";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1140 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1141 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1142 # else: child has less copies than unaffected Dad, but more than affected Mom
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1143 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1144 if($parents_ploidy[0] > 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1145 $caveats = "Copy loss was more severe in affected mother (".$parents_ploidy[1]."x), but unaffected father had polyploidy (".$parents_ploidy[0]."x)";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1146 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1147 elsif($parents_ploidy[0] == 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1148 $caveats = "Copy loss was more severe in affected mother (".$parents_ploidy[1]."x), but unaffected father was diploid";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1149 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1150 else{ # unaffected has loss
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1151 $caveats = "Copy loss is more severe than unaffect father (".$parents_ploidy[0]."x), but less severe than affected mother (".$parents_ploidy[1]."x)";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1152 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1153 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1154 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1155 # past here, ploidy is less than affected mother
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1156 elsif($parents_ploidy[0] > 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1157 $caveats = "Copy loss is more severe than affected mother (".$parents_ploidy[1]."x), and unaffected father had polyploidy (".$parents_ploidy[0]."x)";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1158 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1159 elsif($parents_ploidy[0] == 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1160 $caveats = "Copy loss is more severe than in affected mother (".$parents_ploidy[1]."x)";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1161 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1162 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1163 $caveats = "Copy loss is more severe than in both unaffect father (".$parents_ploidy[0]."x), and affected mother (".$parents_ploidy[1]."x)";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1164 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1165 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1166 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1167
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1168 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1169 if($F[3] and $F[3] ne "-"){ # prexisting caveat from CNV caller
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1170 if(defined $caveats){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1171 $caveats .= "; $F[3]" unless $caveats =~ /\b$F[3]\b/;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1172 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1173 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1174 $caveats = $F[3];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1175 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1176 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1177
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1178 # Sort by start for consistency
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1179 my @target_parents = sort {$feature_range{$cnv_chr}->{$a}->[0]->[0] <=> $feature_range{$cnv_chr}->{$b}->[0]->[0]} @$target_parents;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1180
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1181 for my $target_parent (@target_parents){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1182 my $target_caveats = $caveats;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1183 my $strand = $feature_strand{$target_parent};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1184 # report the gain/loss of each gene separately, for simplicity in downstream analysis
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1185 my $cnv_exon_start = 10000000000; # genomic coords
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1186 my $cnv_exon_end = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1187 my $cnv_cdna_start = 0; # cDNA coords
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1188 my $cnv_cdna_end = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1189 my $off5 = 0; # border outside the exon?
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1190 my $off3 = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1191 my @feature_ranges = @{$feature_range{$cnv_chr}->{$target_parent}};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1192 # find the first and last exons in the gene that are inside the CNV
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1193 for my $subregion (@feature_ranges){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1194 # exon overlaps CNV?
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1195 if($subregion->[0] <= $cnv_end and $subregion->[1] >= $cnv_start){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1196 if($cnv_exon_start > $subregion->[0]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1197 if($cnv_start < $subregion->[0]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1198 $cnv_exon_start = $subregion->[0]; $off5 = 1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1199 $cnv_cdna_start = $subregion->[2];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1200 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1201 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1202 $cnv_exon_start = $cnv_start; $off5 = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1203 $cnv_cdna_start = $subregion->[2]+($strand eq "-" ? $subregion->[0]-$cnv_start: $cnv_start-$subregion->[0]);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1204 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1205 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1206 if($cnv_exon_end < $subregion->[1]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1207 if($cnv_end > $subregion->[1]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1208 $cnv_exon_end = $subregion->[1]; $off3 = 1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1209 $cnv_cdna_end = $subregion->[2]+($strand eq "-" ? $subregion->[0]-$subregion->[1] : $subregion->[1]-$subregion->[0]);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1210 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1211 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1212 $cnv_exon_end = $cnv_end; $off3 = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1213 $cnv_cdna_end = $subregion->[2]+($strand eq "-" ? $subregion->[0]-$cnv_end : $cnv_end-$subregion->[0]);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1214 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1215 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1216 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1217 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1218
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1219 my $ends_internally = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1220 if($cnv_exon_end == 0){ # ends inside the exon
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1221 $cnv_exon_end = $cnv_end;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1222 $ends_internally = 1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1223 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1224 # See if it's in the structural variant database
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1225 my @gain_coverage; $#gain_coverage = $cnv_exon_end-$cnv_exon_start; # preallocate blanks
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1226 my @loss_coverage; $#loss_coverage = $cnv_exon_end-$cnv_exon_start; # preallocate blanks
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1227 my $dgv_loss_id; # report the DGV entry that covers most of the observed structural variant
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1228 my $dgv_loss_length = 0; # report the DGV entry that covers most of the observed structural variant
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1229 my $dgv_gain_id; # report the DGV entry that covers most of the observed structural variant
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1230 my $dgv_gain_length = 0; # report the DGV entry that covers most of the observed structural variant
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1231 my $gains;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1232 my $losses;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1233 my $dgv_chr = $cnv_chr;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1234 $dgv_chr =~ s/^chr//; # no prefix in DGV
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1235 #open(DGV, "tabix $dgv_file $dgv_chr:$cnv_exon_start-$cnv_exon_end |") # check out CNV in this gene model region
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1236 # or die "Cannot run tabix: $!\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1237 open(DGV, "/dev/null");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1238 while(<DGV>){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1239 my @C = split /\t/, $_;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1240 next if $C[4] ne "CNV"; # todo: handle indels?
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1241 my $dgv_start = $C[2];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1242 my $dgv_end = $C[3];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1243 my $dgv_direction = $C[5];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1244 my $gain_cov_count = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1245 my $loss_cov_count = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1246 if($dgv_direction eq "Gain"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1247 for(my $i = ($dgv_start < $cnv_exon_start ? $cnv_exon_start : $dgv_start); $i <= $dgv_end and $i <= $cnv_exon_end; $i++){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1248 $gain_coverage[$i-$cnv_exon_start] = 1 unless defined $gain_coverage[$i-$cnv_exon_start];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1249 $gain_cov_count++;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1250 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1251 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1252 elsif($dgv_direction eq "Loss"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1253 for(my $i = ($dgv_start < $cnv_exon_start ? $cnv_exon_start : $dgv_start); $i <= $dgv_end and $i <= $cnv_exon_end; $i++){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1254 $loss_coverage[$i-$cnv_exon_start] = 1 unless defined $loss_coverage[$i-$cnv_exon_start];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1255 $loss_cov_count++;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1256 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1257 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1258 if($dgv_direction eq "Gain" and $gain_cov_count > $dgv_gain_length){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1259 $dgv_gain_id = $C[0];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1260 $dgv_gain_length = $gain_cov_count;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1261 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1262 if($dgv_direction eq "Loss" and $loss_cov_count > $dgv_loss_length){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1263 $dgv_loss_id = $C[0];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1264 $dgv_loss_length = $loss_cov_count;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1265 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1266 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1267 close(DGV);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1268
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1269 my $gain_coverage = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1270 for my $count (@gain_coverage){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1271 $gain_coverage++ if defined $count;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1272 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1273 $gain_coverage = sprintf "%.3f", $gain_coverage/($cnv_exon_end-$cnv_exon_start+1); # make it a proportion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1274 my $loss_coverage = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1275 for my $count (@loss_coverage){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1276 $loss_coverage++ if defined $count;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1277 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1278 $loss_coverage = sprintf "%.3f", $loss_coverage/($cnv_exon_end-$cnv_exon_start+1); # make it a proportion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1279
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1280 my $src = "DGV";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1281 my $dgv_id = "NA";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1282 my $dgv_caveat;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1283 my $dgv_coverage;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1284 if($ploidy > 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1285 if(not defined $dgv_gain_id){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1286 if(defined $dgv_loss_id){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1287 $dgv_id = sprintf "%s/%.3f", $dgv_loss_id, $dgv_loss_length/($cnv_exon_end-$cnv_exon_start+1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1288 $dgv_caveat = "; No gains are known in healthy controls, the DGV2 ID reported is for a loss in the same area";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1289 $dgv_coverage = $loss_coverage;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1290 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1291 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1292 $dgv_id = "novel";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1293 $dgv_coverage = "NA";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1294 $src = "NA";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1295 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1296 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1297 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1298 $dgv_id = sprintf "%s/%.3f", $dgv_gain_id, $dgv_gain_length/($cnv_exon_end-$cnv_exon_start+1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1299 $dgv_coverage = $gain_coverage;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1300 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1301 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1302 elsif($ploidy < 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1303 if(not defined $dgv_loss_id){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1304 if(defined $dgv_gain_id){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1305 $dgv_id = sprintf "%s/%.3f", $dgv_gain_id, $dgv_gain_length/($cnv_exon_end-$cnv_exon_start+1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1306 $dgv_caveat = "; No losses are known in healthy controls, the DGV2 ID reported is for a gain in the same area";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1307 $dgv_coverage = $gain_coverage;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1308 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1309 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1310 $dgv_id = "novel";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1311 $dgv_coverage = "NA";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1312 $src = "NA";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1313 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1314 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1315 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1316 $dgv_id = sprintf "%s/%.3f", $dgv_loss_id, $dgv_loss_length/($cnv_exon_end-$cnv_exon_start+1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1317 $dgv_coverage = $loss_coverage;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1318 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1319 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1320
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1321 my $non_coding = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1322 if(not exists $feature_cds_max{$target_parent} or not defined $feature_cds_max{$target_parent} or $feature_cds_max{$target_parent} eq ""){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1323 $non_coding = 1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1324 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1325 $target_caveats .= $dgv_caveat if defined $dgv_caveat and $dgv_id ne "novel" and $target_caveats !~ /\Q$dgv_caveat\E/;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1326 #print "Recorded $cnv_chr:$cnv_start caveat $caveats\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1327 # if it doesn't overlap an exon, we need to find out which two exons it's between
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1328 if($ends_internally){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1329 my $intron_found = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1330 for(my $i = 0; $i < $#feature_ranges; $i++){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1331 if($feature_ranges[$i]->[1] < $cnv_start and $feature_ranges[$i+1]->[0] > $cnv_end){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1332 if($ploidy > 2){ # gain
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1333 if($strand eq "-"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1334 record_snv("$target_parent\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1335 ($non_coding ? "g.$cnv_start\_$cnv_end" :
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1336 "c.".($feature_ranges[$i+1]->[2])."+".($feature_ranges[$i+1]->[0]-$cnv_end)."_".($feature_ranges[$i+1]->[2]+1)."-".($cnv_start-$feature_ranges[$i]->[1])),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1337 # pos Zygosity P-value Variant Reads Total Reads Ref Bases Var Bases Population Frequency Source Pop. freq. or DGV2 gain/loss coverage dbSNP or DGV2 ID
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1338 "[".($ploidy-1)."]\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\tNA\t$p_value\tNA\tNA\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1339 "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1340 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1341 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1342 record_snv("$target_parent\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1343 ($non_coding ? "g.$cnv_start\_$cnv_end" :
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1344 "c.".($feature_ranges[$i+1]->[2]-1)."+".($cnv_start-$feature_ranges[$i]->[1])."_".$feature_ranges[$i+1]->[2]."-".($feature_ranges[$i+1]->[0]-$cnv_end)),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1345 "[".($ploidy-1)."]\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\tNA\t$p_value\tNA\tNA\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1346 "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1347 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1348 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1349 else{ # loss
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1350 if($strand eq "-"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1351 record_snv("$target_parent\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1352 ($non_coding ? "g.$cnv_start\_$cnv_end" :
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1353 "c.".($feature_ranges[$i+1]->[2])."+".($feature_ranges[$i+1]->[0]-$cnv_end)."_".($feature_ranges[$i+1]->[2]+1)."-".($cnv_start-$feature_ranges[$i]->[1])),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1354 "del\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\t", ($ploidy == 1 ? "heterozygote" : "homozygote"), "\t$p_value\tNA\tNA\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1355 "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1356 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1357 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1358 record_snv("$target_parent\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1359 ($non_coding ? "g.$cnv_start\_$cnv_end" :
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1360 "c.".($feature_ranges[$i+1]->[2]-1)."+".($cnv_start-$feature_ranges[$i]->[1])."_".$feature_ranges[$i+1]->[2]."-".($feature_ranges[$i+1]->[0]-$cnv_end)),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1361 "del\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\t", ($ploidy == 1 ? "heterozygote" : "homozygote"), "\t$p_value\tNA\tNA\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1362 "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1363 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1364 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1365 $intron_found = 1; last;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1366 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1367 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1368 warn "Logic error: CNV overlaps a gene ($target_parent), but is neither intronic nor exonic. Offending CNV was $_\n" unless $intron_found;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1369 next;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1370 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1371 if($strand eq "-"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1372 my $tmp = $cnv_cdna_start;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1373 $cnv_cdna_start = $cnv_cdna_end;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1374 $cnv_cdna_end = $tmp;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1375 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1376 # Make the location label pretty descriptive
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1377 my $cnv_phase = "";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1378 if($cnv_exon_start > $cnv_start or $cnv_exon_end < $cnv_end){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1379 $cnv_phase = "CNV-$cnv_chr:$cnv_start-$cnv_end"; # Use phase to note that it's part of a bigger CNV than just the range of this feature
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1380 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1381 # if we get here, we're in a gained/deleted exon category
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1382 # CNVs are fuzzy-edged (as opposed to large indels), so produce HGVS syntax that reflect this
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1383 if($ploidy > 2){ # gain
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1384 # find the exons encompassed by the CNV. NOTE that we assume that polyploidies are proximal
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1385 record_snv("$target_parent\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1386 ($non_coding ? "g.".($cnv_exon_start > $cnv_start ? "$cnv_exon_start-?" : $cnv_start)."_".($cnv_exon_end < $cnv_end ? "$cnv_exon_end+?" : $cnv_end) :
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1387 "c.$cnv_cdna_start".($off5?"-?":"")."_$cnv_cdna_end".($off3?"+?":"")),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1388 "[".($ploidy-1)."]\t$strand\t$cnv_chr\t$cnv_exon_start\t$cnv_exon_end\tNA\t$p_value\tNA\tNA\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1389 "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t$cnv_phase\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1390 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1391 else{ # loss
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1392 #translate genome coordinates into cDNA coordinates
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1393 record_snv("$target_parent\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1394 ($non_coding ? "g.".($cnv_exon_start > $cnv_start ? "$cnv_exon_start-?" : $cnv_start)."_".($cnv_exon_end < $cnv_end ? "$cnv_exon_end+?" : $cnv_end) :
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1395 "c.$cnv_cdna_start".($off5?"-?":"")."_$cnv_cdna_end".($off3?"+?":"")),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1396 "del\t$strand\t$cnv_chr\t$cnv_exon_start\t$cnv_exon_end\t", ($ploidy == 1 ? "heterozygote" : "homozygote"), "\t$p_value\tNA\tNA\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1397 "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t$cnv_phase\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1398 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1399 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1400 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1401 close(CNV);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1402
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1403 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1404
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1405
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1406 #sort genes by start, then longest if tied
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1407 my %rc = qw(A T T A G C C G N N S W W S K M M K Y R R Y X X);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1408 print STDERR "Processing variant calls..." unless $quiet;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1409 %chr_read = ();
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1410 open(VCFIN, $input_file)
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1411 or die "Cannot open $input_file for reading: $!\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1412 while(<VCFIN>){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1413 if(/^\s*(?:#|$)/){ # blank or hash comment lines
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1414 next;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1415 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1416 chomp;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1417 my @fields = split /\t/, $_;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1418
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1419 next unless exists $feature_range{$fields[0]};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1420 if(not $quiet and not exists $chr_read{$fields[0]}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1421 print STDERR " $fields[0]";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1422 $chr_read{$fields[0]} = 1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1423 #print STDERR "(not in reference file!)" unless exists $feature_range{$fields[0]};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1424 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1425
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1426 next if $fields[4] eq "<NON_REF>"; # GVCF background stuff
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1427 next if $fields[9] eq "./." or $fields[9] eq "."; # no call
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1428 my $chr = $fields[0];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1429 next if defined $which_chr and $chr ne $which_chr and $chr ne "chr$which_chr" and "chr$chr" ne $which_chr;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1430 print STDERR "passes chr and field # test" if grep /dataset_7684.dat/, @ARGV;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1431 $chr = "chr$chr" if $chr !~ /^chr/;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1432 $fields[8] =~ s/\s+$//;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1433 my @values = split /:/, $fields[9];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1434 #print STDERR join(" / ", @values), "\n" if $. == 84;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1435 my @keys = split /:/, $fields[8];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1436 my $zygosity = "n/a";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1437 my $quality = "n/a";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1438 my $read_depth = "n/a";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1439 my $variant_depths = "n/a";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1440 for(my $i = 0; $i <= $#keys and $i <= $#values; $i++){ # values max index check because some genotypers are nasty and don't provide as many fields as they say they will
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1441 if($keys[$i] eq "GT"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1442 if($values[$i] =~ /\./ or $values[$i] =~ /0\/0/){ # one genotype not described
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1443 $zygosity = "none";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1444 last;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1445 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1446 else{ # genotypes described
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1447 $zygosity = $values[$i] =~ /[02]/ ? "heterozygote" : "homozygote";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1448 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1449 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1450 elsif($keys[$i] eq "DNM_CONFIG" and $zygosity eq "n/a"){ # hack
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1451 $zygosity = $values[$i] =~ /^(.)\1/ ? "homozygote" : "heterozygote";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1452 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1453 elsif($keys[$i] eq "GQ" and $values[$i] ne "."){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1454 #print "Checking GQ (index $i) $values[$i] gq2p\n" if $. == 84;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1455 $quality = gq2p($values[$i]);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1456 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1457 elsif($keys[$i] eq "RD"){ # from some tools like denovo variant finders
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1458 $read_depth = $values[$i];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1459 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1460 elsif($keys[$i] eq "DP"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1461 $read_depth = $values[$i];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1462 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1463 # the frequency of the variant can go by many names, here we have freebayes (A* are new and old versions) and atlas2_indel
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1464 elsif($keys[$i] eq "AA" or $keys[$i] eq "VR" or $keys[$i] eq "AO"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1465 $variant_depths = $values[$i];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1466 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1467 # here we have GATK variant freq of form ref#,var#
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1468 elsif($keys[$i] eq "AD"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1469 $variant_depths = $values[$i];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1470 $variant_depths =~ s/^\d+,//;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1471 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1472 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1473 #print STDERR "Ignoring field $keys[$i]\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1474 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1475 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1476 next if $zygosity eq "none"; # GVCF non-ref for example or when multiple samples are reported simultaneously
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1477 $quality = z2p($1) if $fields[7] =~ /Z=(\d+\.\d+)/;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1478 if($quality eq "n/a" and $fields[5] ne "."){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1479 $quality = gq2p($fields[5]);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1480 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1481 if($fields[5] eq "0" and $fields[6] eq "PASS"){ # not qual and a PASS in the filter column
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1482 $quality = 1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1483 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1484 elsif($quality ne "n/a" and $quality > $min_pvalue){ # p-value cutoff
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1485 #print "Checking call quality $fields[5] gq2p\n" if $. == 84;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1486 next unless gq2p($fields[5]) <= $min_pvalue; # in some cases, programs like FreeBayes give low genotype quality such as when a call is borderline homo/het
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1487 # in these cases it would be silly to reject the call if their are many supporting reads.
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1488 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1489
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1490 # Some tools like GATK don't report number of variant reads...infer from other data if possible
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1491 if($variant_depths eq "n/a"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1492 my @key_value_pairs = split /;/, $fields[7];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1493 for my $key_value_pair (@key_value_pairs){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1494 if($key_value_pair !~ /^(.*?)=(.*)$/){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1495 next;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1496 #next if $key_value_pair eq "INDEL"; # samtools peculiarity
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1497 #die "Key-value pair field (column #8) does not have the format key=value for entry $key_value_pair (line #$. of ), please fix the VCF file\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1498 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1499 if($1 eq "AB"){ # GATK: for het calls, AB is ref/(ref+var), only one variant reported per line
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1500 $variant_depths = "";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1501 for my $ab (split /,/, $2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1502 $variant_depths .= int((1-$ab)*$read_depth).",";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1503 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1504 chop $variant_depths;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1505 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1506 elsif($1 eq "MLEAC"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1507 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1508 elsif($1 eq "DP4"){ # samtools: high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1509 my @rds = split /,/, $2;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1510 $variant_depths = $rds[2]+$rds[3];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1511 $read_depth = $rds[0]+$rds[1]+$variant_depths;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1512 if($fields[4] =~ /,/){ # samtools doesn't break down compound het calls into individual frequencies
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1513 my $num_alt_genotypes = $fields[4] =~ tr/,/,/;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1514 $num_alt_genotypes++;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1515 my $even_prop = sprintf "%.2f", $variant_depths/$read_depth/$num_alt_genotypes;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1516 $variant_depths = join(",", ($even_prop) x $num_alt_genotypes);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1517 if(not exists $chr2caveats{"$chr:$fields[1]"}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1518 $chr2caveats{"$chr:$fields[1]"} = "compound het var freq n/a in orig VCF file, auto set to $even_prop";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1519 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1520 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1521 $chr2caveats{"$chr:$fields[1]"} .= "; compound het var freq n/a in orig VCF file, auto set to $even_prop";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1522 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1523 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1524 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1525 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1526 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1527 if($variant_depths eq "n/a"){ # usually homo var calls, can only assume all reads are variant
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1528 if($zygosity eq "homozygote"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1529 $variant_depths = $read_depth;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1530 if($read_depth ne "n/a"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1531 if(not exists $chr2caveats{"$chr:$fields[1]"}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1532 $chr2caveats{"$chr:$fields[1]"} = "homo var freq n/a in orig VCF file, auto set to 1.0";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1533 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1534 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1535 $chr2caveats{"$chr:$fields[1]"} = "; homo var freq n/a in orig VCF file, auto set to 1.0";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1536 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1537 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1538 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1539 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1540 if($read_depth ne "n/a"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1541 $variant_depths = int($read_depth/2);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1542 if(not exists $chr2caveats{"$chr:$fields[1]"}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1543 $chr2caveats{"$chr:$fields[1]"} = "het var freq n/a in orig VCF file, auto set to 0.5";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1544 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1545 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1546 $chr2caveats{"$chr:$fields[1]"} = "; het var freq n/a in orig VCF file, auto set to 0.5";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1547 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1548 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1549 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1550 $variant_depths = $read_depth;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1551 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1552 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1553 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1554
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1555 my $target_parents = $feature_intervaltree{$chr}->fetch($fields[1]-$flanking_bases, $fields[1]+length($fields[3])+$flanking_bases);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1556 # Last ditch, if not in a gene model, is it at least in an enrichment region?
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1557 if(not @$target_parents){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1558 next if not exists $enrichment_regions{$chr};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1559 my $regions_ref = $enrichment_regions{$chr};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1560 my $location = $fields[1];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1561 my $strand = "+"; # for lack of a better choice
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1562 for(my $i = find_earliest_index($location-$flanking_bases, $regions_ref);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1563 $i < $#$regions_ref and $location <= $regions_ref->[$i]->[1]+$flanking_bases;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1564 $i++){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1565 next unless $regions_ref->[$i]->[0]-$flanking_bases <= $location and $regions_ref->[$i]->[1]+$flanking_bases >= $location;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1566
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1567 my $feature_name = "enrichment_target_$chr:".$regions_ref->[$i]->[0]."-".$regions_ref->[$i]->[1];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1568 $feature_type{$feature_name} = "misc_enrichment_kit_target";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1569 $feature_length{$feature_name} = $regions_ref->[$i]->[1]-$regions_ref->[$i]->[0]+1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1570 my @variants = split /,/, $fields[4];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1571 my @variant_depths = split /,/, $variant_depths;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1572 my $ref = uc($fields[3]);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1573 for(my $j = 0; $j <= $#variants; $j++){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1574 my $variant = $variants[$j];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1575 next if $variant eq "<NON_REF>" or $variant_depths[$j] eq "0"; # GVCF stuff
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1576 my $variant_depth = $variant_depths[$j];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1577 if($min_prop){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1578 next unless $variant_depth >= $min_depth and $read_depth ne "n/a" and $variant_depth/$read_depth >= $min_prop;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1579 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1580 if(length($ref) == 1 and length($variant) == 1){ # SNP
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1581 record_snv("$feature_name\tg.$location",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1582 "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1583 join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1584 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1585 elsif(length($ref) == 1 and length($variant) > 1){ # insertion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1586 record_snv("$feature_name\tg.$location\_",($location+1),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1587 "ins",substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1588 join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1589 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1590 elsif(length($variant) == 1 and length($ref) > 1){ # deletion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1591 record_snv("$feature_name\tg.$location\_",($location+length($ref)-1),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1592 "del",substr($ref, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1593 join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1594 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1595 else{ # indel
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1596 record_snv("$feature_name\tg.$location\_",($location+length($ref)-1),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1597 "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1598 join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1599 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1600 } # end for variants
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1601 next; # process next record, we've done all we can with a non-coding-gene SNP
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1602 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1603 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1604
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1605 for my $target_parent (@$target_parents){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1606
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1607 print STDERR "Checking parent $target_parent for on $chr:$fields[1] $fields[3] -> $fields[4]\n" if grep /dataset_7684.dat/, @ARGV;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1608 my @feature_ranges = @{$feature_range{$chr}->{$target_parent}};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1609 # Calculate the position of the change within the feature range position
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1610 my $strand = $feature_strand{$target_parent};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1611 my $trans_table = exists $feature_transl_table{$target_parent} ? $feature_transl_table{$target_parent} : $default_transl_table;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1612 $fields[4]=~tr/"//d; # sometime strangely surroundsed by quotes
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1613 my @variants = split /,/, $fields[4];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1614 my @variant_depths = split /,/, $variant_depths;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1615 my @refs = (uc($fields[3])) x scalar(@variants);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1616 my @locations = ($fields[1]) x scalar(@variants);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1617
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1618 for(my $j = 0; $j <= $#variants; $j++){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1619 my $ref = $refs[$j];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1620 my $location = $locations[$j];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1621 my $feature_offset = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1622 my $feature_num = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1623 my $variant = uc($variants[$j]);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1624 next if $variant eq "<NON_REF>" or $variant_depths[$j] eq "0"; # GVCF stuff
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1625 my $variant_depth = $variant_depths[$j] || "n/a";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1626 #print STDERR "Evaluating target parent $target_parent for $chr:$fields[1]-".($fields[1]+length($fields[3]))." -> ",join("/", @$target_parents) , "\n" if $fields[1] == 982994;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1627
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1628 # Break down MNPs into individual SNPs that are phased (TODO: skip if it's an inversion? would require amalgamating SNPs for tools that call them individually, phased :-P)
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1629 if(length($variant) > 1 and length($variant) == length($ref)){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1630 my @subvariants;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1631 my @subrefs;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1632 my @sublocations;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1633 my $phase_range = $location."-".($location+length($ref)-1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1634 for(my $k = 0; $k < length($variant); $k++){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1635 my $r = substr($ref, $k, 1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1636 my $v = substr($variant, $k, 1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1637 if($r ne $v){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1638 push @subvariants, $v;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1639 push @subrefs, $r;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1640 push @sublocations, $location+$k;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1641 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1642 elsif(@variants == 1){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1643 next; # homo ref call
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1644 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1645 if($zygosity eq "heterozygote"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1646 # need to ignore case where a homozygous call (variant or ref) is included in a double non-ref het MNP
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1647 if(@variants > 1){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1648 my $homo = 1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1649 for(my $l = 0; $l <= $#variants; $l++){ # using loop in case we handle oligoploid genomes in the future
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1650 if(length($variants[$l]) <= $k or substr($variants[$l], $k, 1) ne $v){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1651 $homo = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1652 last;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1653 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1654 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1655 next if $homo;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1656 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1657 my $phase_key = "$chr:".($location+$k).":$v";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1658 my $phase_label = "M$j-$chr:$phase_range";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1659 if(exists $chr2phase{$phase_key}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1660 if($chr2phase{$phase_key} !~ /$phase_label/){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1661 $chr2phase{$phase_key} .= ",$phase_label";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1662 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1663 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1664 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1665 $chr2phase{$phase_key} = $phase_label;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1666 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1667 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1668 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1669 # recycle this MNP variant loop to start processing the individual SNPs
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1670 splice(@refs, $j, 1, @subrefs);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1671 splice(@variants, $j, 1, @subvariants);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1672 splice(@locations, $j, 1, @sublocations);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1673 splice(@variant_depths, $j, 1, ($variant_depth) x scalar(@subvariants));
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1674 $j--;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1675 next;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1676 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1677
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1678 if($min_prop != 0 and $variant_depth eq "n/a" or $variant_depth eq "."){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1679 print STDERR "Could not parse variant depth from $_\n" unless $quiet;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1680 next;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1681 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1682 next unless $min_prop == 0 or $min_prop and $variant_depth >= $min_depth and $read_depth ne "n/a" and $variant_depth/$read_depth >= $min_prop;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1683 if($zygosity eq "NA"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1684 # make the call ourselves
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1685 $zygosity = $variant_depths/$read_depth > 1-$min_prop ? "homozygote" : "heterozygote";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1686 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1687 # e.g. chr22 47857671 . CAAAG AAGAT,AAAAG 29.04 .
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1688 if(length($variant) > 1 and length($variant) == length($ref)){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1689 for(my $k = 0; $k < length($variant); $k++){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1690 my $fixed_variant = $variant;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1691 substr($fixed_variant, $k, 1) = substr($ref, $k, 1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1692 if($fixed_variant eq $ref){ # single base difference at base k between the two
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1693 $ref = substr($ref, $k, 1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1694 $variant = substr($variant, $k, 1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1695 $location += $k;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1696 last;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1697 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1698 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1699 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1700
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1701 # samtools reports indels with long common tails, causing non-canonical HGVS reporting and a problem when looking up the variant in dbSNP etc.
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1702 # remove common tails to variant calls in order to try to rectify this
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1703 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1704 while(length($ref) > 1 and length($variant) > 1 and substr($ref, length($ref)-1) eq substr($variant, length($variant)-1)){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1705 chop $ref; chop $variant;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1706 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1707 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1708
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1709 # See if a caveat should be added because the indel was in a polyhomomer region
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1710 if(length($ref) > length($variant) and index($ref, $variant) == 0 and $fasta_index->fetch("$chr:".($location+1)."-".($location+length($ref)+1)) =~ /^([ACGT])\1+$/i){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1711 if(not exists $chr2caveats{"$chr:$location"}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1712 $chr2caveats{"$chr:$location"} = "poly".uc($1)." region deletion";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1713 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1714 elsif($chr2caveats{"$chr:$location"} !~ /poly/){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1715 $chr2caveats{"$chr:$location"} .= "; poly".uc($1)." region deletion";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1716 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1717 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1718 elsif(length($ref) < length($variant) and index($variant, $ref) == 0 and substr($variant, 1) =~ /^([ACGT])\1+$/i){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1719 if(not exists $chr2caveats{"$chr:$location"}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1720 $chr2caveats{"$chr:$location"} .= "poly".uc($1)." region insertion";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1721 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1722 elsif($chr2caveats{"$chr:$location"} !~ /poly/){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1723 $chr2caveats{"$chr:$location"} .= "; poly".uc($1)." region insertion";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1724 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1725 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1726
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1727 # Not a protein-coding gene? Report genomic cooordinates for HGVS
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1728 if(not exists $feature_cds_max{$target_parent} or not defined $feature_cds_max{$target_parent} or $feature_cds_max{$target_parent} eq ""){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1729 if(length($ref) == 1 and length($variant) == 1){ # SNP
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1730 record_snv("$target_parent\tg.$location",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1731 "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1732 join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1733 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1734 elsif(length($ref) == 1 and length($variant) > 1){ # insertion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1735 record_snv("$target_parent\tg.$location\_",($location+1),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1736 "ins",substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1737 join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1738 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1739 elsif(length($variant) == 1 and length($ref) > 1){ # deletion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1740 record_snv("$target_parent\tg.$location\_",($location+length($ref)-1),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1741 "del",substr($ref, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1742 join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1743 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1744 else{ # indel
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1745 record_snv("$target_parent\tg.$location\_",($location+length($ref)-1),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1746 "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1747 join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1748 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1749 next; # process next record, we've done all we can with a non-coding-gene SNP
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1750 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1751
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1752 if($strand eq "-"){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1753 # set up utr offset for correct CDS coordinates
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1754 for(my $i = $#feature_ranges; $i >= 0; $i--){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1755 # exon is completely 5' of the start
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1756 if($feature_ranges[$i]->[0] > $feature_cds_max{$target_parent}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1757 #print STDERR "Whole 5' UTR exon $i: ",$feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1,"\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1758 $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1759 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1760 # exon with the cds start
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1761 elsif($feature_ranges[$i]->[1] >= $feature_cds_max{$target_parent} and
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1762 $feature_ranges[$i]->[0] <= $feature_cds_max{$target_parent}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1763 #print STDERR "Start codon in exon $i: ", $feature_cds_max{$target_parent} - $feature_ranges[$i]->[1],"\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1764 $feature_offset += $feature_cds_max{$target_parent} - $feature_ranges[$i]->[1];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1765 last;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1766 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1767 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1768 die "The CDS for $target_parent (on negative strand) ends downstream ",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1769 "($feature_cds_max{$target_parent}) of the an exon",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1770 " (", $feature_ranges[$i]->[0], "), which is illogical. Please revise the GFF file provided.\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1771 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1772 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1773 for(my $i = $#feature_ranges; $i >= 0; $i--){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1774 my $feature = $feature_ranges[$i];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1775 # in the 3' UTR region of the gene
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1776 if($location < $feature_cds_min{$target_parent}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1777 my $feature_exon = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1778 $feature = $feature_ranges[$feature_exon];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1779 while($location > $feature->[1]+$flanking_bases and
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1780 $feature_exon < $#feature_ranges){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1781 $feature = $feature_ranges[++$feature_exon]; # find the 3' utr exon in which the variant is located
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1782 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1783 # utr exons passed entirely
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1784 my $stop_exon = $feature_exon;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1785 while($feature_ranges[$stop_exon]->[1] < $feature_cds_min{$target_parent}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1786 $stop_exon++;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1787 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1788 my $post_offset = $feature_cds_min{$target_parent}-$feature_ranges[$stop_exon]->[0]; # offset from the stop codon in the final coding exon
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1789 while($stop_exon > $feature_exon){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1790 $post_offset += $feature_ranges[$stop_exon]->[1]-$feature_ranges[$stop_exon]->[0]+1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1791 $stop_exon--;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1792 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1793
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1794 my $pos = $feature->[1]-$location+1+$post_offset;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1795 my $junction_dist;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1796 if($location < $feature->[0]){ # after a UTR containing exon? set as .*DD+DD
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1797 $junction_dist = ($feature->[0]-$location);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1798 $pos = ($post_offset+$feature->[1]-$feature->[0]+1)."+".$junction_dist;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1799 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1800 elsif($location > $feature->[1]){ # before a total UTR exon? set as .*DD-DD
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1801 $junction_dist = -($location-$feature->[1]);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1802 $pos = ($post_offset+1).$junction_dist;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1803 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1804 else{ # in the UTR exon
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1805 if($location - $feature->[0] < $feature->[1] - $location){ # location is closer to exon donor site
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1806 $junction_dist = -($location - $feature->[0]+1); # +1 for HGVS syntax compatibility (there is no position 0, direct skip from -1 to +1)
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1807 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1808 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1809 $junction_dist = $feature->[1] - $location +1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1810 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1811 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1812 if(length($ref) == 1 and length($variant) == 1){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1813 my $rc = join("",map({$rc{$_}} split(//,reverse($variant))));
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1814 # 3' UTR SNP
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1815 record_snv("$target_parent\tc.*$pos",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1816 "$rc{$ref}>$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1817 #"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1818 join("\t",prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1819 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1820 elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1821 my $rc = join("",map({$rc{$_}} split(//,reverse(substr($variant,1)))));
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1822 # 3' UTR insertion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1823 record_snv("$target_parent\tc.*",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1824 hgvs_plus($pos,-1),"_*",$pos,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1825 "ins$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1826 #"ins",substr($variant,1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1827 join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1828 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1829 elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1830 my $rc = join("",map({$rc{$_}} split(//,reverse($ref))));
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1831 my $delBases = substr($rc,0,length($rc)-1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1832 if(length($ref) == 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1833 # 3' UTR single base deletion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1834 record_snv("$target_parent\tc.*",hgvs_plus($pos,-1),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1835 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1836 join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1837 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1838 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1839 # longer 3' UTR deletion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1840 record_snv("$target_parent\tc.*",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1841 hgvs_plus($pos,-length($ref)+1),"_*",hgvs_plus($pos, -1),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1842 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1843 join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1844 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1845 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1846 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1847 my $rc = join("",map({$rc{$_}} split(//,reverse($variant))));
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1848 if($rc eq $ref and length($variant) > 3){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1849 # 3' UTR inversion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1850 record_snv("$target_parent\tc.*",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1851 hgvs_plus($pos,-length($ref)+1),"_*",$pos,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1852 "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1853 join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1854 last;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1855 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1856
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1857 # complex substitution in 3' UTR
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1858 # Will break if stop codon is modified
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1859 record_snv("$target_parent\tc.*",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1860 hgvs_plus($pos,-length($ref)+1),"_*", $pos,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1861 "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1862 join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1863 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1864 last;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1865 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1866 # in the feature
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1867 elsif($location >= $feature->[0] and $location <= $feature->[1]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1868 my $pos = $feature_offset+$feature->[1]-$location+1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1869 if($location > $feature_cds_max{$target_parent}){ #since there is no position 0, the pos is in UTR, subtract one
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1870 $pos = hgvs_plus($pos, -1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1871 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1872 my $first_exon_base = $feature_offset+1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1873 my $exon_edge_dist = $feature->[1]-$location+1; # equiv of HGVS +X or -X intron syntax, but for exons
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1874 $exon_edge_dist = $feature->[0]-$location-1 if abs($feature->[0]-$location-1) < $exon_edge_dist; # pick closer of donor and acceptor sites
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1875 #print STDERR "Got ", $feature->[1]-$location+1, "vs. ", $feature->[0]-$location-1, ": chose $exon_edge_dist\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1876 if(length($ref) == 1 and length($variant) == 1){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1877 # SNP
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1878 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1879 $pos,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1880 "$rc{$ref}>$rc{$variant}\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1881 join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1882 ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1883 #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1884 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1885 elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1886 my $rc = join("",map({$rc{$_}} split(//,reverse($variant))));
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1887 my $insBases = substr($rc,1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1888 # insertion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1889 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1890 hgvs_plus_exon($pos, -1, $first_exon_base),"_",$pos,"ins$insBases",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1891 "\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1892 join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1893 ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1894 #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1895 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1896 elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1897 my $rc = join("",map({$rc{$_}} split(//,reverse($ref))));
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1898 my $delBases = substr($rc,0,length($rc)-1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1899 # single nucleotide deletion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1900 if(length($ref) == 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1901 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1902 hgvs_plus_exon($pos, -1, $first_exon_base),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1903 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1904 join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1905 ($pos-1 < 1 ? "NA" : $pos-1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1906 #($pos-1 < 1 ? "NA" : $pos-1 < $first_exon_base ? "p.?" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1907 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1908 # longer deletion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1909 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1910 $exon_edge_dist = $feature->[1]-$location-length($ref)+1 if abs($feature->[1]-$location-length($ref)+1) < $exon_edge_dist;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1911 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1912 hgvs_plus_exon($pos, -length($ref)+1, $first_exon_base),"_",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1913 hgvs_plus_exon($pos, -1, $first_exon_base),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1914 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1915 join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1916 ($pos-1 < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1917 #($pos-1 < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1918 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1919 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1920 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1921 # complex substitution
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1922 $exon_edge_dist = $feature->[1]-$location-length($ref)+1 if abs($feature->[1]-$location-length($ref)+1) < $exon_edge_dist;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1923 my $rc = join("",map({$rc{$_}} split(//,reverse($variant))));
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1924 if($rc eq $variant and length($variant) > 3){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1925 # inversion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1926 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1927 hgvs_plus_exon($pos,-length($ref)+1,$first_exon_base),"_",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1928 $pos,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1929 "inv",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1930 "\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1931 join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1932 ($pos < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1933
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1934 last;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1935 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1936 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1937 hgvs_plus_exon($pos,-length($ref)+1,$first_exon_base),"_",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1938 $pos,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1939 "delins$rc",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1940 "\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1941 join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1942 ($pos < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1943 #($pos < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1944 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1945 last;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1946 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1947 # 5' of feature (on negative strand)
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1948 elsif($location > $feature->[1]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1949 if(length($ref) == 1 and length($variant) == 1){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1950 # intronic SNP
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1951 if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1952 # Closer to acceptor site
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1953 record_snv("$target_parent\tc.",$feature_offset+1,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1954 ($feature->[1]-$location),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1955 "$rc{$ref}>$rc{$variant}\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1956 #"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1957 join("\t", prop_info_key($chr,$location,$ref,$variant, $feature->[1]-$location)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1958 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1959 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1960 # Closer to donor site
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1961 record_snv("$target_parent\tc.",$feature_offset,"+",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1962 ($feature_ranges[$i+1]->[0]-$location),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1963 "$rc{$ref}>$rc{$variant}\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1964 #"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1965 join("\t", prop_info_key($chr,$location,$ref,$variant, $feature_ranges[$i+1]->[0]-$location)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1966 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1967 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1968 elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1969 my $rc = join("",map({$rc{$_}} split(//,reverse(substr($variant,1)))));
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1970 if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1971 # intronic insertion near acceptor
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1972 my $pos = ($feature_offset+1).($feature->[1]-$location);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1973 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1974 hgvs_plus($pos,-1),"_",$pos,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1975 "ins",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1976 $rc,"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1977 #substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1978 join("\t", prop_info_key($chr,$location,$ref,$variant,$feature->[1]-$location-1)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1979 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1980 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1981 # intronic insertion near donor
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1982 my $pos = $feature_offset."+".($feature_ranges[$i+1]->[0]-$location);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1983 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1984 hgvs_plus($pos,-1),"_",$pos,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1985 "ins",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1986 $rc,"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1987 #substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1988 join("\t", prop_info_key($chr,$location,$ref,$variant,$feature_ranges[$i+1]->[0]-$location+1)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1989 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1990 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1991 elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1992 # intronic deletion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1993 # single nucleotide deletion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1994 my $rc = reverse($ref);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1995 $rc=~tr/ACGT/TGCA/;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1996 my $delBases = substr($rc, 0, length($rc)-1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1997 if(length($ref) == 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1998 # single intronic deletion near acceptor
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1999 if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2000 my $off = $feature->[1]-$location-1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2001 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2002 ($feature_offset+1),$off,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2003 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2004 join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off >= -2 ? "p.?" : "NA"),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2005 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2006 # single intronic deletion near donor
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2007 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2008 my $pos = $feature_offset;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2009 my $off = $feature_ranges[$i+1]->[0]-$location+1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2010 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2011 hgvs_plus_exon($pos, $off, $pos),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2012 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2013 join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off <= 2 ? "p.?" : "NA"),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2014 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2015 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2016 # longer deletion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2017 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2018 if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2019 # long intronic deletion near acceptor
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2020 my $off = $feature->[1]-$location-1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2021 my $pos = ($feature_offset+1).$off;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2022 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2023 hgvs_plus($pos,-length($ref)+2),"_",$pos,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2024 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2025 join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off >= -2 ? "p.?" : "NA"),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2026 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2027 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2028 # long intronic deletion near donor
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2029 my $off = $feature_ranges[$i+1]->[0]-$location+1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2030 my $pos = ($feature_offset)."+".$off;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2031 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2032 $pos,"_",hgvs_plus($pos,-length($ref)-1),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2033 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2034 join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off-length($ref)+1 <= 2 ? "p.?" : "NA"),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2035 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2036 last;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2037 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2038 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2039 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2040 my $rc = reverse($ref);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2041 $rc=~tr/ACGT/TGCA/;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2042 if($rc eq $variant and length($variant) > 3){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2043 # intronic inversion near acceptor site
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2044 if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2045 my $pos = ($feature_offset+1).($feature->[1]-$location);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2046 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2047 hgvs_plus($pos,-length($ref)+1),"_",$pos,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2048 "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2049 join("\t", prop_info_key($chr,$location,$ref,$variant,$feature->[1]-$location)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2050 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2051 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2052 my $pos = ($feature_offset)."+".($feature_ranges[$i+1]->[0]-$location);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2053 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2054 $pos,"_",hgvs_plus($pos, length($ref)-1),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2055 "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2056 join("\t", prop_info_key($chr,$location,$ref,$variant,$feature_ranges[$i+1]->[0]-$location)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2057 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2058 last;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2059 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2060 $rc = reverse($variant);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2061 $rc=~tr/ACGT/TGCA/;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2062 # Intronic complex substitution
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2063 if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2064 # complex intronic substitution near acceptor site
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2065 my $pos = ($feature_offset+1).($feature->[1]-$location);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2066 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2067 hgvs_plus($pos, -length($ref)+1),"_",$pos,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2068 "delins$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2069 #"delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2070 join("\t", prop_info_key($chr,$location,$ref,$variant,$feature->[1]-$location)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2071 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2072 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2073 # complex intronic substitution near donor site
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2074 my $pos = $feature_offset."+".($feature_ranges[$i+1]->[0]-$location);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2075 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2076 $pos,"_",hgvs_plus($pos, length($ref)-1),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2077 "delins$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2078 #"delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2079 join("\t", prop_info_key($chr,$location,$ref,$variant,$feature_ranges[$i+1]->[0]-$location)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2080 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2081 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2082 last;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2083 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2084 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2085 #print STDERR "Offset going from ", $feature_offset, " to ", $feature_offset+$feature->[1]-$feature->[0]+1,"\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2086 $feature_offset += $feature->[1]-$feature->[0]+1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2087 #print STDERR "Set feature offset to $feature_offset by adding ",$feature->[1],"-", $feature->[0],"+1\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2088 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2089 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2090 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2091 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2092 # forward strand
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2093
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2094 # set up utr offset for correct CDS coordinates
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2095 for(my $i = 0; $i <= $#feature_ranges; $i++){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2096 # All 5' utr exon
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2097 if($feature_ranges[$i]->[1] < $feature_cds_min{$target_parent}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2098 $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2099 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2100 # exon with the cds start
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2101 elsif($feature_ranges[$i]->[1] >= $feature_cds_min{$target_parent} and
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2102 $feature_ranges[$i]->[0] <= $feature_cds_min{$target_parent}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2103 $feature_offset -= $feature_cds_min{$target_parent} - $feature_ranges[$i]->[0];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2104 last;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2105 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2106 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2107 die "The CDS for $target_parent starts upstream ($feature_cds_max{$target_parent}) of the first exon",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2108 " (", $feature_ranges[$i]->[0], "), which is illogical. Please revise the GFF file provided.\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2109 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2110 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2111 for(my $i = 0; $i <= $#feature_ranges; $i++){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2112 my $feature = $feature_ranges[$i];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2113 # 3' of last coding position
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2114 if($location > $feature_cds_max{$target_parent}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2115 # find the exon with the stop codon
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2116 while($feature->[1] < $feature_cds_max{$target_parent}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2117 $feature = $feature_ranges[++$i];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2118 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2119 my $post_offset = $feature->[0]-$feature_cds_max{$target_parent};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2120 while($location > $feature->[1]+$flanking_bases and
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2121 $i < $#feature_ranges){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2122 $post_offset += $feature->[1]-$feature->[0]+1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2123 $feature = $feature_ranges[++$i]; # find the 3' utr exon in which the variant is located
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2124 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2125 my $pos = $location-$feature->[0]+$post_offset;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2126 #print STDERR "Positive strand: got $pos for $location, exon #$i of $#feature_ranges, post_offset is $post_offset\n" if $location-$feature->[1] > $flanking_bases;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2127 my $off;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2128 if($location > $feature->[1]){ # after a UTR containing exon? set as .*DD+DD
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2129 $off = $location-$feature->[1];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2130 $pos = ($post_offset+$feature->[1]-$feature->[0]+1)."+".$off;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2131 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2132 elsif($location < $feature->[0]){ # before a total UTR exon? set as .*DD-DD
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2133 $off = -($feature->[0]-$location);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2134 $pos = ($post_offset+1).$off;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2135 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2136 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2137 if($location-$feature->[0] < $feature->[1]-$location){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2138 $off = $location-$feature->[0]+1; # +1 since HGVS skips right from -1 to +1 at exon boundary
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2139 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2140 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2141 $off = $location-$feature->[1]-1; # will be negative
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2142 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2143 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2144 if(length($ref) == 1 and length($variant) == 1){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2145 # 3' UTR SNP
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2146 record_snv("$target_parent\tc.*$pos",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2147 "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2148 join("\t", prop_info_key($chr,$location,$ref,$variant, $off)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2149 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2150 elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2151 # 3' UTR insertion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2152 record_snv("$target_parent\tc.*",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2153 hgvs_plus($pos,1),"_*",hgvs_plus($pos,2),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2154 "ins",substr($variant,1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2155 join("\t", prop_info_key($chr,$location,$ref,$variant, $off)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2156 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2157 elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2158 my $delBases = substr($ref, 1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2159 if(length($ref) == 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2160 # 3' UTR single base deletion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2161 record_snv("$target_parent\tc.*",hgvs_plus($pos,1),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2162 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2163 join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2164 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2165 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2166 # longer 3' UTR deletion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2167 record_snv("$target_parent\tc.*",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2168 hgvs_plus($pos,1),"_*",hgvs_plus($pos,length($ref)-1),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2169 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2170 join("\t", prop_info_key($chr,$location,$ref,$variant, $off)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2171 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2172 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2173 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2174 my $rc = reverse($ref);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2175 $rc=~tr/ACGT/TGCA/;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2176 if($rc eq $variant and length($variant) > 3){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2177 # 3' UTR inversion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2178 record_snv("$target_parent\tc.*$pos",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2179 "_*",hgvs_plus($pos,length($ref)-1),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2180 "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2181 join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2182 last;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2183 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2184 # complex substitution in 3' UTR
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2185 record_snv("$target_parent\tc.*$pos",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2186 "_*",hgvs_plus($pos,length($ref)-1),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2187 "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2188 join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2189 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2190 last;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2191 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2192 # in the exon
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2193 elsif($location >= $feature->[0] and $location <= $feature->[1]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2194 my $pos = $feature_offset+$location-$feature->[0]+1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2195 my $last_exon_base = $feature_offset+$feature->[1]-$feature->[0]+1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2196 my $exon_edge_dist = $location-$feature->[0]+1; # equiv of HGVS +X or -X intron syntax, but for exons
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2197 $exon_edge_dist = $location-$feature->[1]-1 if abs($location-$feature->[1]-1) < $exon_edge_dist; # pick closer of donor and acceptor sites
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2198 #print STDERR "Got ", $location-$feature->[0]+1, "vs. ", $location-$feature->[1]-1, ": chose $exon_edge_dist\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2199 if($location < $feature_cds_min{$target_parent}){ #since there is no position 0, the pos is in UTR, subtract one
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2200 $pos = hgvs_plus($pos, -1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2201 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2202 if(length($ref) == 1 and length($variant) == 1){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2203 # SNP
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2204 record_snv("$target_parent\tc.$pos",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2205 "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2206 join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2207 ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2208 #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2209 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2210 elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2211 # insertion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2212 record_snv("$target_parent\tc.$pos",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2213 "_",hgvs_plus_exon($pos,1,$last_exon_base),"ins",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2214 substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2215 join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2216 ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2217 #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2218 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2219 elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2220 my $delBases = substr($ref, 1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2221 # single nucleotide deletion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2222 if(length($delBases) == 1){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2223 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2224 hgvs_plus_exon($pos,1,$last_exon_base),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2225 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2226 join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2227 ($pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2228 #($pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2229 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2230 # longer deletion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2231 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2232 $exon_edge_dist = $feature->[1]-$location-length($ref)-1 if abs($feature->[1]-$location-length($ref)-1) < $exon_edge_dist;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2233 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2234 hgvs_plus_exon($pos,1,$last_exon_base),"_",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2235 hgvs_plus_exon($pos,length($ref)-1,$last_exon_base),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2236 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2237 join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2238 ($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2239 #($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2240 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2241 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2242 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2243 $exon_edge_dist = $feature->[1]-$location-length($ref)-1 if abs($feature->[1]-$location-length($ref)-1) < $exon_edge_dist;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2244 my $rc = reverse($ref);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2245 $rc=~tr/ACGT/TGCA/;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2246 if($rc eq $variant and length($variant) > 3){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2247 # inversion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2248 record_snv("$target_parent\tc.$pos",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2249 "_",hgvs_plus_exon($pos,length($ref)-1, $last_exon_base),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2250 "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2251 join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2252 ($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2253 #($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2254 last;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2255 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2256 # complex substitution
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2257 record_snv("$target_parent\tc.$pos",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2258 "_",hgvs_plus_exon($pos, length($ref)-1, $last_exon_base),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2259 "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2260 join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2261 ($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2262 #($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2263 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2264 last;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2265 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2266 # 5' of feature
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2267 elsif($location < $feature->[0]){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2268 if(length($ref) == 1 and length($variant) == 1){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2269 # intronic SNP
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2270 if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2271 # Closer to donor site
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2272 record_snv("$target_parent\tc.",$feature_offset,"+",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2273 ($location-$feature_ranges[$i-1]->[1]),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2274 "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2275 join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2276 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2277 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2278 # Closer to acceptor site
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2279 record_snv("$target_parent\tc.",$feature_offset+1,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2280 ($location-$feature->[0]),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2281 "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2282 join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2283 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2284 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2285 elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2286 if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2287 # intronic insertion near donor
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2288 my $pos = $feature_offset."+".($location-$feature_ranges[$i-1]->[1]);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2289 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2290 $pos,"_",hgvs_plus($pos,1),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2291 "ins",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2292 substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2293 join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2294 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2295 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2296 # intronic insertion near acceptor
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2297 my $pos = ($feature_offset+1).($location-$feature->[0]);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2298 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2299 $pos,"_",hgvs_plus($pos,1),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2300 "ins",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2301 substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2302 join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2303 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2304 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2305 elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2306 # intronic deletion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2307 # single nucleotide deletion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2308 my $delBases = substr($ref, 1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2309 if(length($ref) == 2){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2310 # single intronic deletion near donor
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2311 if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2312 my $off = $location-$feature_ranges[$i-1]->[1]+1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2313 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2314 $feature_offset,"+",$off,
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2315 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2316 join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off <= 2 ? "p.?" : "NA"),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2317 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2318 # single intronic deletion near acceptor
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2319 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2320 my $pos = ($feature_offset+1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2321 my $off = $location-$feature->[0];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2322 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2323 hgvs_plus_exon($pos, $off, $pos),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2324 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2325 join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off >= -2 ? "p.?" : "NA"),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2326 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2327 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2328 # longer deletion
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2329 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2330 if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2331 # long intronic deletion near donor
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2332 my $off = $location-$feature_ranges[$i-1]->[1]+1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2333 my $pos = $feature_offset."+".$off;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2334 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2335 $pos,"_",hgvs_plus($pos,length($ref)-2),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2336 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2337 join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off <= 2 ? "p.?" : "NA"),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2338 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2339 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2340 # long intronic deletion near acceptor
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2341 my $off = $location-$feature->[0]+1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2342 my $pos = ($feature_offset+1).$off;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2343 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2344 $pos,"_",hgvs_plus($pos,length($ref)-2),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2345 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2346 join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off+length($ref)-2 >= -2 ? "p.?" : "NA"),"\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2347 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2348 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2349 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2350 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2351 my $rc = reverse($ref);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2352 $rc=~tr/ACGT/TGCA/;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2353 if($rc eq $variant and length($variant) > 3){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2354 # intronic inversion near donor site
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2355 if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2356 my $pos = $feature_offset."+".($location-$feature_ranges[$i-1]->[1]);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2357 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2358 $pos,"_",hgvs_plus($pos,length($ref)-1),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2359 "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2360 join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2361 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2362 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2363 my $pos = ($feature_offset+1).($location-$feature->[0]);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2364 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2365 $pos,"_",hgvs_plus($pos, length($ref)-1),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2366 "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2367 join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2368 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2369 last;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2370 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2371 # Intronic complex substitution
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2372 # Note: sub maybe have comma in it to denote two possible variants
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2373 if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2374 # complex intronic substitution near donor site
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2375 my $pos = $feature_offset."+".($location-$feature_ranges[$i-1]->[1]);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2376 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2377 $pos,"_",hgvs_plus($pos, length($ref)-1),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2378 "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2379 join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2380 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2381 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2382 # complex intronic substitution near acceptor site
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2383 my $pos = ($feature_offset+1).($location-$feature->[0]);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2384 record_snv("$target_parent\tc.",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2385 $pos,"_",hgvs_plus($pos, length($ref)-1),
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2386 "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2387 join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2388 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2389 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2390 last;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2391 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2392 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2393 # feature is past this exon
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2394 $feature_offset += $feature->[1]-$feature->[0]+1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2395 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2396 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2397 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2398 } # for each variant on the line
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2399 } # for each gene overlapping the site the VCF describes
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2400 } # for each VCF line
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2401 print STDERR "\n" unless $quiet;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2402 close(VCFIN);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2403
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2404 # Before we can start printing the variants, we need to look at the phase information and calculate the real haplotype HGVS changes
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2405 #if(keys %chr2phase){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2406 # Note that we could have samtools read-backed haplotype info, MNPs in the VCF, and pre-existing haplotypes in the input VCF (e.g. imputed or based on Mendelian inheritance where trios exist)
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2407 # We need to create new disjoint sets of phased blocks from the (consistent) union of these data.
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2408 # my $chr2phase2variants = combine_phase_data(\%chr2phase);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2409
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2410 # TODO: Calculate protein HGVS syntax for each variant, now that all phase data has been incorporated
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2411 #for my $chr (keys %$chr2phase2variants){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2412 # for my $phase (keys %{$chr2phase2variants{$chr}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2413 # # apply all phased changes to the reference chromosomal seq
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2414 # my $phased_seq = $seq{$chr}; #reference
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2415 # # sort the variants from 3' to 5' so that edits after indels don't need adjustment in their ref coordinate
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2416 # my @sorted_variants = sort {my($a_pos) = $a =~ /:(\d+):/; my($b_pos) = $b =~ /:(\d+):/; $b_pos <=> $a_pos} @{$chr2phase2variants{$chr}->{$phase}};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2417 # for my $variant (@sorted_variants){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2418 # }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2419 # }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2420 #}
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2421 #}
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2422
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2423 # retrieve the MAF info en masse for each chromosome, as this is much more efficient
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2424 my @out_lines;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2425 for my $snv (@snvs){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2426 chomp $snv;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2427 my @fields = split /\t/, $snv;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2428 # For CNVs, all the fields are already filled out
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2429 if(@fields > 13){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2430 push @out_lines, join("\t", $feature_type{$fields[0]}, ($fields[0] =~ /\S/ ? $feature_length{$fields[0]} : "NA"), @fields);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2431 next;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2432 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2433 my $variant_key = $fields[9];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2434 $fields[9] = join("\t", prop_info($dbsnp,$internal_snp,$variant_key));
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2435 my $from = $fields[4];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2436 my $chr_pos_key = $fields[3].":".$from;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2437 my $to = $fields[4]; # true for SNPs and insertions
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2438 my @variant_key = split /:/, $variant_key;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2439 # For deletions and complex variants, calculate the affected reference genome range and set the 'to'
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2440 if(length($variant_key[2]) > 1){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2441 $to += length($variant_key[2])-1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2442 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2443 splice(@fields, 5, 0, $to);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2444
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2445 # Otherwise expand the key into the relevant MAF values
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2446 $fields[0] =~ s/\/chr.*$//; # some transcript ids are repeated... we expect "id/chr#" in the GTF file to distinguish these, but should get rid of them at reporting time
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2447 # the offset from the nearest exon border if coding
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2448 push @fields, ($#variant_key > 3 ? $variant_key[4] : "");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2449 # add gene name(s)
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2450 push @fields, range2genes($fields[3], $from, $to+1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2451 # add caveats
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2452 my $c = $fields[3];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2453 if(not exists $chr2mappability{$c}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2454 if($c =~ s/^chr//){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2455 # nothing more
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2456 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2457 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2458 $c = "chr$c";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2459 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2460 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2461 my $mappability_caveats = exists $chr2mappability{$c} ? $chr2mappability{$c}->fetch($fields[4], $fields[4]+1) : [];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2462 if(ref $mappability_caveats eq "ARRAY" and @$mappability_caveats){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2463 my %h;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2464 if(exists $chr2caveats{$chr_pos_key}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2465 if($chr2caveats{$chr_pos_key} !~ /non-unique/){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2466 $chr2caveats{$chr_pos_key} = join("; ", grep {not $h{$_}++} @$mappability_caveats)."; ".$chr2caveats{$chr_pos_key};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2467 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2468 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2469 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2470 $chr2caveats{$chr_pos_key} = join("; ", grep {not $h{$_}++} @$mappability_caveats);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2471 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2472 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2473 push @fields, (exists $chr2caveats{$chr_pos_key} ? $chr2caveats{$chr_pos_key} : "");
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2474 # add phase
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2475 push @fields, find_phase($variant_key);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2476 push @out_lines, join("\t", $feature_type{$fields[0]}, $feature_length{$fields[0]}, @fields);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2477 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2478
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2479 # Now tabulate the rare variant numbers
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2480 my %gene2rares;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2481 my %gene2aa_rares;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2482 for my $line (@out_lines){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2483 my @F = split /\t/, $line, -1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2484 if($F[15] eq "NA" or $F[15] < $rare_variant_prop and (!$internal_snp or $F[17] < $rare_variant_prop)){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2485 my $gene_list = $internal_snp ? $F[20] : $F[19];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2486 next unless defined $gene_list;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2487 for my $g (split /; /, $gene_list){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2488 $gene2rares{$g}++;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2489 # Check the cDNA HGVS syntax for relevance
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2490 if($F[3] =~ /c.\d+/ or # coding
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2491 $F[3] =~ /c.\d+.*-[12]/ or # splicing acceptor
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2492 $F[3] =~ /c.\d+\+[12345]/){ # splicing donor
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2493 $gene2aa_rares{$g}++;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2494 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2495 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2496 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2497 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2498
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2499 # Print the results
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2500 for my $line (@out_lines){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2501 my @F = split /\t/, $line, -1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2502 my $gene_list = $internal_snp ? $F[20] : $F[19];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2503 if(not defined $gene_list){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2504 print OUT join("\t", @F, "", ""), "\n"; next;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2505 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2506
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2507 my $max_gene_rare = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2508 my $max_gene_aa_rare = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2509 for my $g (split /; /, $gene_list){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2510 next unless exists $gene2rares{$g};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2511 if($gene2rares{$g} > $max_gene_rare){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2512 $max_gene_rare = $gene2rares{$g};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2513 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2514 next unless exists $gene2aa_rares{$g};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2515 if($gene2aa_rares{$g} > $max_gene_aa_rare){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2516 $max_gene_aa_rare = $gene2aa_rares{$g};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2517 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2518 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2519 print OUT join("\t", @F, $max_gene_rare, $max_gene_aa_rare), "\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2520 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2521 close(OUT);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2522
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2523 sub range2genes{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2524 my ($c, $from, $to) = @_;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2525 if(not exists $gene_ids{$c}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2526 if($c =~ s/^chr//){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2527 # nothing more
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2528 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2529 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2530 $c = "chr$c";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2531 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2532 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2533 if(exists $gene_ids{$c}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2534 my %have;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2535 return join("; ", grep {not $have{$_}++} @{$gene_ids{$c}->fetch($from, $to+1)});
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2536 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2537 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2538 return "";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2539 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2540 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2541 sub combine_phase_data{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2542 my ($phases) = @_; # map from variant to its phase data
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2543
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2544 # Create a reverse mapping from phase regions to their variants
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2545 my %chr2phase_region2variants;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2546 my @variants = keys %$phases;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2547 for my $variant (@variants){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2548 my ($chr) = $variant =~ /^\S+?-(\S+):/;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2549 $chr2phase_region2variants{$chr} = {} if not exists $chr2phase_region2variants{$chr};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2550 for my $phase_region (split /,/, $phases->{$variant}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2551 $chr2phase_region2variants{$chr}->{$phase_region} = [] if not exists $chr2phase_region2variants{$chr}->{$phase_region};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2552 push @{$chr2phase_region2variants{$phase_region}}, $variant;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2553 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2554 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2555
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2556 # Now for each phased block known so far, see if any variant in it is also part of another block
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2557 # If so, do a union since phasing is both transitive and commutative.
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2558 # The quickest way to do this is to check for overlapping intervals, then check for common members amongst those that do overlap
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2559 for my $chr (keys %chr2phase_region2variants){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2560 my @ordered_phase_regions = sort {my($a_pos) = $a =~ /:(\d+)/; my($b_pos) = $b =~ /:(\d+)/; $a_pos <=> $b_pos} keys %{$chr2phase_region2variants{$chr}};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2561 my $sets = new DisjointSets(scalar(@ordered_phase_regions));
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2562
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2563 for (my $i = 0; $i < $#ordered_phase_regions; $i++){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2564 my ($start, $stop, $variant) = $ordered_phase_regions[$i];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2565 for (my $j = $i+1; $j <= $#ordered_phase_regions; $j++){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2566 my ($start2, $stop2, $variant2) = $ordered_phase_regions[$j];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2567 if($start2 > $stop){ # won't overlap any regions after this in the sorted list
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2568 last;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2569 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2570 # If we get here, it is implicit that $stop >= $start2 and $start < $stop2, i.e. there is overlap
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2571 # Now check if there is a shared variant (otherwise we might erroneously join blocks from different physical chromosomal arms)
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2572 my $have_shared_variant = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2573 my $overlapping_phase_region = $ordered_phase_regions[$j];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2574 for my $variant (@{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$i]}}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2575 if($phases->{$variant} =~ /\b$overlapping_phase_region\b/){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2576 $have_shared_variant = 1; last;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2577 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2578 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2579 # sanity check that there aren't conflicting variants in the new block (i.e. two different variants in the same position)
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2580 my %pos2base;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2581 my $have_conflicting_variant = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2582 for my $variant (@{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$i]}}, @{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$j]}}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2583 my ($pos, $base) = $variant =~ /(\d+):(.+?)$/;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2584 if(exists $pos2base{$pos} and $pos2base{$pos} ne $base){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2585 # conflict, note with a caveat
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2586 if(exists $chr2caveats{"$chr:$pos"}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2587 $chr2caveats{"$chr:$pos"} .= "; inconsistent haplotype phasing" unless $chr2caveats{"$chr:$pos"} =~ /inconsistent haplotype phasing/;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2588 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2589 else{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2590 $chr2caveats{"$chr:$pos"} = "inconsistent haplotype phasing";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2591 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2592 $have_conflicting_variant ||= 1;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2593 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2594 elsif(not exists $pos2base{$pos}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2595 $pos2base{$pos} = $base;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2596 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2597 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2598
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2599 $sets->union($i+1, $j+1) if $have_shared_variant and not $have_conflicting_variant; # indexes are one-based for sets rather than 0-based
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2600 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2601 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2602 my $phase_sets = $sets->sets; #disjoint haplotype sets
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2603 my %region_counts;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2604 for my $phase_set (@$phase_sets){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2605 next if scalar(@$phase_set) == 1; # No change to existing phase region (is disjoint from all others)
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2606 # Generate a new haploblock to replace the old ones that are being merged
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2607 my $merged_start = 10000000000;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2608 my $merged_end = 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2609 for my $ordered_phase_region_index (@$phase_set){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2610 my ($start, $end) = $ordered_phase_regions[$ordered_phase_region_index-1] =~ /(\d+)-(\d+)$/;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2611 $merged_start = $start if $start < $merged_start;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2612 $merged_end = $end if $end > $merged_end;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2613 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2614 # At the start of the region is a unique prefix so we can tell the arms apart if two haploblocks have the exact same boundary
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2615 my $region_count = $region_counts{"$merged_start-$merged_end"}++;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2616 my $merged_haploblock_name = "Y$region_count-$chr:$merged_start-$merged_end";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2617 # Assign this new name to overwrite the premerge values for each variant contained within
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2618 for my $ordered_phase_region_index (@$phase_set){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2619 for my $variant (@{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$ordered_phase_region_index-1]}}){ # incl. one-based set correction in 0-based array index
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2620 print STDERR "Merging $variant from ", $phases->{$variant}, " into new block $merged_haploblock_name\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2621 $phases->{$variant} = $merged_haploblock_name;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2622 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2623 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2624 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2625 # TODO: if there are overlapping phase blocks still, but with different variants in the same position, we can infer that they are on the opposite strands...
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2626 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2627 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2628
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2629 # Sees if the positions of the variant are in the range of a phased haplotype, returning which allele it belongs to
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2630 sub find_phase{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2631 my ($chr,$pos,$ref,$variant) = split /:/, $_[0];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2632 return "" if length($ref) != length($variant); # Can only deal with SNPs (and broken down MNPs) for now
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2633 for(my $i = 0; $i < length($ref); $i++){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2634 my $key = "$chr:".($pos+$i).":".substr($variant, $i, 1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2635 #print STDERR "Checking phase for $key\n" if $pos == 12907379;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2636 if(exists $chr2phase{$key}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2637 #print STDERR "returning phase data $chr2phase{$key}\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2638 return $chr2phase{$key};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2639 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2640 elsif(exists $chr2phase{"chr".$key}){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2641 #print STDERR "returning phase data ", $chr2phase{"chr".$key}, "\n";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2642 return $chr2phase{"chr".$key};
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2643 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2644 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2645 return "";
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2646 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2647
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2648 sub find_earliest_index{
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2649 # employs a binary search to find the smallest index that must be the starting point of a search of [start,end] elements sorted in an array by start
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2650 my ($query, $array) = @_;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2651
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2652 return 0 if $query < $array->[0]->[0];
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2653
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2654 my ($left, $right, $prevCenter) = (0, $#$array, -1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2655
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2656 while(1){
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2657 my $center = int (($left + $right)/2);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2658
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2659 my $cmp = $query <=> $array->[$center]->[0] || ($center == 0 || $query != $array->[$center-1]->[0] ? 0 : -1);
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2660
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2661 return $center if $cmp == 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2662 if ($center == $prevCenter) {
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2663 return $left;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2664 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2665 $right = $center if $cmp < 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2666 $left = $center if $cmp > 0;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2667 $prevCenter = $center;
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2668 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2669 }
7cdd13ff182a initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2670