| 
0
 | 
     1 #!/usr/bin/env perl
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 BEGIN{
 | 
| 
 | 
     4   my $prog_dir = `dirname $0`;
 | 
| 
 | 
     5   chomp $prog_dir;
 | 
| 
 | 
     6   push @INC, $prog_dir; # so DisjointSets.pm can be found no matter the working directory
 | 
| 
 | 
     7 }
 | 
| 
 | 
     8 
 | 
| 
 | 
     9 use DisjointSets; # homebrew module
 | 
| 
 | 
    10 use Bio::DB::Sam; # for FastA reference pulls
 | 
| 
 | 
    11 use Bio::SeqUtils;
 | 
| 
 | 
    12 use Bio::Tools::CodonTable;
 | 
| 
 | 
    13 use Statistics::Zed;
 | 
| 
 | 
    14 use Getopt::Long;
 | 
| 
 | 
    15 use Set::IntervalTree;
 | 
| 
 | 
    16 use strict;
 | 
| 
 | 
    17 use warnings;
 | 
| 
 | 
    18 use vars qw($min_prop $zed $codonTable $default_transl_table %transl_except %internal_prop %dbsnp_info %chr2variant_locs %chr2dbsnp_vcf_lines %chr2internal_vcf_lines %chr2caveats %chr2phase @snvs $fasta_index $max_args $quiet);
 | 
| 
 | 
    19 
 | 
| 
 | 
    20 if(@ARGV == 1 and $ARGV[0] eq "-v"){
 | 
| 
 | 
    21   print "Version 1.0\n";
 | 
| 
 | 
    22   exit;
 | 
| 
 | 
    23 }
 | 
| 
 | 
    24 
 | 
| 
 | 
    25 #$max_args = `getconf ARG_MAX`; # largest number of args you can send to a system command (enviroment included, see limits.h)
 | 
| 
 | 
    26 #chomp $max_args; 
 | 
| 
 | 
    27 $max_args = 4096; # if not defined $max_args or $max_args < 1; # the minimum since System V 
 | 
| 
 | 
    28 $max_args -= 50;
 | 
| 
 | 
    29 
 | 
| 
 | 
    30 # find out if a variant appears in the user provided data 
 | 
| 
 | 
    31 sub internal_prop($$$$){
 | 
| 
 | 
    32     my ($chr,$pos,$ref,$variant) = @_;
 | 
| 
 | 
    33     
 | 
| 
 | 
    34     my $key = "$chr:$pos:$ref:$variant";
 | 
| 
 | 
    35     if(exists $internal_prop{$key}){
 | 
| 
 | 
    36         return $internal_prop{$key};
 | 
| 
 | 
    37     }
 | 
| 
 | 
    38 
 | 
| 
 | 
    39     #print STDERR "Checking if internal_prop for $key exists: ";
 | 
| 
 | 
    40     if(exists $chr2internal_vcf_lines{$chr}->{$pos}){
 | 
| 
 | 
    41       for(@{$chr2internal_vcf_lines{$chr}->{$pos}}){
 | 
| 
 | 
    42         my @fields = split /\t/, $_;
 | 
| 
 | 
    43         if($pos == $fields[1] and length($fields[3]) == length($ref) and $fields[4] eq $variant){
 | 
| 
 | 
    44           #print STDERR "yes\n";
 | 
| 
 | 
    45           if(/MAF=(\d\.\d+)/){
 | 
| 
 | 
    46             $internal_prop{$key} = $1; # change from percent to proportion
 | 
| 
 | 
    47             return $1;
 | 
| 
 | 
    48           }
 | 
| 
 | 
    49         }
 | 
| 
 | 
    50       }
 | 
| 
 | 
    51     }
 | 
| 
 | 
    52     else{
 | 
| 
 | 
    53       #print STDERR "no\n";
 | 
| 
 | 
    54     }
 | 
| 
 | 
    55 
 | 
| 
 | 
    56     $internal_prop{$key} = "NA";
 | 
| 
 | 
    57     return "NA";
 | 
| 
 | 
    58 }
 | 
| 
 | 
    59 
 | 
| 
 | 
    60 # find out if a variant appears in the NCBI's dbSNP 
 | 
| 
 | 
    61 sub dbsnp_info($$$$){
 | 
| 
 | 
    62     my ($chr,$pos,$ref,$variant) = @_;
 | 
| 
 | 
    63 
 | 
| 
 | 
    64     my $key = "$chr:$pos:$ref:$variant";
 | 
| 
 | 
    65     if(exists $dbsnp_info{$key}){
 | 
| 
 | 
    66         return @{$dbsnp_info{$key}};
 | 
| 
 | 
    67     }
 | 
| 
 | 
    68 
 | 
| 
 | 
    69     if(exists $chr2dbsnp_vcf_lines{$chr}->{$pos}){
 | 
| 
 | 
    70       #print STDERR "Checking existing SNP data for $chr:$pos -> ", join("\n", @{$chr2dbsnp_vcf_lines{$chr}->{$pos}}), "\n";
 | 
| 
 | 
    71       for(@{$chr2dbsnp_vcf_lines{$chr}->{$pos}}){
 | 
| 
 | 
    72 	my @fields = split /\t/, $_;
 | 
| 
 | 
    73 	for my $var (split /,/, $fields[4]){
 | 
| 
 | 
    74             # Allows for different reference seqs between dbSNP and input, assuming patches only
 | 
| 
 | 
    75 	    if(length($fields[3]) == length($ref) and ($var eq $variant or $ref eq $var and $variant eq $fields[3])){
 | 
| 
 | 
    76 		my ($freq, $subpop) = ("","");
 | 
| 
 | 
    77 		$freq = $1 if $fields[7] =~ /(?:\A|;)MMAF=(0\.\d+)(?:;|\Z)/;
 | 
| 
 | 
    78 		$subpop = $1 if $fields[7] =~ /(?:\A|;)MMAF_SRC=(\S+?)(?:;|\Z)/;
 | 
| 
 | 
    79                 $dbsnp_info{$key} = [$subpop, $freq || "NA", $fields[2]];
 | 
| 
 | 
    80 		return @{$dbsnp_info{$key}};
 | 
| 
 | 
    81 	    }
 | 
| 
 | 
    82 	}
 | 
| 
 | 
    83       }
 | 
| 
 | 
    84     }
 | 
| 
 | 
    85     $dbsnp_info{$key} = ["novel", "NA", "NA"];
 | 
| 
 | 
    86     return @{$dbsnp_info{$key}};
 | 
| 
 | 
    87 }
 | 
| 
 | 
    88 
 | 
| 
 | 
    89 sub record_snv{
 | 
| 
 | 
    90   my $line = join("", @_);
 | 
| 
 | 
    91   push @snvs, $line;
 | 
| 
 | 
    92 
 | 
| 
 | 
    93   my @fields = split /\t/, $line;
 | 
| 
 | 
    94   my $prop_info_key = $fields[9];
 | 
| 
 | 
    95   my ($chr,$pos,$ref,$variant) = split /:/, $prop_info_key;
 | 
| 
 | 
    96   $chr2variant_locs{$chr} = {} unless exists $chr2variant_locs{$chr};
 | 
| 
 | 
    97   return unless $ref; # ref not defined for CNVs
 | 
| 
 | 
    98   # Need to grab whole range for MNPs
 | 
| 
 | 
    99   for(my $i = 0; $i < length($ref); $i++){
 | 
| 
 | 
   100     $chr2variant_locs{$chr}->{$pos+$i} = 1;
 | 
| 
 | 
   101   }
 | 
| 
 | 
   102 }
 | 
| 
 | 
   103 
 | 
| 
 | 
   104 sub retrieve_vcf_lines($$$){
 | 
| 
 | 
   105   my ($dbsnp_file, $internal_snp_file, $chr) = @_;
 | 
| 
 | 
   106 
 | 
| 
 | 
   107   my (%dbsnp_lines, %internal_snp_lines);
 | 
| 
 | 
   108 
 | 
| 
 | 
   109   if(not defined $dbsnp_file or not exists $chr2variant_locs{$chr}){
 | 
| 
 | 
   110     return ({}, {}, {}, {}); # no data requested for this chromosome
 | 
| 
 | 
   111   }
 | 
| 
 | 
   112 
 | 
| 
 | 
   113   # build up the request
 | 
| 
 | 
   114   my @tabix_regions;
 | 
| 
 | 
   115   my @var_locs = keys %{$chr2variant_locs{$chr}};
 | 
| 
 | 
   116   # sort by variant start location
 | 
| 
 | 
   117   for my $var_loc (sort {$a <=> $b} @var_locs){
 | 
| 
 | 
   118     push @tabix_regions, $chr.":".$var_loc."-".$var_loc;
 | 
| 
 | 
   119   }
 | 
| 
 | 
   120   for(my $i = 0; $i <= $#tabix_regions; $i += $max_args){ # chunkify tabix request if too many for the system to handle
 | 
| 
 | 
   121     my $end = $i + $max_args > $#tabix_regions ? $#tabix_regions : $i + $max_args;
 | 
| 
 | 
   122     my $regions = "'".join("' '", @tabix_regions[$i..$end])."'";
 | 
| 
 | 
   123     # From file is very slow for some reason
 | 
| 
 | 
   124     #my $regions_file =  "/tmp/vcf2hgvs_$$.bed";
 | 
| 
 | 
   125     #open(REQ_BED, ">$regions_file")
 | 
| 
 | 
   126     #  or die "Cannot open $regions_file for writing: $!\n";
 | 
| 
 | 
   127     #print REQ_BED join("\n", @tabix_regions), "\n";
 | 
| 
 | 
   128     #close(REQ_BED);
 | 
| 
 | 
   129 
 | 
| 
 | 
   130     # retrieve the data
 | 
| 
 | 
   131     die "Cannot find dbSNP VCF file $dbsnp_file\n" if not -e $dbsnp_file;
 | 
| 
 | 
   132   
 | 
| 
 | 
   133     open(VCF, "tabix $dbsnp_file $regions |")
 | 
| 
 | 
   134       or die "Cannot run tabix on $dbsnp_file (args ".substr($regions, 0, length($regions)>100? 100 : length($regions))."): $!\n";
 | 
| 
 | 
   135     while(<VCF>){
 | 
| 
 | 
   136       #if(/^(\S+\t(\d+)(?:\t\S+){6})/ and grep {$_ eq $2} @var_locs){ # take only main columns to save room, if possible
 | 
| 
 | 
   137       if(/^(\S+\t(\d+)(?:\t\S+){6})/ and exists $chr2variant_locs{$chr}->{$2}){ # take only main columns to save room, if possible
 | 
| 
 | 
   138         $dbsnp_lines{$2} = [] unless exists $dbsnp_lines{$2};
 | 
| 
 | 
   139         push @{$dbsnp_lines{$2}}, $1;
 | 
| 
 | 
   140       }
 | 
| 
 | 
   141     }
 | 
| 
 | 
   142     close(VCF);
 | 
| 
 | 
   143 
 | 
| 
 | 
   144     if($internal_snp_file){
 | 
| 
 | 
   145       die "Cannot find internal VCF file $internal_snp_file\n" if not -e $internal_snp_file;
 | 
| 
 | 
   146       open(VCF, "tabix $internal_snp_file $regions |")
 | 
| 
 | 
   147         or die "Cannot run tabix on $internal_snp_file: $!\n";
 | 
| 
 | 
   148       while(<VCF>){
 | 
| 
 | 
   149         #if(/^(\S+\t(\d+)(?:\t\S+){6})/ and grep {$_ eq $2} @var_locs){ # take only main columns to save room, if possible
 | 
| 
 | 
   150         if(/^(\S+\t(\d+)(?:\t\S+){5})/ and exists $chr2variant_locs{$chr}->{$2}){ # take only main columns to save room, if possible
 | 
| 
 | 
   151           $internal_snp_lines{$2} = [] unless exists $internal_snp_lines{$2};
 | 
| 
 | 
   152           push @{$internal_snp_lines{$2}}, $1;
 | 
| 
 | 
   153         }
 | 
| 
 | 
   154       }
 | 
| 
 | 
   155       close(VCF);
 | 
| 
 | 
   156     }
 | 
| 
 | 
   157   }
 | 
| 
 | 
   158 
 | 
| 
 | 
   159   #unlink $regions_file;
 | 
| 
 | 
   160 
 | 
| 
 | 
   161   return (\%dbsnp_lines, \%internal_snp_lines);
 | 
| 
 | 
   162 }
 | 
| 
 | 
   163 
 | 
| 
 | 
   164 sub prop_info_key{
 | 
| 
 | 
   165     my($chr,$pos,$ref,$variant,$exon_edge_dist) = @_;
 | 
| 
 | 
   166 
 | 
| 
 | 
   167     $chr =~ s/^chr//;
 | 
| 
 | 
   168     if($chr eq "M"){
 | 
| 
 | 
   169       $chr = "MT"; # NCBI uses different name for mitochondrial chromosome
 | 
| 
 | 
   170       $pos-- if $pos >= 3107;  # also, doesn't keep the old positioning (historical)
 | 
| 
 | 
   171     }
 | 
| 
 | 
   172     return join(":", $chr,$pos,$ref,$variant, ($exon_edge_dist ? $exon_edge_dist : ""));
 | 
| 
 | 
   173 }
 | 
| 
 | 
   174 
 | 
| 
 | 
   175 sub prop_info($$$){
 | 
| 
 | 
   176     my($snpfile,$internal_snps_file,$prop_info_key) = @_;
 | 
| 
 | 
   177 
 | 
| 
 | 
   178     my($chr,$pos,$ref,$variant) = split /:/, $prop_info_key;
 | 
| 
 | 
   179 
 | 
| 
 | 
   180     # is this the first call for this chromosome? If so, retrieve the VCF lines for it en masse
 | 
| 
 | 
   181     if(not exists $chr2dbsnp_vcf_lines{$chr}){
 | 
| 
 | 
   182       ($chr2dbsnp_vcf_lines{$chr}, $chr2internal_vcf_lines{$chr}) = retrieve_vcf_lines($snpfile,$internal_snps_file,$chr);
 | 
| 
 | 
   183     }
 | 
| 
 | 
   184     my $internal_maf = 0;
 | 
| 
 | 
   185     if($internal_snps_file){
 | 
| 
 | 
   186       $internal_maf = internal_prop($chr,$pos,$ref,$variant);
 | 
| 
 | 
   187       $internal_maf = 0 if $internal_maf eq "NA";
 | 
| 
 | 
   188     }
 | 
| 
 | 
   189 
 | 
| 
 | 
   190     my @results = dbsnp_info($chr,$pos,$ref,$variant);
 | 
| 
 | 
   191 
 | 
| 
 | 
   192     # Not all entries have a proportion in dbSNP
 | 
| 
 | 
   193     return $internal_snps_file ? ($ref, $variant, @results, $internal_maf) : ($ref, $variant, @results);
 | 
| 
 | 
   194 }
 | 
| 
 | 
   195 
 | 
| 
 | 
   196 #offset a given HGVS nomenclature position (single position only) by a given number of bases
 | 
| 
 | 
   197 sub hgvs_plus($$){
 | 
| 
 | 
   198    my ($hgvs, $offset) = @_;
 | 
| 
 | 
   199     if($hgvs =~ /^(\S+)(-\d+)(.*)/){
 | 
| 
 | 
   200 	# all negative
 | 
| 
 | 
   201 	if($2+$offset<0){
 | 
| 
 | 
   202 	    return $1.($2+$offset).$3;
 | 
| 
 | 
   203 	}
 | 
| 
 | 
   204 	# switches to positive, need to mod
 | 
| 
 | 
   205 	else{
 | 
| 
 | 
   206 	    return $1+($2+$offset);
 | 
| 
 | 
   207 	}
 | 
| 
 | 
   208     }
 | 
| 
 | 
   209     elsif($hgvs =~ /^(\S+)\+(\d+)(.*)/){
 | 
| 
 | 
   210 	# all positive
 | 
| 
 | 
   211 	if($2+$offset>0){
 | 
| 
 | 
   212 	    return $1."+".($2+$offset).$3;
 | 
| 
 | 
   213 	}
 | 
| 
 | 
   214 	# switches to negative, need to mod
 | 
| 
 | 
   215 	else{
 | 
| 
 | 
   216 	    return $1+($2+$offset);
 | 
| 
 | 
   217 	}
 | 
| 
 | 
   218     }
 | 
| 
 | 
   219     elsif($hgvs =~ /^(-?\d+)(.*)/){
 | 
| 
 | 
   220 	# special case if offset spans -/+ since there is no position 0
 | 
| 
 | 
   221 	if($1 < 0 and $1+$offset >= 0){
 | 
| 
 | 
   222 	    $offset++;
 | 
| 
 | 
   223 	}
 | 
| 
 | 
   224 	elsif($1 > 0 and $1+$offset <= 0){
 | 
| 
 | 
   225 	    $offset--;
 | 
| 
 | 
   226 	}
 | 
| 
 | 
   227 	return ($1+$offset).$2;
 | 
| 
 | 
   228     }
 | 
| 
 | 
   229     else{
 | 
| 
 | 
   230 	die "Cannot convert $hgvs to a new offset ($offset), only single base position nomenclature is currently supported\n";
 | 
| 
 | 
   231     }
 | 
| 
 | 
   232 }
 | 
| 
 | 
   233 
 | 
| 
 | 
   234 # offset a given position by a given number of bases, 
 | 
| 
 | 
   235 # taking into account that if the new offset crosses the threshold in the last argument, 
 | 
| 
 | 
   236 # HGVS boundary nomenclature has to be introduced
 | 
| 
 | 
   237 sub hgvs_plus_exon($$$){
 | 
| 
 | 
   238     my ($pos, $offset, $boundary) = @_;
 | 
| 
 | 
   239 
 | 
| 
 | 
   240     # special case if offset spans -/+ since there is no position 0
 | 
| 
 | 
   241     if($pos =~ /^(-?\d+)(.*)/){
 | 
| 
 | 
   242       if($1 < 0 and $1+$offset >= 0){
 | 
| 
 | 
   243          $offset++;
 | 
| 
 | 
   244       }
 | 
| 
 | 
   245       elsif($1 > 0 and $1+$offset <= 0){
 | 
| 
 | 
   246          $offset--;
 | 
| 
 | 
   247       }
 | 
| 
 | 
   248     }
 | 
| 
 | 
   249     my $new_pos = $pos + $offset;
 | 
| 
 | 
   250     if($new_pos > $boundary and $pos <= $boundary){
 | 
| 
 | 
   251 	# just moved into an intron 3'
 | 
| 
 | 
   252 	$new_pos = $boundary."+".($new_pos-$boundary);
 | 
| 
 | 
   253     }
 | 
| 
 | 
   254     elsif($new_pos < $boundary and $pos >= $boundary){
 | 
| 
 | 
   255 	# just moved into an intron 5'
 | 
| 
 | 
   256 	$new_pos = $boundary.($new_pos-$boundary);
 | 
| 
 | 
   257     }
 | 
| 
 | 
   258     return $new_pos;
 | 
| 
 | 
   259 }
 | 
| 
 | 
   260 
 | 
| 
 | 
   261 # given a nucleotide position, calculates the AA there (assumes coding region)
 | 
| 
 | 
   262 sub getCodonFromSeq($$$$){
 | 
| 
 | 
   263   my ($chr_ref, $location, $frame_offset, $strand) = @_;
 | 
| 
 | 
   264 
 | 
| 
 | 
   265   my $codon;
 | 
| 
 | 
   266   if($strand eq "+"){
 | 
| 
 | 
   267     $codon = substr($$chr_ref, $location-1-$frame_offset, 3);
 | 
| 
 | 
   268   }
 | 
| 
 | 
   269   else{
 | 
| 
 | 
   270     $codon = substr($$chr_ref, $location-3+$frame_offset, 3);
 | 
| 
 | 
   271     $codon = reverse($codon);
 | 
| 
 | 
   272     $codon =~ tr/ACGTacgt/TGCAtgca/;
 | 
| 
 | 
   273   }
 | 
| 
 | 
   274   return $codon;
 | 
| 
 | 
   275 }
 | 
| 
 | 
   276 
 | 
| 
 | 
   277 sub getCodonFromSeqIndex($$$$){
 | 
| 
 | 
   278   my ($chr, $location, $frame_offset, $strand) = @_;
 | 
| 
 | 
   279 
 | 
| 
 | 
   280   my $codon;
 | 
| 
 | 
   281   if($strand eq "+"){
 | 
| 
 | 
   282     $codon = $fasta_index->fetch($chr.":".($location-$frame_offset)."-".($location-$frame_offset+2));
 | 
| 
 | 
   283   }
 | 
| 
 | 
   284   else{
 | 
| 
 | 
   285     $codon = $fasta_index->fetch($chr.":".($location-2+$frame_offset)."-".($location+$frame_offset));
 | 
| 
 | 
   286     $codon = reverse($codon);
 | 
| 
 | 
   287     $codon =~ tr/ACGTacgt/TGCAtgca/;
 | 
| 
 | 
   288   }
 | 
| 
 | 
   289   return $codon;
 | 
| 
 | 
   290 }
 | 
| 
 | 
   291 
 | 
| 
 | 
   292 sub getAAFromSeq($$$$$){
 | 
| 
 | 
   293   return $_[4]->translate(getCodonFromSeq($_[0], $_[1], $_[2], $_[3]));
 | 
| 
 | 
   294 }
 | 
| 
 | 
   295 
 | 
| 
 | 
   296 sub getAAFromSeqIndex($$$$$){
 | 
| 
 | 
   297   # convert codon to AA
 | 
| 
 | 
   298   if(exists $transl_except{"$_[0]:$_[1]"}){
 | 
| 
 | 
   299     return $transl_except{"$_[0]:$_[1]"};
 | 
| 
 | 
   300   }
 | 
| 
 | 
   301   else{
 | 
| 
 | 
   302     return $_[4]->translate(getCodonFromSeqIndex($_[0], $_[1], $_[2], $_[3]));
 | 
| 
 | 
   303   }
 | 
| 
 | 
   304 }
 | 
| 
 | 
   305 
 | 
| 
 | 
   306 sub hgvs_protein{
 | 
| 
 | 
   307   my ($chr, $location, $ref, $variant, $cdna_pos, $strand, $transl_table) = @_;
 | 
| 
 | 
   308 
 | 
| 
 | 
   309   if(substr($ref,0,1) eq substr($variant,0,1)){
 | 
| 
 | 
   310     substr($ref,0,1) = "";
 | 
| 
 | 
   311     substr($variant,0,1) = "";
 | 
| 
 | 
   312     $location++;
 | 
| 
 | 
   313     if($strand eq "-"){
 | 
| 
 | 
   314       $cdna_pos--;
 | 
| 
 | 
   315     }
 | 
| 
 | 
   316     else{
 | 
| 
 | 
   317       $cdna_pos++;
 | 
| 
 | 
   318     }
 | 
| 
 | 
   319   }
 | 
| 
 | 
   320 
 | 
| 
 | 
   321   if($cdna_pos !~ /^\d+/){
 | 
| 
 | 
   322     die "Aborting: got illegal cDNA position ($cdna_pos) for protein HGVS conversion of position ",
 | 
| 
 | 
   323         "$location, ref $ref, variant $variant. Please correct the program code.\n";
 | 
| 
 | 
   324   }
 | 
| 
 | 
   325   # Get the correct frame for the protein translation, to know what codons are affected
 | 
| 
 | 
   326   my $aapos = int(($cdna_pos-1)/3)+1;
 | 
| 
 | 
   327 
 | 
| 
 | 
   328   # does it destroy the start codon?
 | 
| 
 | 
   329   if($cdna_pos < 4){ # assumes animal codon usage
 | 
| 
 | 
   330     return "p.0?"; # indicates start codon missing, unsure of effect
 | 
| 
 | 
   331   }
 | 
| 
 | 
   332 
 | 
| 
 | 
   333   my $table = $transl_table ne $default_transl_table ?  # non standard translation table requested
 | 
| 
 | 
   334     Bio::Tools::CodonTable->new(-id=>$transl_table) : $codonTable;
 | 
| 
 | 
   335 
 | 
| 
 | 
   336   my $frame_offset = ($cdna_pos-1)%3;
 | 
| 
 | 
   337   my $origAA = getAAFromSeqIndex($chr, $location, $frame_offset, $strand, $table);
 | 
| 
 | 
   338   # take 100000 bp on either side for translation context of variant seq
 | 
| 
 | 
   339   my $five_prime_buffer = $location < 10000 ? $location-1 : 10000;
 | 
| 
 | 
   340   my $mutSeq = $fasta_index->fetch($chr.":".($location-$five_prime_buffer)."-".($location+10000));
 | 
| 
 | 
   341 
 | 
| 
 | 
   342   # substitute all of the immediately adjacent variants in phase with this one to get the correct local effect
 | 
| 
 | 
   343   substr($mutSeq, $five_prime_buffer, length($ref)) = $variant;
 | 
| 
 | 
   344 
 | 
| 
 | 
   345   # does it cause a frameshift?
 | 
| 
 | 
   346   my $length_diff = length($variant)-length($ref);
 | 
| 
 | 
   347   if($length_diff%3){ # insertion or deletion not a multiple of three
 | 
| 
 | 
   348     my $fs_codon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1, $frame_offset, $strand);
 | 
| 
 | 
   349     my $ext = 0;
 | 
| 
 | 
   350     my $newAA;
 | 
| 
 | 
   351     do{
 | 
| 
 | 
   352        $ext++;
 | 
| 
 | 
   353        # The "NA"s below make it so that we don't pick up any translation exceptions from the original reference annotation
 | 
| 
 | 
   354        if($strand eq "+"){
 | 
| 
 | 
   355          $newAA = getAAFromSeq(\$mutSeq, $five_prime_buffer+1+$ext*3, $frame_offset, $strand, $table);
 | 
| 
 | 
   356        }
 | 
| 
 | 
   357        else{
 | 
| 
 | 
   358          $newAA = getAAFromSeq(\$mutSeq, $five_prime_buffer+1-$ext*3, $frame_offset, $strand, $table);
 | 
| 
 | 
   359        }
 | 
| 
 | 
   360      } while($newAA ne "*");
 | 
| 
 | 
   361 
 | 
| 
 | 
   362     return "p.".$origAA.$aapos.$table->translate($fs_codon)."fs*$ext";
 | 
| 
 | 
   363   }
 | 
| 
 | 
   364 
 | 
| 
 | 
   365   # does it cause a stop codon to be lost?
 | 
| 
 | 
   366   if($origAA eq "*"){
 | 
| 
 | 
   367      my $stopChangeCodon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1, $frame_offset, $strand);
 | 
| 
 | 
   368      # still a stop after the mutation (ignore translation exceptions)
 | 
| 
 | 
   369      if($table->is_ter_codon($stopChangeCodon)){
 | 
| 
 | 
   370         return "p.*$aapos=";
 | 
| 
 | 
   371      }
 | 
| 
 | 
   372      # calculate the new stop, assuming there aren't mutations downstream in candidate stop codons
 | 
| 
 | 
   373      my $ext = 0;
 | 
| 
 | 
   374      my $newCodon;
 | 
| 
 | 
   375      do{
 | 
| 
 | 
   376        if($strand eq "+"){
 | 
| 
 | 
   377          $newCodon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1+(++$ext*3), $frame_offset, $strand);
 | 
| 
 | 
   378        }
 | 
| 
 | 
   379        else{
 | 
| 
 | 
   380          $newCodon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1-(++$ext*3), $frame_offset, $strand);
 | 
| 
 | 
   381        }
 | 
| 
 | 
   382      } while(not $table->is_ter_codon($newCodon));
 | 
| 
 | 
   383 
 | 
| 
 | 
   384      return "p.*".$aapos.$table->translate($stopChangeCodon)."ext*".$ext;
 | 
| 
 | 
   385   }
 | 
| 
 | 
   386 
 | 
| 
 | 
   387   # if we get this far, it's a "regular" AA level change
 | 
| 
 | 
   388   my $origAAs = "";
 | 
| 
 | 
   389   for(my $i = 0; $i < length($ref)+$frame_offset; $i+=3){
 | 
| 
 | 
   390     my $oldAA = getAAFromSeqIndex($chr, $location+$i, $frame_offset, $strand, $table);
 | 
| 
 | 
   391     if($strand eq "+"){
 | 
| 
 | 
   392       $origAAs .= $oldAA;
 | 
| 
 | 
   393     }
 | 
| 
 | 
   394     else{
 | 
| 
 | 
   395       $origAAs = $oldAA . $origAAs;
 | 
| 
 | 
   396     }
 | 
| 
 | 
   397   }
 | 
| 
 | 
   398   my $newAAs = "";
 | 
| 
 | 
   399   for(my $i = 0; $i < length($variant)+$frame_offset; $i+=3){
 | 
| 
 | 
   400     # NA means we don't take translation exceptions from the original
 | 
| 
 | 
   401     my $newAA = getAAFromSeq(\$mutSeq, $five_prime_buffer+1+$i, $frame_offset, $strand, $table);
 | 
| 
 | 
   402     if($strand eq "+"){
 | 
| 
 | 
   403       $newAAs .= $newAA;
 | 
| 
 | 
   404     }
 | 
| 
 | 
   405     else{
 | 
| 
 | 
   406       $newAAs = $newAA . $newAAs;
 | 
| 
 | 
   407     } 
 | 
| 
 | 
   408   } 
 | 
| 
 | 
   409 
 | 
| 
 | 
   410   # silent
 | 
| 
 | 
   411   if($origAAs eq $newAAs){
 | 
| 
 | 
   412       return "p.".$origAAs.$aapos."=";
 | 
| 
 | 
   413   }
 | 
| 
 | 
   414 
 | 
| 
 | 
   415   # minimize the difference if there are leading or trailing AAs the same
 | 
| 
 | 
   416   my $delLength = length($ref);
 | 
| 
 | 
   417   while(substr($newAAs, 0, 1) eq substr($origAAs, 0, 1)){
 | 
| 
 | 
   418     $newAAs = substr($newAAs, 1);
 | 
| 
 | 
   419     $origAAs = substr($origAAs, 1);
 | 
| 
 | 
   420     $location+=3;
 | 
| 
 | 
   421     $delLength-=3;
 | 
| 
 | 
   422     $aapos++;
 | 
| 
 | 
   423   }
 | 
| 
 | 
   424   while(substr($newAAs, -1) eq substr($origAAs, -1)){
 | 
| 
 | 
   425     $newAAs = substr($newAAs, 0, length($newAAs)-1);
 | 
| 
 | 
   426     $origAAs = substr($origAAs, 0, length($origAAs)-1);
 | 
| 
 | 
   427   }
 | 
| 
 | 
   428 
 | 
| 
 | 
   429   # insertion
 | 
| 
 | 
   430   if(length($origAAs) == 0){
 | 
| 
 | 
   431     my $insAAs = getAAFromSeqIndex($chr,$location-3,$frame_offset,$strand,$table).($aapos-1)."_".
 | 
| 
 | 
   432                  getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table);
 | 
| 
 | 
   433     return "p.".$insAAs.$aapos."ins".$newAAs;
 | 
| 
 | 
   434   }
 | 
| 
 | 
   435   # deletion
 | 
| 
 | 
   436   elsif(length($newAAs) == 0){
 | 
| 
 | 
   437     my $delAAs;
 | 
| 
 | 
   438     if(length($origAAs) == 1){
 | 
| 
 | 
   439        $delAAs = getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table).$aapos; # single AA deletion
 | 
| 
 | 
   440     }
 | 
| 
 | 
   441     else{ # deleting a stretch
 | 
| 
 | 
   442        if($strand eq "+"){
 | 
| 
 | 
   443          my $endPoint = $location+$delLength-1;
 | 
| 
 | 
   444          $delAAs = getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table).$aapos."_".
 | 
| 
 | 
   445                    getAAFromSeqIndex($chr,$endPoint,$frame_offset,$strand,$table).($aapos+int(($delLength-1)/3));
 | 
| 
 | 
   446        }
 | 
| 
 | 
   447        else{
 | 
| 
 | 
   448          my $endPoint = $location-$delLength+1;
 | 
| 
 | 
   449          $delAAs = getAAFromSeqIndex($chr,$endPoint,$frame_offset,$strand,$table).($aapos-int(($delLength-1)/3))."_".
 | 
| 
 | 
   450                    getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table).$aapos;
 | 
| 
 | 
   451        }
 | 
| 
 | 
   452     }
 | 
| 
 | 
   453     return "p.".$delAAs."del";
 | 
| 
 | 
   454   }
 | 
| 
 | 
   455   else{
 | 
| 
 | 
   456     # substitution
 | 
| 
 | 
   457     if(length($origAAs) == 1 and length($newAAs) == 1){
 | 
| 
 | 
   458       return "p.".$origAAs.$aapos.$newAAs;
 | 
| 
 | 
   459     }
 | 
| 
 | 
   460     # indel
 | 
| 
 | 
   461     elsif(length($origAAs) != 1){
 | 
| 
 | 
   462        # convert ref stretch into range syntax
 | 
| 
 | 
   463        if($strand eq "+"){
 | 
| 
 | 
   464          $origAAs = substr($origAAs, 0, 1).$aapos."_".substr($origAAs, -1).($aapos+length($origAAs)-1);
 | 
| 
 | 
   465        }
 | 
| 
 | 
   466        else{
 | 
| 
 | 
   467          $origAAs = substr($origAAs, 0, 1).($aapos-length($origAAs)+1)."_".substr($origAAs, -1).$aapos;
 | 
| 
 | 
   468        }
 | 
| 
 | 
   469     }
 | 
| 
 | 
   470     return "p.".$origAAs."delins".$newAAs;
 | 
| 
 | 
   471   }
 | 
| 
 | 
   472   return ("NA", "");
 | 
| 
 | 
   473 }
 | 
| 
 | 
   474 
 | 
| 
 | 
   475 sub z2p{
 | 
| 
 | 
   476   if(not defined $zed){
 | 
| 
 | 
   477     $zed = new Statistics::Zed;
 | 
| 
 | 
   478   }
 | 
| 
 | 
   479   my $p = $zed->z2p(value => $_[0]);
 | 
| 
 | 
   480   return $p < 0.0000000001 ? 0 : $p;
 | 
| 
 | 
   481 }
 | 
| 
 | 
   482 sub gq2p{
 | 
| 
 | 
   483     return $_[0] > 200 ? 0 : 10**($_[0]/-10);
 | 
| 
 | 
   484 }
 | 
| 
 | 
   485 
 | 
| 
 | 
   486 my ($multi_phased, $min_depth, $flanking_bases, $dbsnp, $internal_snp, $genename_bed_file, $dir_1000G, $dir_esp6500, $min_pvalue, $mappability_file, $reference_file, $samtools_phasing_file, $exons_file, $input_file, $output_file, $cnv_file, $dgv_file, $which_chr, $enrichment_regions_file, $rare_variant_prop);
 | 
| 
 | 
   487 $multi_phased = 0;
 | 
| 
 | 
   488 $min_depth = 2;
 | 
| 
 | 
   489 $flanking_bases = 30;
 | 
| 
 | 
   490 $min_pvalue = 0.01;
 | 
| 
 | 
   491 $min_prop = 0.14;
 | 
| 
 | 
   492 $rare_variant_prop = 0.05;
 | 
| 
 | 
   493 $input_file = "-"; # STDIN by default
 | 
| 
 | 
   494 $output_file = "-"; # STDOUT by default
 | 
| 
 | 
   495 $default_transl_table = "1"; # assumes NCBI 'Standard' table, unless it is an argument to the script...
 | 
| 
 | 
   496 &GetOptions("d=i"       => \$min_depth,
 | 
| 
 | 
   497             "f=i"       => \$flanking_bases,
 | 
| 
 | 
   498             "s=s"       => \$dbsnp,
 | 
| 
 | 
   499             "t=s"       => \$dir_1000G,
 | 
| 
 | 
   500             "n=s"       => \$dir_esp6500,
 | 
| 
 | 
   501             "u=s"       => \$internal_snp,
 | 
| 
 | 
   502             "q"         => \$quiet,
 | 
| 
 | 
   503             "p=f"       => \$min_pvalue,
 | 
| 
 | 
   504             "h=f"       => \$min_prop,
 | 
| 
 | 
   505             "m=s"       => \$mappability_file,
 | 
| 
 | 
   506             "r=s"       => \$reference_file,
 | 
| 
 | 
   507             "z=s"       => \$samtools_phasing_file,
 | 
| 
 | 
   508             "e=s"       => \$exons_file,
 | 
| 
 | 
   509             "i=s"       => \$input_file,
 | 
| 
 | 
   510             "c=s"       => \$cnv_file,
 | 
| 
 | 
   511             "g=s"       => \$dgv_file,
 | 
| 
 | 
   512             "b=s"       => \$genename_bed_file,
 | 
| 
 | 
   513             "w=s"       => \$which_chr,
 | 
| 
 | 
   514             "o=s"       => \$output_file,
 | 
| 
 | 
   515             "a=i"       => \$default_transl_table,
 | 
| 
 | 
   516             "v=f"       => \$rare_variant_prop,
 | 
| 
 | 
   517             "x=s"       => \$enrichment_regions_file); # if enrichment regions are specified, variants without a transcript model but in these ranges will be reported
 | 
| 
 | 
   518 
 | 
| 
 | 
   519 if(($input_file ne "/dev/null" and not defined $reference_file) or
 | 
| 
 | 
   520    not defined $exons_file or 
 | 
| 
 | 
   521    (defined $cnv_file and not defined $dgv_file)){
 | 
| 
 | 
   522   die "Usage: $0 [-v(ersion)] [-q(uiet)] [-w(hich) contig_to_report (default is all)] [-d(epth of variant reads req'd) #] [-v(ariant max freq to count as rare)] [-f(lanking exon bases to report) #] [-p(robability of random genotype, maximum to report) 0.#]\n",
 | 
| 
 | 
   523       "  [-h(et proportion of variant reads, minimum to report) 0.#] [-c(opy number) variants_file.bed -g(enomic structural) variants_control_db.txt.gz] [-z file_containing_samtools_phase_output.txt]\n",
 | 
| 
 | 
   524       "  [-t(housand) genomes_integrated_vcfs_gz_dir] [-n ESP6500_dir] [-u(ser) specified_population.vcf.gz] [-m(appability) crg_file.bed]\n", 
 | 
| 
 | 
   525       "  [-x enrichment_regions_file.bed] [-a(mino) acid translation table number from NCBI]\n",
 | 
| 
 | 
   526       "  [-i(nput) genotypes.vcf <-r(eference) sequence_file.fasta>] [-o(utput) hgvs_file.tsv] [-s(np) database_from_ncbi.vcf.gz]\n",
 | 
| 
 | 
   527       "  <-b(ed) file of named gene regions.bed> <-e(xons) file.gtf>\n\n",
 | 
| 
 | 
   528       "Input gz files must be indexed with Tabix.\nDefault input is STDIN, default output is STDOUT.  Note: if -c is specified, polyploidies are are assume to be proximal. Other defaults: -d 2, -v 0.05, -f 30, -p 0.01, -h 0.14 -a 1\nReference sequence is not strictly necessary if only CNV are being annotated.\n";
 | 
| 
 | 
   529 }
 | 
| 
 | 
   530 
 | 
| 
 | 
   531 print STDERR "Considering $flanking_bases flanking bases for variants as well\n" unless $quiet;
 | 
| 
 | 
   532 
 | 
| 
 | 
   533 $codonTable = new Bio::Tools::CodonTable(id => $default_transl_table); 
 | 
| 
 | 
   534 
 | 
| 
 | 
   535 my %enrichment_regions;
 | 
| 
 | 
   536 # Note, we assume the regions are non-overlapping
 | 
| 
 | 
   537 if(defined $enrichment_regions_file){
 | 
| 
 | 
   538   print STDERR "Loading enrichment regions...\n" unless $quiet;
 | 
| 
 | 
   539   open(BED, $enrichment_regions_file)
 | 
| 
 | 
   540     or die "Cannot open $enrichment_regions_file for reading: $!\n";
 | 
| 
 | 
   541   while(<BED>){
 | 
| 
 | 
   542     chomp;
 | 
| 
 | 
   543     my @F = split /\t/, $_;
 | 
| 
 | 
   544     $enrichment_regions{$F[0]} = [] if not exists $enrichment_regions{$F[0]};
 | 
| 
 | 
   545     push @{$enrichment_regions{$F[0]}}, [$F[1], $F[2]];
 | 
| 
 | 
   546   }
 | 
| 
 | 
   547   close(BED);
 | 
| 
 | 
   548 }
 | 
| 
 | 
   549 for my $chr (keys %enrichment_regions){ # sort by start
 | 
| 
 | 
   550   $enrichment_regions{$chr} = [sort {$a->[0] <=> $b->[0]} @{$enrichment_regions{$chr}}];
 | 
| 
 | 
   551 }
 | 
| 
 | 
   552 
 | 
| 
 | 
   553 if(defined $reference_file){
 | 
| 
 | 
   554   print STDERR "Scanning reference FastA info\n" unless $quiet;
 | 
| 
 | 
   555   if(not -e $reference_file){
 | 
| 
 | 
   556     die "Reference FastA file ($reference_file) does not exist.\n";
 | 
| 
 | 
   557   }
 | 
| 
 | 
   558   if(not -e $reference_file.".fai" and not -w dirname($reference_file)){
 | 
| 
 | 
   559     die "Reference FastA file ($reference_file) is not indexed, and the directory is not writable.\n";
 | 
| 
 | 
   560   }
 | 
| 
 | 
   561   $fasta_index = Bio::DB::Sam::Fai->load($reference_file);
 | 
| 
 | 
   562 }
 | 
| 
 | 
   563 
 | 
| 
 | 
   564 my %chr2mappability;
 | 
| 
 | 
   565 if(defined $mappability_file){
 | 
| 
 | 
   566   print STDERR "Reading in mappability data\n" unless $quiet;
 | 
| 
 | 
   567   my ($nmer) = $mappability_file =~ /(\d+).*?$/;
 | 
| 
 | 
   568   die "Cannot determine nmer from nmer file name $mappability_file, aborting\n" unless $nmer;
 | 
| 
 | 
   569   open(MAP, $mappability_file)
 | 
| 
 | 
   570     or die "Cannot open mappability data file $mappability_file for reading: $!\n";
 | 
| 
 | 
   571   <MAP>; # header
 | 
| 
 | 
   572   while(<MAP>){
 | 
| 
 | 
   573     next if /^#/;
 | 
| 
 | 
   574     chomp;
 | 
| 
 | 
   575     my @F = split /\t/, $_;
 | 
| 
 | 
   576     my $x = int(1/$F[3]+0.5);
 | 
| 
 | 
   577     $chr2mappability{$F[0]} = Set::IntervalTree->new() if not exists $chr2mappability{$F[0]};
 | 
| 
 | 
   578     $chr2mappability{$F[0]}->insert("non-unique mapping region (x$x)", $F[1], $F[2]+$nmer-1);
 | 
| 
 | 
   579   }
 | 
| 
 | 
   580   close(MAP);
 | 
| 
 | 
   581 }
 | 
| 
 | 
   582 
 | 
| 
 | 
   583 # Is phasing data provided?
 | 
| 
 | 
   584 if(defined $samtools_phasing_file){
 | 
| 
 | 
   585   print STDERR "Reading in phasing data\n" unless $quiet;
 | 
| 
 | 
   586   open(PHASE, $samtools_phasing_file)
 | 
| 
 | 
   587     or die "Cannot open phasing data file $samtools_phasing_file for reading: $!\n";
 | 
| 
 | 
   588   my $phase_range;
 | 
| 
 | 
   589   while(<PHASE>){
 | 
| 
 | 
   590     if(/^PS/){
 | 
| 
 | 
   591       chomp;
 | 
| 
 | 
   592       my @F = split /\t/, $_;
 | 
| 
 | 
   593       $phase_range = "$F[2]-$F[3]";
 | 
| 
 | 
   594     }
 | 
| 
 | 
   595     if(/^M[12]/){
 | 
| 
 | 
   596       chomp;
 | 
| 
 | 
   597       my @F = split /\t/, $_;
 | 
| 
 | 
   598       #ignore strange cases where haplotype reference has no cases (weird samtools call)
 | 
| 
 | 
   599       next if $F[9] == 0 or $F[7] == 0;
 | 
| 
 | 
   600       my $chr = $F[1];
 | 
| 
 | 
   601       next if defined $which_chr and not $chr eq $which_chr;
 | 
| 
 | 
   602       my $pos = $F[3];
 | 
| 
 | 
   603       #print STDERR "Recording phase for $chr:$pos:$F[4] , $chr:$pos:$F[5] as A-$chr:$phase_range and B-$chr:$phase_range\n" if $pos == 12907379;
 | 
| 
 | 
   604       if(($F[10]+$F[8])/($F[9]+$F[7]) >= $min_prop){ # error meets reporting threshold
 | 
| 
 | 
   605         $chr2caveats{"$chr:$pos"} .= "; " if exists $chr2caveats{"$chr:$pos"};
 | 
| 
 | 
   606         $chr2caveats{"$chr:$pos"} .= "inconsistent haplotype phasing";
 | 
| 
 | 
   607       }
 | 
| 
 | 
   608       else{ # appears to be a genuine phasing
 | 
| 
 | 
   609         $chr2phase{"$chr:$pos:$F[4]"} = "A-$chr:$phase_range"; # grouping for haplotype 
 | 
| 
 | 
   610         $chr2phase{"$chr:$pos:$F[5]"} = "B-$chr:$phase_range"; # grouping for haplotype 
 | 
| 
 | 
   611       }
 | 
| 
 | 
   612     }
 | 
| 
 | 
   613   }
 | 
| 
 | 
   614   close(PHASE);
 | 
| 
 | 
   615 }
 | 
| 
 | 
   616 
 | 
| 
 | 
   617 # Check the VCF file to see if contains phase data
 | 
| 
 | 
   618 open(VCFIN, $input_file)
 | 
| 
 | 
   619     or die "Cannot open $input_file for reading: $!\n";
 | 
| 
 | 
   620 my $phase_chr = "";
 | 
| 
 | 
   621 my @phase_dataA;
 | 
| 
 | 
   622 my @phase_dataB;
 | 
| 
 | 
   623 while(<VCFIN>){
 | 
| 
 | 
   624     if(/^\s*(?:#|$)/){ # blank or hash comment
 | 
| 
 | 
   625        next;
 | 
| 
 | 
   626     }
 | 
| 
 | 
   627     my @F = split /\t/, $_;
 | 
| 
 | 
   628     next if exists $chr2caveats{"$F[0]:$F[1]"} and $chr2caveats{"$F[0]:$F[1]"} =~ /inconsistent haplotype phasing/;
 | 
| 
 | 
   629     # | indicates phased
 | 
| 
 | 
   630     if($F[8] =~ m(^(\d+)\|(\d+):)){
 | 
| 
 | 
   631       next if $1 eq $2;  # not useful to us (actually would mess up phase combining later on), but is provided sometimes
 | 
| 
 | 
   632       # start of a phasing block
 | 
| 
 | 
   633       if($phase_chr eq ""){
 | 
| 
 | 
   634         $phase_chr = $F[0];
 | 
| 
 | 
   635       }
 | 
| 
 | 
   636       my @vars = split /,/, $F[4];
 | 
| 
 | 
   637       if($1 > @vars){
 | 
| 
 | 
   638         die "Invalid VCF file (line #$.): First haplotype listed as $1, but only ", scalar(@vars), " variants were provided (", join(",", @vars), "\n";
 | 
| 
 | 
   639       }
 | 
| 
 | 
   640       if($2 > @vars){
 | 
| 
 | 
   641         die "Invalid VCF file (line #$.): Second haplotype listed as $1, but only ", scalar(@vars), " variants were provided (", join(",", @vars), "\n";
 | 
| 
 | 
   642       }
 | 
| 
 | 
   643       unshift @vars, $F[3];
 | 
| 
 | 
   644       push @phase_dataA, [$F[1], $vars[$1]];
 | 
| 
 | 
   645       push @phase_dataB, [$F[1], $vars[$2]];
 | 
| 
 | 
   646     }
 | 
| 
 | 
   647     # non phased het call, ends any phasing block there might be
 | 
| 
 | 
   648     elsif($F[8] =~ m(^0/1)){
 | 
| 
 | 
   649       # Did we just finish a phased block? If so, output it.
 | 
| 
 | 
   650       if(@phase_dataA > 1){
 | 
| 
 | 
   651         my $phase_def = "G-$phase_chr:".$phase_dataA[0]->[0]."-".$phase_dataA[$#phase_dataA]->[0];
 | 
| 
 | 
   652         for my $d (@phase_dataA){
 | 
| 
 | 
   653           my ($p, $v) = @$d;
 | 
| 
 | 
   654           if(exists $chr2phase{"$phase_chr:$p:$v"}){
 | 
| 
 | 
   655             $chr2phase{"$phase_chr:$p:$v"} .= ",$phase_def";
 | 
| 
 | 
   656             $multi_phased ||= 1;
 | 
| 
 | 
   657           }
 | 
| 
 | 
   658           else{
 | 
| 
 | 
   659             $chr2phase{"$phase_chr:$p:$v"} = $phase_def;
 | 
| 
 | 
   660           }
 | 
| 
 | 
   661         }
 | 
| 
 | 
   662         $phase_def = "H-$phase_chr:".$phase_dataB[0]->[0]."-".$phase_dataB[$#phase_dataB]->[0];
 | 
| 
 | 
   663         for my $d (@phase_dataB){
 | 
| 
 | 
   664           my ($p, $v) = @$d;
 | 
| 
 | 
   665           if(exists $chr2phase{"$phase_chr:$p:$v"}){
 | 
| 
 | 
   666             $chr2phase{"$phase_chr:$p:$v"} = ",$phase_def";
 | 
| 
 | 
   667             $multi_phased ||= 1;
 | 
| 
 | 
   668           }
 | 
| 
 | 
   669           else{
 | 
| 
 | 
   670             $chr2phase{"$phase_chr:$p:$v"} = $phase_def;
 | 
| 
 | 
   671           }
 | 
| 
 | 
   672         }
 | 
| 
 | 
   673       }
 | 
| 
 | 
   674       if($phase_chr ne ""){
 | 
| 
 | 
   675         $phase_chr = "";
 | 
| 
 | 
   676         @phase_dataA = ();
 | 
| 
 | 
   677         @phase_dataB = ();
 | 
| 
 | 
   678       }
 | 
| 
 | 
   679     }
 | 
| 
 | 
   680 }
 | 
| 
 | 
   681 
 | 
| 
 | 
   682 print STDERR "Reading in feature GTF data..." unless $quiet;
 | 
| 
 | 
   683 my %feature_range; # chr => transcript_id => [[genomic_exon_start,genomic_exon_end,cdna_start_pos],...]
 | 
| 
 | 
   684 my %feature_intervaltree; # chr => transcript_id => [[genomic_exon_start,genomic_exon_end,cdna_start_pos],...]
 | 
| 
 | 
   685 my %feature_strand; # transcript_id => +|-
 | 
| 
 | 
   686 my $feature_count = 0;
 | 
| 
 | 
   687 my %feature_min;
 | 
| 
 | 
   688 my %feature_max;
 | 
| 
 | 
   689 my %feature_cds_min;
 | 
| 
 | 
   690 my %feature_cds_max;
 | 
| 
 | 
   691 my %feature_contig;
 | 
| 
 | 
   692 my %feature_length;
 | 
| 
 | 
   693 my %feature_type;
 | 
| 
 | 
   694 my %feature_transl_table; # note alternate translation table usage
 | 
| 
 | 
   695 my %chr_read;
 | 
| 
 | 
   696 open(GTF, $exons_file)
 | 
| 
 | 
   697     or die "Cannot open $exons_file for reading: $!\n";
 | 
| 
 | 
   698 while(<GTF>){
 | 
| 
 | 
   699     next if /^\s*#/;
 | 
| 
 | 
   700     my @fields = split /\t/, $_;
 | 
| 
 | 
   701     next if defined $which_chr and $fields[0] ne $which_chr and "chr$fields[0]" ne $which_chr and $fields[0] ne "chr$which_chr";
 | 
| 
 | 
   702    
 | 
| 
 | 
   703     if($fields[2] eq "exon" or $fields[2] eq "CDS"){
 | 
| 
 | 
   704 	next unless $fields[$#fields] =~ /transcript_id \"(.*?)\"/o;
 | 
| 
 | 
   705 	my $parent = $1;
 | 
| 
 | 
   706 	if(not $quiet and not exists $chr_read{$fields[0]}){
 | 
| 
 | 
   707 	    print STDERR " $fields[0]";
 | 
| 
 | 
   708 	    $chr_read{$fields[0]} = 1;
 | 
| 
 | 
   709 	}
 | 
| 
 | 
   710 	if(not exists $feature_strand{$parent}){
 | 
| 
 | 
   711 	    $feature_strand{$parent} = $fields[6];
 | 
| 
 | 
   712 	    $feature_contig{$parent} = $fields[0];
 | 
| 
 | 
   713             if($fields[$#fields] =~ /transcript_type \"(.*?)\"/){
 | 
| 
 | 
   714               $feature_type{$parent} = $1;
 | 
| 
 | 
   715             }
 | 
| 
 | 
   716             else{
 | 
| 
 | 
   717               $feature_type{$parent} = "NA";
 | 
| 
 | 
   718             }
 | 
| 
 | 
   719 	}
 | 
| 
 | 
   720 	if($fields[2] eq "CDS"){
 | 
| 
 | 
   721 	    #print STDERR "CDS value for $parent is $fields[2]..$fields[3]\n";
 | 
| 
 | 
   722 	    if(not exists $feature_cds_min{$parent} or $fields[3] < $feature_cds_min{$parent}){
 | 
| 
 | 
   723 		$feature_cds_min{$parent} = $fields[3];
 | 
| 
 | 
   724 	    }
 | 
| 
 | 
   725 	    if(not exists $feature_cds_max{$parent} or $fields[4] > $feature_cds_max{$parent}){
 | 
| 
 | 
   726 		$feature_cds_max{$parent} = $fields[4];
 | 
| 
 | 
   727 	    }
 | 
| 
 | 
   728             if($fields[$#fields] =~ /transl_table \"(\d+)\"/){
 | 
| 
 | 
   729                 $feature_transl_table{$parent} = $1; #assume one translation table per CDS, which should be reasonable
 | 
| 
 | 
   730             }
 | 
| 
 | 
   731             while($fields[$#fields] =~ /transl_except \"pos:(\S+?),aa:(\S+?)\"/g){
 | 
| 
 | 
   732                 my $pos = $1;
 | 
| 
 | 
   733                 my $new_aa = $2; # needs to change from three letter code to 1
 | 
| 
 | 
   734                 if($new_aa =~ /^ter/i){ # can be funny so have special case (allows TERM, etc.)
 | 
| 
 | 
   735                   $new_aa = "*";
 | 
| 
 | 
   736                 } 
 | 
| 
 | 
   737                 elsif(length($new_aa) == 3){
 | 
| 
 | 
   738                   $new_aa = Bio::SeqUtils->new()->seq3in($new_aa);
 | 
| 
 | 
   739                 }
 | 
| 
 | 
   740                 if($pos =~ /^(\d+)\.\.(\d+)/){
 | 
| 
 | 
   741                   for my $p ($1..$2){
 | 
| 
 | 
   742                     $transl_except{"$fields[0]:$p"} = $new_aa;
 | 
| 
 | 
   743                   }
 | 
| 
 | 
   744                 }
 | 
| 
 | 
   745                 else{
 | 
| 
 | 
   746                   $transl_except{"$fields[0]:$pos"} = $new_aa;
 | 
| 
 | 
   747                 }
 | 
| 
 | 
   748             }
 | 
| 
 | 
   749 	    next;
 | 
| 
 | 
   750 	}
 | 
| 
 | 
   751 	if(not exists $feature_min{$parent} or $fields[3] < $feature_min{$parent}){
 | 
| 
 | 
   752 	    $feature_min{$parent} = $fields[3];
 | 
| 
 | 
   753 	}
 | 
| 
 | 
   754 	if(not exists $feature_max{$parent} or $fields[4] > $feature_max{$parent}){
 | 
| 
 | 
   755 	    $feature_max{$parent} = $fields[4];
 | 
| 
 | 
   756 	}
 | 
| 
 | 
   757 	
 | 
| 
 | 
   758 	$feature_count++;
 | 
| 
 | 
   759 	if(not exists $feature_range{$fields[0]}){
 | 
| 
 | 
   760 	    $feature_range{$fields[0]} = {}; # Chr => {parentID => [start,stop]}
 | 
| 
 | 
   761             $feature_intervaltree{$fields[0]} = Set::IntervalTree->new();
 | 
| 
 | 
   762 	}
 | 
| 
 | 
   763 	if(not exists $feature_range{$fields[0]}->{$parent}){
 | 
| 
 | 
   764 	    $feature_range{$fields[0]}->{$parent} = [];
 | 
| 
 | 
   765 	}
 | 
| 
 | 
   766         push @{$feature_range{$fields[0]}->{$parent}}, [$fields[3],$fields[4]];
 | 
| 
 | 
   767         $feature_intervaltree{$fields[0]}->insert($parent, $fields[3], $fields[4]+1); # ranges need to have positive length for module to work properly 
 | 
| 
 | 
   768         $feature_length{$parent} += $fields[4]-$fields[3]+1;
 | 
| 
 | 
   769     }
 | 
| 
 | 
   770 }
 | 
| 
 | 
   771 close(GTF);
 | 
| 
 | 
   772 print STDERR "\nFound $feature_count exons on ", scalar(keys %feature_range), " contigs in the GTF file\n" unless $quiet;
 | 
| 
 | 
   773 
 | 
| 
 | 
   774 for my $contig (keys %feature_range){
 | 
| 
 | 
   775     for my $parent (keys %{$feature_range{$contig}}){
 | 
| 
 | 
   776 	# sort by subrange start
 | 
| 
 | 
   777 	my @feature_ranges = sort {$a->[0] <=> $b->[0]} @{$feature_range{$contig}->{$parent}};
 | 
| 
 | 
   778 	$feature_range{$contig}->{$parent} = \@feature_ranges;
 | 
| 
 | 
   779 	$feature_range{"chr".$contig}->{$parent} = \@feature_ranges if not $contig =~ /^chr/;
 | 
| 
 | 
   780 	$feature_range{$1}->{$parent} = \@feature_ranges if $contig =~ /^chr(\S+)/;
 | 
| 
 | 
   781     }
 | 
| 
 | 
   782 }
 | 
| 
 | 
   783 
 | 
| 
 | 
   784 # Calculate the cDNA position of the leftmost (reference strand) base for each exon
 | 
| 
 | 
   785 for my $contig (keys %feature_range){
 | 
| 
 | 
   786     for my $parent (keys %{$feature_range{$contig}}){
 | 
| 
 | 
   787         my @feature_ranges = @{$feature_range{$contig}->{$parent}};
 | 
| 
 | 
   788         if($feature_strand{$parent} eq "-"){
 | 
| 
 | 
   789             # set up utr offset for correct CDS coordinates
 | 
| 
 | 
   790             my $feature_offset = 0;
 | 
| 
 | 
   791             for(my $i = $#feature_ranges; $i >= 0; $i--){
 | 
| 
 | 
   792                 last if not $feature_cds_max{$parent};
 | 
| 
 | 
   793                 # exon is completely 5' of the start
 | 
| 
 | 
   794                 if($feature_ranges[$i]->[0] > $feature_cds_max{$parent}){
 | 
| 
 | 
   795                     $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1;
 | 
| 
 | 
   796                 }
 | 
| 
 | 
   797                 # exon with the cds start
 | 
| 
 | 
   798                 elsif($feature_ranges[$i]->[1] >= $feature_cds_max{$parent} and
 | 
| 
 | 
   799                       $feature_ranges[$i]->[0] <= $feature_cds_max{$parent}){
 | 
| 
 | 
   800                     $feature_offset += $feature_cds_max{$parent} - $feature_ranges[$i]->[1];
 | 
| 
 | 
   801                     last;
 | 
| 
 | 
   802                 }
 | 
| 
 | 
   803                 else{
 | 
| 
 | 
   804                     die "The CDS for $parent (on negative strand) ends downstream ",
 | 
| 
 | 
   805                     "($feature_cds_max{$parent}) of the an exon",
 | 
| 
 | 
   806                     " (", $feature_ranges[$i]->[0], "), which is illogical.  Please revise the GFF file provided.\n";
 | 
| 
 | 
   807                 }
 | 
| 
 | 
   808             }
 | 
| 
 | 
   809             for(my $i = $#feature_ranges; $i >= 0; $i--){
 | 
| 
 | 
   810               $feature_offset += $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1;
 | 
| 
 | 
   811               $feature_ranges[$i]->[2] = $feature_offset-1;
 | 
| 
 | 
   812             }
 | 
| 
 | 
   813        }
 | 
| 
 | 
   814        else{ # positive strand
 | 
| 
 | 
   815             # set up utr offset for correct CDS coordinates
 | 
| 
 | 
   816             my $feature_offset = 0;
 | 
| 
 | 
   817             for(my $i = 0; $i <= $#feature_ranges; $i++){
 | 
| 
 | 
   818                 last if not $feature_cds_min{$parent};
 | 
| 
 | 
   819                 # All 5' utr exon
 | 
| 
 | 
   820                 if($feature_ranges[$i]->[1] < $feature_cds_min{$parent}){
 | 
| 
 | 
   821                     $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1;
 | 
| 
 | 
   822                 }
 | 
| 
 | 
   823                 # exon with the cds start
 | 
| 
 | 
   824                 elsif($feature_ranges[$i]->[1] >= $feature_cds_min{$parent} and
 | 
| 
 | 
   825                       $feature_ranges[$i]->[0] <= $feature_cds_min{$parent}){
 | 
| 
 | 
   826                     $feature_offset -= $feature_cds_min{$parent} - $feature_ranges[$i]->[0];
 | 
| 
 | 
   827                     last;
 | 
| 
 | 
   828                 }
 | 
| 
 | 
   829                 else{
 | 
| 
 | 
   830                     die "The CDS for $parent starts upstream ($feature_cds_min{$parent}) of the first exon",
 | 
| 
 | 
   831                     " (", $feature_ranges[$i]->[0], "), which is illogical.  Please revise the GFF file provided.\n";
 | 
| 
 | 
   832                 }
 | 
| 
 | 
   833             }
 | 
| 
 | 
   834             # assign cDNA coords for each exon to the third array element
 | 
| 
 | 
   835             for(my $i = 0; $i <= $#feature_ranges; $i++){
 | 
| 
 | 
   836               $feature_ranges[$i]->[2] = $feature_offset;
 | 
| 
 | 
   837               $feature_offset += $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1;
 | 
| 
 | 
   838             }
 | 
| 
 | 
   839        }
 | 
| 
 | 
   840     }
 | 
| 
 | 
   841 }
 | 
| 
 | 
   842 
 | 
| 
 | 
   843 print STDERR "Reading in gene name definitions...\n" unless $quiet;
 | 
| 
 | 
   844 die "Data file $genename_bed_file does not exist, aborting.\n" if not -e $genename_bed_file;
 | 
| 
 | 
   845 my %gene_ids;
 | 
| 
 | 
   846 open(TAB, $genename_bed_file)
 | 
| 
 | 
   847   or die "Cannot open gene name BED file $genename_bed_file for reading: $!\n";
 | 
| 
 | 
   848 while(<TAB>){
 | 
| 
 | 
   849   chomp;
 | 
| 
 | 
   850   # format should be "chr start stop gene_name ..."
 | 
| 
 | 
   851   my @fields = split /\t/, $_;
 | 
| 
 | 
   852   next if $#fields < 3;
 | 
| 
 | 
   853   my $c = $fields[0];
 | 
| 
 | 
   854   if(not exists $gene_ids{$c}){
 | 
| 
 | 
   855     $gene_ids{$c} = Set::IntervalTree->new();
 | 
| 
 | 
   856   }
 | 
| 
 | 
   857   $gene_ids{$c}->insert($fields[3], $fields[1], $fields[2]);
 | 
| 
 | 
   858 }
 | 
| 
 | 
   859 
 | 
| 
 | 
   860 # Print output header
 | 
| 
 | 
   861 open(OUT, ">$output_file")
 | 
| 
 | 
   862   or die "Cannot open $output_file for writing: $!\n";
 | 
| 
 | 
   863 
 | 
| 
 | 
   864 print OUT join("\t", "Feature type", "Transcript length", "Selected transcript", "Transcript HGVS", "Strand", "Chr", "DNA From", "DNA To", "Zygosity", "P-value", "Variant Reads", "Total Reads",
 | 
| 
 | 
   865 	       "Ref base", "Obs base", "Pop. freq. source", "Pop. freq.", "Variant DB ID"), "\t",
 | 
| 
 | 
   866                ($internal_snp ? "Internal pop. freq.\t" : ""), 
 | 
| 
 | 
   867                join("\t", "Protein HGVS", "Closest exon junction (AA coding variants)", "Gene Name", "Caveats", "Phase", "Num rare variants in gene (MAF <= $rare_variant_prop)", "Num rare coding and splice site variants in gene (MAF <= $rare_variant_prop)"),"\n";
 | 
| 
 | 
   868 
 | 
| 
 | 
   869 # If there is CNV data, load it.
 | 
| 
 | 
   870 # BED columns should be chr start stop caveats ploidy . ignored ignored r,g,b
 | 
| 
 | 
   871 # The dot means the strand doesn't matter.
 | 
| 
 | 
   872 # where the first five fields are required, others optional
 | 
| 
 | 
   873 # where r,g,b is overloaded with father,mother ploidies and "b" is integer representing affected status logical AND (father bit mask 1, mother bit mask 2) 
 | 
| 
 | 
   874 if(defined $cnv_file){
 | 
| 
 | 
   875   print STDERR "Reading in CNV data...\n" unless $quiet;
 | 
| 
 | 
   876   open(CNV, $cnv_file)
 | 
| 
 | 
   877     or die "Cannot open $cnv_file for reading: $!\n";
 | 
| 
 | 
   878   while(<CNV>){
 | 
| 
 | 
   879     chomp;
 | 
| 
 | 
   880     my @F = split /\t/,  $_, -1;
 | 
| 
 | 
   881     if(@F < 5){
 | 
| 
 | 
   882       print STDERR "Skipping unparseable line ($cnv_file #$.): $_\n";
 | 
| 
 | 
   883       next;
 | 
| 
 | 
   884     }
 | 
| 
 | 
   885     my $ploidy = $F[4];
 | 
| 
 | 
   886     my $cnv_chr = $F[0];
 | 
| 
 | 
   887     next if defined $which_chr and $cnv_chr ne $which_chr and "chr$cnv_chr" ne $which_chr and $cnv_chr ne "chr$which_chr";
 | 
| 
 | 
   888     my $cnv_start = $F[1];
 | 
| 
 | 
   889     my $cnv_end = $F[2];
 | 
| 
 | 
   890     my $p_value = "NA";
 | 
| 
 | 
   891     if($F[3] =~ s/p-value=(\S+?)(?:;|$)//){
 | 
| 
 | 
   892       $p_value = $1;
 | 
| 
 | 
   893       next if $min_pvalue < $p_value;
 | 
| 
 | 
   894     }
 | 
| 
 | 
   895 
 | 
| 
 | 
   896     # Report a variant line for each gene that is found in this CNV
 | 
| 
 | 
   897     my $target_parents = $feature_intervaltree{$cnv_chr}->fetch($cnv_start, $cnv_end+1);
 | 
| 
 | 
   898 
 | 
| 
 | 
   899     my $caveats = "";
 | 
| 
 | 
   900     if(@F == 9){
 | 
| 
 | 
   901       my @parents_ploidy = split /,/, $F[8];
 | 
| 
 | 
   902       if($parents_ploidy[2] == 0){ # neither parent affected
 | 
| 
 | 
   903         if($ploidy < $parents_ploidy[0] and $ploidy < $parents_ploidy[1]){
 | 
| 
 | 
   904           if($ploidy > 2){
 | 
| 
 | 
   905             $caveats = "Polyploidy is less severe than in either unaffected parents";
 | 
| 
 | 
   906           }
 | 
| 
 | 
   907           # else: no caveats, this offspring has fewer copies than normally observed, or in unaffected parents
 | 
| 
 | 
   908           elsif($ploidy < 2){
 | 
| 
 | 
   909             if($parents_ploidy[0] == 2 and $parents_ploidy[1] == 2){
 | 
| 
 | 
   910               $caveats = "De novo copy loss, unaffected parents are diploid";
 | 
| 
 | 
   911             }
 | 
| 
 | 
   912             else{
 | 
| 
 | 
   913               $caveats = "Copy loss is greater than in either unaffected parent";
 | 
| 
 | 
   914             }
 | 
| 
 | 
   915           }
 | 
| 
 | 
   916         }
 | 
| 
 | 
   917         elsif($ploidy >= $parents_ploidy[0] and $ploidy <= $parents_ploidy[1] or
 | 
| 
 | 
   918            $ploidy >= $parents_ploidy[1] and $ploidy <= $parents_ploidy[0]){
 | 
| 
 | 
   919           $caveats = "Aneuploidy likely inherited from an unaffected parent";
 | 
| 
 | 
   920         }
 | 
| 
 | 
   921         elsif($ploidy > $parents_ploidy[0] and $ploidy > $parents_ploidy[1]){
 | 
| 
 | 
   922           if($parents_ploidy[0] > 2){
 | 
| 
 | 
   923             if($parents_ploidy[1] > 2){
 | 
| 
 | 
   924               $caveats = "Lower polyploidy already exists in both unaffected parents";
 | 
| 
 | 
   925             }
 | 
| 
 | 
   926             else{
 | 
| 
 | 
   927               $caveats = "Lower polyploidy already exists in unaffected father";
 | 
| 
 | 
   928             }
 | 
| 
 | 
   929           }
 | 
| 
 | 
   930           else{
 | 
| 
 | 
   931             if($parents_ploidy[1] > 2){
 | 
| 
 | 
   932               $caveats = "Lower polyploidy already exists in unaffected mother";
 | 
| 
 | 
   933             }
 | 
| 
 | 
   934             # else no caveats, because both parents are "normal", yet we have polyploidy in the offspring
 | 
| 
 | 
   935             else{
 | 
| 
 | 
   936               $caveats = "De novo polyploidy, unaffected parents are diploid";
 | 
| 
 | 
   937             }
 | 
| 
 | 
   938           }
 | 
| 
 | 
   939         }
 | 
| 
 | 
   940         # else
 | 
| 
 | 
   941         else{
 | 
| 
 | 
   942           die "Oops! Error in program logic...how did we get here (unaffected parents)? $_";
 | 
| 
 | 
   943         }
 | 
| 
 | 
   944       } 
 | 
| 
 | 
   945       elsif($parents_ploidy[2] == 1){ # father affected
 | 
| 
 | 
   946           if($ploidy == $parents_ploidy[1]){ # just like unaffected Mom
 | 
| 
 | 
   947             if($ploidy > 2){
 | 
| 
 | 
   948               if($ploidy == $parents_ploidy[0]){
 | 
| 
 | 
   949                 $caveats = "Same polyploidy present in both affected and unaffected parents"; 
 | 
| 
 | 
   950               }
 | 
| 
 | 
   951               else{
 | 
| 
 | 
   952                 $caveats = "Polyploidy inherited from unaffected mother";
 | 
| 
 | 
   953               }
 | 
| 
 | 
   954             }
 | 
| 
 | 
   955             elsif($ploidy < 2){
 | 
| 
 | 
   956               if($ploidy == $parents_ploidy[0]){
 | 
| 
 | 
   957                 $caveats = "Same copy loss in both affected and unaffected parents";
 | 
| 
 | 
   958               }
 | 
| 
 | 
   959               else{
 | 
| 
 | 
   960                 $caveats = "Copy loss is shared with unaffected mother";
 | 
| 
 | 
   961               }
 | 
| 
 | 
   962             }
 | 
| 
 | 
   963             else{
 | 
| 
 | 
   964               if($ploidy == $parents_ploidy[0]){
 | 
| 
 | 
   965                 # Why was this even reported? parents and child have diploid status...
 | 
| 
 | 
   966                 next;
 | 
| 
 | 
   967               }
 | 
| 
 | 
   968               $caveats = "Diploidy is shared with unaffected mother";
 | 
| 
 | 
   969             }
 | 
| 
 | 
   970           }
 | 
| 
 | 
   971           elsif($ploidy > 2){ # polyploidy
 | 
| 
 | 
   972             if($parents_ploidy[0] == 2){
 | 
| 
 | 
   973               if($parents_ploidy[1] > 2){
 | 
| 
 | 
   974                 $caveats = "Unaffected mother has polyploidy (".$parents_ploidy[1]."x), but affected father is diploid";
 | 
| 
 | 
   975               }
 | 
| 
 | 
   976               elsif($parents_ploidy[1] == 2){
 | 
| 
 | 
   977                 $caveats = "Both unaffected mother and affected father are diploid";
 | 
| 
 | 
   978               }
 | 
| 
 | 
   979               else{
 | 
| 
 | 
   980                 $caveats = "Affected father is diploid, unaffected mother has copy loss (".$parents_ploidy[1]."x)";
 | 
| 
 | 
   981               }
 | 
| 
 | 
   982             }
 | 
| 
 | 
   983             elsif($parents_ploidy[0] < 2){
 | 
| 
 | 
   984               $caveats = "Polyploidy found, but affected father had copy loss (".$parents_ploidy[0]."x)";
 | 
| 
 | 
   985             }
 | 
| 
 | 
   986             elsif($ploidy < $parents_ploidy[1]){
 | 
| 
 | 
   987               $caveats = "Polyploidy is less severe than in unaffected mother (".$parents_ploidy[1]."x), or affected father (".$parents_ploidy[0]."x)";
 | 
| 
 | 
   988             }
 | 
| 
 | 
   989             # past here the ploidy is great than in the unaffected mother
 | 
| 
 | 
   990             elsif($parents_ploidy[1] < 2){
 | 
| 
 | 
   991               $caveats = "Polyploidy is also severe in affected father (".$parents_ploidy[0]."x), but unaffected mother actually had copy loss (". $parents_ploidy[1]. "x)";
 | 
| 
 | 
   992             }
 | 
| 
 | 
   993             elsif($parents_ploidy[1] == 2){
 | 
| 
 | 
   994               $caveats = "Polyploidy is also severe in affected father (".$parents_ploidy[0]."x), and mother is diploid";
 | 
| 
 | 
   995             }
 | 
| 
 | 
   996             elsif($ploidy < $parents_ploidy[0]){
 | 
| 
 | 
   997               $caveats = "Polyploidy is less severe than in affected father (".$parents_ploidy[0]."x), but more severe than unaffected mother (". $parents_ploidy[1]. "x)";
 | 
| 
 | 
   998             }
 | 
| 
 | 
   999             elsif($ploidy > $parents_ploidy[0]){
 | 
| 
 | 
  1000               $caveats = "Polyploidy is more severe than in affected father (".$parents_ploidy[0]."x)";
 | 
| 
 | 
  1001             }
 | 
| 
 | 
  1002             else{
 | 
| 
 | 
  1003               $caveats = "Polyploidy is as severe as in affected father";
 | 
| 
 | 
  1004             }
 | 
| 
 | 
  1005           }
 | 
| 
 | 
  1006           elsif($ploidy == 2){
 | 
| 
 | 
  1007             # Don't report diploid status, any funny recombination should show up in large indel analysis
 | 
| 
 | 
  1008             next;
 | 
| 
 | 
  1009           }
 | 
| 
 | 
  1010           else{ # copies < 2
 | 
| 
 | 
  1011             if($ploidy == $parents_ploidy[0]){
 | 
| 
 | 
  1012               if($ploidy > $parents_ploidy[1]){
 | 
| 
 | 
  1013                 $caveats = "Copy loss is the same as affected father, but less than unaffected mother (". $parents_ploidy[1]. "x)";
 | 
| 
 | 
  1014               }
 | 
| 
 | 
  1015               else{
 | 
| 
 | 
  1016                 $caveats = "Copy loss is as severe as in affected father";
 | 
| 
 | 
  1017               }
 | 
| 
 | 
  1018             }
 | 
| 
 | 
  1019             elsif($ploidy > $parents_ploidy[0]){
 | 
| 
 | 
  1020               if($ploidy > $parents_ploidy[1]){
 | 
| 
 | 
  1021                 if($parents_ploidy[1] == 0 and $parents_ploidy[0] == 0){
 | 
| 
 | 
  1022                   $caveats = "Poor mapping, or Mendelian inheritence violation is severe: no copies of region in either parent, but present in offspring";
 | 
| 
 | 
  1023                 } 
 | 
| 
 | 
  1024                 elsif($ploidy == 2){
 | 
| 
 | 
  1025                   next; # child got best of both parents, ignore from CNV standpoint (may still have SNPs of course, or translocation, etc.)
 | 
| 
 | 
  1026                 }
 | 
| 
 | 
  1027                 else{
 | 
| 
 | 
  1028                   $caveats = "Copy loss is less severe than in unaffected mother (".$parents_ploidy[1]."x), or affected father (".$parents_ploidy[0]."x)";
 | 
| 
 | 
  1029                 }
 | 
| 
 | 
  1030               }
 | 
| 
 | 
  1031               # else: child has less copies than unaffected mom, but more than affected Dad
 | 
| 
 | 
  1032               else{
 | 
| 
 | 
  1033                 if($parents_ploidy[1] > 2){
 | 
| 
 | 
  1034                   $caveats = "Copy loss was more severe in affected father (".$parents_ploidy[0]."x), but unaffected mother had polyploidy (".$parents_ploidy[1]."x)";
 | 
| 
 | 
  1035                 }
 | 
| 
 | 
  1036                 elsif($parents_ploidy[1] == 2){
 | 
| 
 | 
  1037                   $caveats = "Copy loss was more severe in affected father (".$parents_ploidy[0]."x), but unaffected mother was diploid";
 | 
| 
 | 
  1038                 }
 | 
| 
 | 
  1039                 else{ # unaffected has loss
 | 
| 
 | 
  1040                   $caveats = "Copy loss is more severe than unaffect mother (".$parents_ploidy[1]."x), but less severe than affected father (".$parents_ploidy[0]."x)";
 | 
| 
 | 
  1041                 }
 | 
| 
 | 
  1042               }
 | 
| 
 | 
  1043             }
 | 
| 
 | 
  1044             # past here, ploidy is less than affected father
 | 
| 
 | 
  1045             elsif($parents_ploidy[1] > 2){
 | 
| 
 | 
  1046               $caveats = "Copy loss is more severe than affected father (".$parents_ploidy[0]."x), and unaffected mother had polyploidy (".$parents_ploidy[1]."x)";
 | 
| 
 | 
  1047             }
 | 
| 
 | 
  1048             elsif($parents_ploidy[1] == 2){
 | 
| 
 | 
  1049               $caveats = "Copy loss is more severe than in affected father (".$parents_ploidy[0]."x)";
 | 
| 
 | 
  1050             }
 | 
| 
 | 
  1051             else{
 | 
| 
 | 
  1052               $caveats = "Copy loss is more severe than in both unaffect mother (".$parents_ploidy[1]."x), and affected father (".$parents_ploidy[0]."x)";
 | 
| 
 | 
  1053             }
 | 
| 
 | 
  1054           }
 | 
| 
 | 
  1055       }
 | 
| 
 | 
  1056       elsif($parents_ploidy[2] == 2){ # mother affected
 | 
| 
 | 
  1057           if($ploidy == $parents_ploidy[0]){ # just like unaffected Dad
 | 
| 
 | 
  1058             if($ploidy > 2){
 | 
| 
 | 
  1059               if($ploidy == $parents_ploidy[1]){
 | 
| 
 | 
  1060                 $caveats = "Same polyploidy present in both affected and unaffected parents";
 | 
| 
 | 
  1061               }
 | 
| 
 | 
  1062               else{
 | 
| 
 | 
  1063                 $caveats = "Polyploidy inherited from unaffected father";
 | 
| 
 | 
  1064               }
 | 
| 
 | 
  1065             }
 | 
| 
 | 
  1066             elsif($ploidy < 2){
 | 
| 
 | 
  1067               if($ploidy == $parents_ploidy[1]){
 | 
| 
 | 
  1068                 $caveats = "Same copy loss in both affected and unaffected parents";
 | 
| 
 | 
  1069               }
 | 
| 
 | 
  1070               else{
 | 
| 
 | 
  1071                 $caveats = "Copy loss is shared with unaffected father";
 | 
| 
 | 
  1072               }
 | 
| 
 | 
  1073             }
 | 
| 
 | 
  1074             else{
 | 
| 
 | 
  1075               if($ploidy == $parents_ploidy[1]){
 | 
| 
 | 
  1076                 # Why was this even reported? parents and child have diploid status...
 | 
| 
 | 
  1077                 next;
 | 
| 
 | 
  1078               }
 | 
| 
 | 
  1079               $caveats = "Diploidy is shared with unaffected father";
 | 
| 
 | 
  1080             }
 | 
| 
 | 
  1081           }
 | 
| 
 | 
  1082           elsif($ploidy > 2){ # polyploidy
 | 
| 
 | 
  1083             if($parents_ploidy[1] == 2){
 | 
| 
 | 
  1084               if($parents_ploidy[0] > 2){
 | 
| 
 | 
  1085                 $caveats = "Unaffected father has polyploidy (".$parents_ploidy[0]."x), but affected mother is diploid";
 | 
| 
 | 
  1086               }
 | 
| 
 | 
  1087               elsif($parents_ploidy[0] == 2){
 | 
| 
 | 
  1088                 $caveats = "Both unaffected father and affected mother are diploid";
 | 
| 
 | 
  1089               }
 | 
| 
 | 
  1090               else{
 | 
| 
 | 
  1091                 $caveats = "Affected mother is diploid, unaffected father has copy loss (".$parents_ploidy[1]."x)";
 | 
| 
 | 
  1092               }
 | 
| 
 | 
  1093             }
 | 
| 
 | 
  1094             elsif($parents_ploidy[1] < 2){
 | 
| 
 | 
  1095               $caveats = "Polyploidy found, but affected mother had copy loss (".$parents_ploidy[1]."x)";
 | 
| 
 | 
  1096             }
 | 
| 
 | 
  1097             elsif($ploidy < $parents_ploidy[0]){
 | 
| 
 | 
  1098               $caveats = "Polyploidy is less severe than in unaffected father (".$parents_ploidy[0]."x), or affected mother (".$parents_ploidy[1]."x)";
 | 
| 
 | 
  1099             }
 | 
| 
 | 
  1100             # past here the ploidy is great than in the unaffected father
 | 
| 
 | 
  1101             elsif($parents_ploidy[0] < 2){
 | 
| 
 | 
  1102               $caveats = "Polyploidy is also severe in affected mother (".$parents_ploidy[1]."x), but unaffected father actually had copy loss (". $parents_ploidy[0]. "x)";
 | 
| 
 | 
  1103             }
 | 
| 
 | 
  1104             elsif($parents_ploidy[0] == 2){
 | 
| 
 | 
  1105               $caveats = "Polyploidy is also severe in affected mother (".$parents_ploidy[1]."x), and unaffected father is diploid";
 | 
| 
 | 
  1106             } 
 | 
| 
 | 
  1107             elsif($ploidy < $parents_ploidy[1]){
 | 
| 
 | 
  1108               $caveats = "Polyploidy is less severe than in affected mother (".$parents_ploidy[1]."x), but more severe than unaffected father (". $parents_ploidy[0]. "x)";
 | 
| 
 | 
  1109             }
 | 
| 
 | 
  1110             elsif($ploidy > $parents_ploidy[1]){
 | 
| 
 | 
  1111               $caveats = "Polyploidy is more severe than in affected mother (".$parents_ploidy[1]."x)";
 | 
| 
 | 
  1112             }
 | 
| 
 | 
  1113             else{
 | 
| 
 | 
  1114               $caveats = "Polyploidy is as severe as in affected mother";
 | 
| 
 | 
  1115             }
 | 
| 
 | 
  1116           }
 | 
| 
 | 
  1117           elsif($ploidy == 2){
 | 
| 
 | 
  1118             # Don't report diploid status, any funny recombination should show up in large indel analysis
 | 
| 
 | 
  1119             next;
 | 
| 
 | 
  1120           }
 | 
| 
 | 
  1121           else{ # copies < 2
 | 
| 
 | 
  1122             if($ploidy == $parents_ploidy[1]){
 | 
| 
 | 
  1123               if($ploidy > $parents_ploidy[0]){
 | 
| 
 | 
  1124                 $caveats = "Copy loss is the same as affected mother, but less than unaffected father (". $parents_ploidy[0]. "x)";
 | 
| 
 | 
  1125               }
 | 
| 
 | 
  1126               else{
 | 
| 
 | 
  1127                 $caveats = "Copy loss is as severe as in affected mother";
 | 
| 
 | 
  1128               }
 | 
| 
 | 
  1129             }
 | 
| 
 | 
  1130             elsif($ploidy > $parents_ploidy[1]){
 | 
| 
 | 
  1131               if($ploidy > $parents_ploidy[0]){
 | 
| 
 | 
  1132                 if($parents_ploidy[1] == 0 and $parents_ploidy[0] == 0){
 | 
| 
 | 
  1133                   $caveats = "Poor mapping, or Mendelian inheritence violation is severe: no copies of region in either parent, but present in offspring";
 | 
| 
 | 
  1134                 }
 | 
| 
 | 
  1135                 elsif($ploidy == 2){
 | 
| 
 | 
  1136                   next; # child got best of both parents, ignore from CNV standpoint (may still have SNPs of course, or translocation, etc.)
 | 
| 
 | 
  1137                 }
 | 
| 
 | 
  1138                 else{
 | 
| 
 | 
  1139                   $caveats = "Copy loss is less severe than in unaffected father (".$parents_ploidy[0]."x), or affected mother (".$parents_ploidy[1]."x)";
 | 
| 
 | 
  1140                 }
 | 
| 
 | 
  1141               }
 | 
| 
 | 
  1142               # else: child has less copies than unaffected Dad, but more than affected Mom
 | 
| 
 | 
  1143               else{
 | 
| 
 | 
  1144                 if($parents_ploidy[0] > 2){
 | 
| 
 | 
  1145                   $caveats = "Copy loss was more severe in affected mother (".$parents_ploidy[1]."x), but unaffected father had polyploidy (".$parents_ploidy[0]."x)";
 | 
| 
 | 
  1146                 }
 | 
| 
 | 
  1147                 elsif($parents_ploidy[0] == 2){
 | 
| 
 | 
  1148                   $caveats = "Copy loss was more severe in affected mother (".$parents_ploidy[1]."x), but unaffected father was diploid";
 | 
| 
 | 
  1149                 }
 | 
| 
 | 
  1150                 else{ # unaffected has loss
 | 
| 
 | 
  1151                   $caveats = "Copy loss is more severe than unaffect father (".$parents_ploidy[0]."x), but less severe than affected mother (".$parents_ploidy[1]."x)";
 | 
| 
 | 
  1152                 }
 | 
| 
 | 
  1153               }
 | 
| 
 | 
  1154             }
 | 
| 
 | 
  1155             # past here, ploidy is less than affected mother
 | 
| 
 | 
  1156             elsif($parents_ploidy[0] > 2){
 | 
| 
 | 
  1157               $caveats = "Copy loss is more severe than affected mother (".$parents_ploidy[1]."x), and unaffected father had polyploidy (".$parents_ploidy[0]."x)";
 | 
| 
 | 
  1158             }
 | 
| 
 | 
  1159             elsif($parents_ploidy[0] == 2){
 | 
| 
 | 
  1160               $caveats = "Copy loss is more severe than in affected mother (".$parents_ploidy[1]."x)";
 | 
| 
 | 
  1161             }
 | 
| 
 | 
  1162             else{
 | 
| 
 | 
  1163               $caveats = "Copy loss is more severe than in both unaffect father (".$parents_ploidy[0]."x), and affected mother (".$parents_ploidy[1]."x)";
 | 
| 
 | 
  1164             }
 | 
| 
 | 
  1165           }
 | 
| 
 | 
  1166         }
 | 
| 
 | 
  1167 
 | 
| 
 | 
  1168     }
 | 
| 
 | 
  1169     if($F[3] and $F[3] ne "-"){ # prexisting caveat from CNV caller
 | 
| 
 | 
  1170       if(defined $caveats){
 | 
| 
 | 
  1171         $caveats .= "; $F[3]" unless $caveats =~ /\b$F[3]\b/;
 | 
| 
 | 
  1172       }
 | 
| 
 | 
  1173       else{
 | 
| 
 | 
  1174        $caveats  = $F[3];
 | 
| 
 | 
  1175       }
 | 
| 
 | 
  1176     }
 | 
| 
 | 
  1177 
 | 
| 
 | 
  1178     # Sort by start for consistency
 | 
| 
 | 
  1179     my @target_parents = sort {$feature_range{$cnv_chr}->{$a}->[0]->[0] <=> $feature_range{$cnv_chr}->{$b}->[0]->[0]} @$target_parents;
 | 
| 
 | 
  1180 
 | 
| 
 | 
  1181     for my $target_parent (@target_parents){
 | 
| 
 | 
  1182       my $target_caveats = $caveats;
 | 
| 
 | 
  1183       my $strand = $feature_strand{$target_parent};
 | 
| 
 | 
  1184       # report the gain/loss of each gene separately, for simplicity in downstream analysis
 | 
| 
 | 
  1185       my $cnv_exon_start = 10000000000; # genomic coords
 | 
| 
 | 
  1186       my $cnv_exon_end = 0;
 | 
| 
 | 
  1187       my $cnv_cdna_start = 0; # cDNA coords
 | 
| 
 | 
  1188       my $cnv_cdna_end = 0;
 | 
| 
 | 
  1189       my $off5 = 0; # border outside the exon?
 | 
| 
 | 
  1190       my $off3 = 0;
 | 
| 
 | 
  1191       my @feature_ranges = @{$feature_range{$cnv_chr}->{$target_parent}};
 | 
| 
 | 
  1192       # find the first and last exons in the gene that are inside the CNV
 | 
| 
 | 
  1193       for my $subregion (@feature_ranges){
 | 
| 
 | 
  1194         # exon overlaps CNV?
 | 
| 
 | 
  1195         if($subregion->[0] <= $cnv_end and $subregion->[1] >= $cnv_start){
 | 
| 
 | 
  1196           if($cnv_exon_start > $subregion->[0]){
 | 
| 
 | 
  1197             if($cnv_start < $subregion->[0]){
 | 
| 
 | 
  1198               $cnv_exon_start = $subregion->[0]; $off5 = 1;
 | 
| 
 | 
  1199               $cnv_cdna_start = $subregion->[2];
 | 
| 
 | 
  1200             }
 | 
| 
 | 
  1201             else{
 | 
| 
 | 
  1202               $cnv_exon_start = $cnv_start; $off5 = 0;
 | 
| 
 | 
  1203               $cnv_cdna_start = $subregion->[2]+($strand eq "-" ? $subregion->[0]-$cnv_start: $cnv_start-$subregion->[0]);
 | 
| 
 | 
  1204             }
 | 
| 
 | 
  1205           }
 | 
| 
 | 
  1206           if($cnv_exon_end < $subregion->[1]){ 
 | 
| 
 | 
  1207             if($cnv_end > $subregion->[1]){
 | 
| 
 | 
  1208               $cnv_exon_end = $subregion->[1]; $off3 = 1;
 | 
| 
 | 
  1209               $cnv_cdna_end = $subregion->[2]+($strand eq "-" ? $subregion->[0]-$subregion->[1] : $subregion->[1]-$subregion->[0]);
 | 
| 
 | 
  1210             }
 | 
| 
 | 
  1211             else{
 | 
| 
 | 
  1212               $cnv_exon_end = $cnv_end; $off3 = 0;
 | 
| 
 | 
  1213               $cnv_cdna_end = $subregion->[2]+($strand eq "-" ? $subregion->[0]-$cnv_end : $cnv_end-$subregion->[0]);
 | 
| 
 | 
  1214             }
 | 
| 
 | 
  1215           }
 | 
| 
 | 
  1216         }
 | 
| 
 | 
  1217       }
 | 
| 
 | 
  1218 
 | 
| 
 | 
  1219     my $ends_internally = 0;
 | 
| 
 | 
  1220     if($cnv_exon_end == 0){ # ends inside the exon
 | 
| 
 | 
  1221       $cnv_exon_end = $cnv_end;
 | 
| 
 | 
  1222       $ends_internally = 1;
 | 
| 
 | 
  1223     }
 | 
| 
 | 
  1224     # See if it's in the structural variant database
 | 
| 
 | 
  1225     my @gain_coverage; $#gain_coverage = $cnv_exon_end-$cnv_exon_start; # preallocate blanks
 | 
| 
 | 
  1226     my @loss_coverage; $#loss_coverage = $cnv_exon_end-$cnv_exon_start; # preallocate blanks
 | 
| 
 | 
  1227     my $dgv_loss_id; # report the DGV entry that covers most of the observed structural variant 
 | 
| 
 | 
  1228     my $dgv_loss_length = 0; # report the DGV entry that covers most of the observed structural variant 
 | 
| 
 | 
  1229     my $dgv_gain_id; # report the DGV entry that covers most of the observed structural variant 
 | 
| 
 | 
  1230     my $dgv_gain_length = 0; # report the DGV entry that covers most of the observed structural variant 
 | 
| 
 | 
  1231     my $gains;
 | 
| 
 | 
  1232     my $losses;
 | 
| 
 | 
  1233     my $dgv_chr = $cnv_chr;
 | 
| 
 | 
  1234     $dgv_chr =~ s/^chr//; # no prefix in DGV
 | 
| 
 | 
  1235     #open(DGV, "tabix $dgv_file $dgv_chr:$cnv_exon_start-$cnv_exon_end |") # check out CNV in this gene model region
 | 
| 
 | 
  1236     #  or die "Cannot run tabix: $!\n";
 | 
| 
 | 
  1237     open(DGV, "/dev/null");
 | 
| 
 | 
  1238     while(<DGV>){
 | 
| 
 | 
  1239       my @C = split /\t/, $_;
 | 
| 
 | 
  1240       next if $C[4] ne "CNV"; # todo: handle indels?
 | 
| 
 | 
  1241       my $dgv_start = $C[2];
 | 
| 
 | 
  1242       my $dgv_end = $C[3];
 | 
| 
 | 
  1243       my $dgv_direction = $C[5];
 | 
| 
 | 
  1244       my $gain_cov_count = 0;
 | 
| 
 | 
  1245       my $loss_cov_count = 0;
 | 
| 
 | 
  1246       if($dgv_direction eq "Gain"){
 | 
| 
 | 
  1247         for(my $i = ($dgv_start < $cnv_exon_start ? $cnv_exon_start : $dgv_start); $i <= $dgv_end and $i <= $cnv_exon_end; $i++){
 | 
| 
 | 
  1248           $gain_coverage[$i-$cnv_exon_start] = 1 unless defined $gain_coverage[$i-$cnv_exon_start];
 | 
| 
 | 
  1249           $gain_cov_count++;
 | 
| 
 | 
  1250         }
 | 
| 
 | 
  1251       }
 | 
| 
 | 
  1252       elsif($dgv_direction eq "Loss"){
 | 
| 
 | 
  1253         for(my $i = ($dgv_start < $cnv_exon_start ? $cnv_exon_start : $dgv_start); $i <= $dgv_end and $i <= $cnv_exon_end; $i++){
 | 
| 
 | 
  1254           $loss_coverage[$i-$cnv_exon_start] = 1 unless defined $loss_coverage[$i-$cnv_exon_start];
 | 
| 
 | 
  1255           $loss_cov_count++;
 | 
| 
 | 
  1256         }
 | 
| 
 | 
  1257       }
 | 
| 
 | 
  1258       if($dgv_direction eq "Gain" and $gain_cov_count > $dgv_gain_length){
 | 
| 
 | 
  1259         $dgv_gain_id = $C[0];
 | 
| 
 | 
  1260         $dgv_gain_length = $gain_cov_count;
 | 
| 
 | 
  1261       }
 | 
| 
 | 
  1262       if($dgv_direction eq "Loss" and $loss_cov_count > $dgv_loss_length){
 | 
| 
 | 
  1263         $dgv_loss_id = $C[0];
 | 
| 
 | 
  1264         $dgv_loss_length = $loss_cov_count;
 | 
| 
 | 
  1265       }
 | 
| 
 | 
  1266     }
 | 
| 
 | 
  1267     close(DGV);
 | 
| 
 | 
  1268 
 | 
| 
 | 
  1269     my $gain_coverage = 0;
 | 
| 
 | 
  1270     for my $count (@gain_coverage){
 | 
| 
 | 
  1271       $gain_coverage++ if defined $count;
 | 
| 
 | 
  1272     }
 | 
| 
 | 
  1273     $gain_coverage = sprintf "%.3f", $gain_coverage/($cnv_exon_end-$cnv_exon_start+1); # make it a proportion
 | 
| 
 | 
  1274     my $loss_coverage = 0;
 | 
| 
 | 
  1275     for my $count (@loss_coverage){
 | 
| 
 | 
  1276       $loss_coverage++ if defined $count;
 | 
| 
 | 
  1277     }
 | 
| 
 | 
  1278     $loss_coverage = sprintf "%.3f", $loss_coverage/($cnv_exon_end-$cnv_exon_start+1); # make it a proportion
 | 
| 
 | 
  1279 
 | 
| 
 | 
  1280     my $src = "DGV";
 | 
| 
 | 
  1281     my $dgv_id = "NA";
 | 
| 
 | 
  1282     my $dgv_caveat;
 | 
| 
 | 
  1283     my $dgv_coverage;
 | 
| 
 | 
  1284     if($ploidy > 2){
 | 
| 
 | 
  1285       if(not defined $dgv_gain_id){
 | 
| 
 | 
  1286         if(defined $dgv_loss_id){
 | 
| 
 | 
  1287           $dgv_id = sprintf "%s/%.3f", $dgv_loss_id, $dgv_loss_length/($cnv_exon_end-$cnv_exon_start+1); 
 | 
| 
 | 
  1288           $dgv_caveat = "; No gains are known in healthy controls, the DGV2 ID reported is for a loss in the same area";
 | 
| 
 | 
  1289           $dgv_coverage = $loss_coverage;
 | 
| 
 | 
  1290         }
 | 
| 
 | 
  1291         else{
 | 
| 
 | 
  1292           $dgv_id = "novel";
 | 
| 
 | 
  1293           $dgv_coverage = "NA";
 | 
| 
 | 
  1294           $src = "NA";
 | 
| 
 | 
  1295         }
 | 
| 
 | 
  1296       }
 | 
| 
 | 
  1297       else{
 | 
| 
 | 
  1298         $dgv_id = sprintf "%s/%.3f", $dgv_gain_id, $dgv_gain_length/($cnv_exon_end-$cnv_exon_start+1);
 | 
| 
 | 
  1299         $dgv_coverage = $gain_coverage;
 | 
| 
 | 
  1300       }
 | 
| 
 | 
  1301     }
 | 
| 
 | 
  1302     elsif($ploidy < 2){
 | 
| 
 | 
  1303       if(not defined $dgv_loss_id){
 | 
| 
 | 
  1304         if(defined $dgv_gain_id){
 | 
| 
 | 
  1305           $dgv_id = sprintf "%s/%.3f", $dgv_gain_id, $dgv_gain_length/($cnv_exon_end-$cnv_exon_start+1); 
 | 
| 
 | 
  1306           $dgv_caveat = "; No losses are known in healthy controls, the DGV2 ID reported is for a gain in the same area";
 | 
| 
 | 
  1307           $dgv_coverage = $gain_coverage;
 | 
| 
 | 
  1308         }
 | 
| 
 | 
  1309         else{
 | 
| 
 | 
  1310           $dgv_id = "novel";
 | 
| 
 | 
  1311           $dgv_coverage = "NA";
 | 
| 
 | 
  1312           $src = "NA";
 | 
| 
 | 
  1313         }
 | 
| 
 | 
  1314       }
 | 
| 
 | 
  1315       else{
 | 
| 
 | 
  1316         $dgv_id = sprintf "%s/%.3f", $dgv_loss_id, $dgv_loss_length/($cnv_exon_end-$cnv_exon_start+1);
 | 
| 
 | 
  1317         $dgv_coverage = $loss_coverage;
 | 
| 
 | 
  1318       }
 | 
| 
 | 
  1319     }
 | 
| 
 | 
  1320     
 | 
| 
 | 
  1321     my $non_coding = 0;
 | 
| 
 | 
  1322     if(not exists $feature_cds_max{$target_parent} or not defined $feature_cds_max{$target_parent} or $feature_cds_max{$target_parent} eq ""){
 | 
| 
 | 
  1323         $non_coding = 1;
 | 
| 
 | 
  1324     } 
 | 
| 
 | 
  1325     $target_caveats .= $dgv_caveat if defined $dgv_caveat and $dgv_id ne "novel" and $target_caveats !~ /\Q$dgv_caveat\E/;
 | 
| 
 | 
  1326     #print "Recorded $cnv_chr:$cnv_start caveat $caveats\n";
 | 
| 
 | 
  1327       # if it doesn't overlap an exon, we need to find out which two exons it's between
 | 
| 
 | 
  1328     if($ends_internally){
 | 
| 
 | 
  1329         my $intron_found = 0;
 | 
| 
 | 
  1330         for(my $i = 0; $i < $#feature_ranges; $i++){
 | 
| 
 | 
  1331           if($feature_ranges[$i]->[1] < $cnv_start and $feature_ranges[$i+1]->[0] > $cnv_end){
 | 
| 
 | 
  1332             if($ploidy > 2){ # gain
 | 
| 
 | 
  1333               if($strand eq "-"){
 | 
| 
 | 
  1334                 record_snv("$target_parent\t",
 | 
| 
 | 
  1335                    ($non_coding ? "g.$cnv_start\_$cnv_end" :
 | 
| 
 | 
  1336                     "c.".($feature_ranges[$i+1]->[2])."+".($feature_ranges[$i+1]->[0]-$cnv_end)."_".($feature_ranges[$i+1]->[2]+1)."-".($cnv_start-$feature_ranges[$i]->[1])),
 | 
| 
 | 
  1337                 # pos	Zygosity	P-value	Variant Reads	Total Reads	Ref Bases	Var Bases	Population Frequency Source	Pop. freq. or DGV2 gain/loss coverage	dbSNP or DGV2 ID
 | 
| 
 | 
  1338                    "[".($ploidy-1)."]\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\tNA\t$p_value\tNA\tNA\t",
 | 
| 
 | 
  1339                    "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n");
 | 
| 
 | 
  1340               }
 | 
| 
 | 
  1341               else{
 | 
| 
 | 
  1342                 record_snv("$target_parent\t",
 | 
| 
 | 
  1343                    ($non_coding ? "g.$cnv_start\_$cnv_end" :
 | 
| 
 | 
  1344                    "c.".($feature_ranges[$i+1]->[2]-1)."+".($cnv_start-$feature_ranges[$i]->[1])."_".$feature_ranges[$i+1]->[2]."-".($feature_ranges[$i+1]->[0]-$cnv_end)),
 | 
| 
 | 
  1345                    "[".($ploidy-1)."]\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\tNA\t$p_value\tNA\tNA\t",
 | 
| 
 | 
  1346                    "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n");
 | 
| 
 | 
  1347               }
 | 
| 
 | 
  1348             }
 | 
| 
 | 
  1349             else{ # loss
 | 
| 
 | 
  1350               if($strand eq "-"){
 | 
| 
 | 
  1351                 record_snv("$target_parent\t",
 | 
| 
 | 
  1352                    ($non_coding ? "g.$cnv_start\_$cnv_end" : 
 | 
| 
 | 
  1353                    "c.".($feature_ranges[$i+1]->[2])."+".($feature_ranges[$i+1]->[0]-$cnv_end)."_".($feature_ranges[$i+1]->[2]+1)."-".($cnv_start-$feature_ranges[$i]->[1])),
 | 
| 
 | 
  1354                    "del\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\t", ($ploidy == 1 ? "heterozygote" : "homozygote"), "\t$p_value\tNA\tNA\t",
 | 
| 
 | 
  1355                    "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n");
 | 
| 
 | 
  1356               }
 | 
| 
 | 
  1357               else{
 | 
| 
 | 
  1358                 record_snv("$target_parent\t",
 | 
| 
 | 
  1359                    ($non_coding ? "g.$cnv_start\_$cnv_end" :
 | 
| 
 | 
  1360                    "c.".($feature_ranges[$i+1]->[2]-1)."+".($cnv_start-$feature_ranges[$i]->[1])."_".$feature_ranges[$i+1]->[2]."-".($feature_ranges[$i+1]->[0]-$cnv_end)),
 | 
| 
 | 
  1361                    "del\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\t", ($ploidy == 1 ? "heterozygote" : "homozygote"), "\t$p_value\tNA\tNA\t",
 | 
| 
 | 
  1362                    "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n");
 | 
| 
 | 
  1363               }
 | 
| 
 | 
  1364             }
 | 
| 
 | 
  1365             $intron_found = 1; last;
 | 
| 
 | 
  1366           }
 | 
| 
 | 
  1367         }
 | 
| 
 | 
  1368         warn "Logic error: CNV overlaps a gene ($target_parent), but is neither intronic nor exonic. Offending CNV was $_\n" unless $intron_found;
 | 
| 
 | 
  1369         next;
 | 
| 
 | 
  1370       }
 | 
| 
 | 
  1371       if($strand eq "-"){
 | 
| 
 | 
  1372         my $tmp = $cnv_cdna_start;
 | 
| 
 | 
  1373         $cnv_cdna_start = $cnv_cdna_end;
 | 
| 
 | 
  1374         $cnv_cdna_end = $tmp;
 | 
| 
 | 
  1375       }
 | 
| 
 | 
  1376       # Make the location label pretty descriptive
 | 
| 
 | 
  1377       my $cnv_phase = "";
 | 
| 
 | 
  1378       if($cnv_exon_start > $cnv_start or $cnv_exon_end < $cnv_end){
 | 
| 
 | 
  1379         $cnv_phase = "CNV-$cnv_chr:$cnv_start-$cnv_end"; # Use phase to note that it's part of a bigger CNV than just the range of this feature
 | 
| 
 | 
  1380       }
 | 
| 
 | 
  1381       # if we get here, we're in a gained/deleted exon category
 | 
| 
 | 
  1382       # CNVs are fuzzy-edged (as opposed to large indels), so produce HGVS syntax that reflect this
 | 
| 
 | 
  1383       if($ploidy > 2){ # gain
 | 
| 
 | 
  1384         # find the exons encompassed by the CNV. NOTE that we assume that polyploidies are proximal 
 | 
| 
 | 
  1385         record_snv("$target_parent\t",
 | 
| 
 | 
  1386                    ($non_coding ? "g.".($cnv_exon_start > $cnv_start ? "$cnv_exon_start-?" : $cnv_start)."_".($cnv_exon_end < $cnv_end ? "$cnv_exon_end+?" : $cnv_end) : 
 | 
| 
 | 
  1387                    "c.$cnv_cdna_start".($off5?"-?":"")."_$cnv_cdna_end".($off3?"+?":"")),
 | 
| 
 | 
  1388                    "[".($ploidy-1)."]\t$strand\t$cnv_chr\t$cnv_exon_start\t$cnv_exon_end\tNA\t$p_value\tNA\tNA\t",
 | 
| 
 | 
  1389                    "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t$cnv_phase\n");
 | 
| 
 | 
  1390       }
 | 
| 
 | 
  1391       else{ # loss
 | 
| 
 | 
  1392         #translate genome coordinates into cDNA coordinates
 | 
| 
 | 
  1393         record_snv("$target_parent\t",
 | 
| 
 | 
  1394                    ($non_coding ? "g.".($cnv_exon_start > $cnv_start ? "$cnv_exon_start-?" : $cnv_start)."_".($cnv_exon_end < $cnv_end ? "$cnv_exon_end+?" : $cnv_end) : 
 | 
| 
 | 
  1395                    "c.$cnv_cdna_start".($off5?"-?":"")."_$cnv_cdna_end".($off3?"+?":"")),
 | 
| 
 | 
  1396                    "del\t$strand\t$cnv_chr\t$cnv_exon_start\t$cnv_exon_end\t", ($ploidy == 1 ? "heterozygote" : "homozygote"), "\t$p_value\tNA\tNA\t",
 | 
| 
 | 
  1397                    "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t$cnv_phase\n");
 | 
| 
 | 
  1398       }
 | 
| 
 | 
  1399     } 
 | 
| 
 | 
  1400   }
 | 
| 
 | 
  1401   close(CNV);
 | 
| 
 | 
  1402 
 | 
| 
 | 
  1403 }
 | 
| 
 | 
  1404 
 | 
| 
 | 
  1405  
 | 
| 
 | 
  1406 #sort genes by start, then longest if tied
 | 
| 
 | 
  1407 my %rc = qw(A T T A G C C G N N S W W S K M M K Y R R Y X X);
 | 
| 
 | 
  1408 print STDERR "Processing variant calls..." unless $quiet;
 | 
| 
 | 
  1409 %chr_read = ();
 | 
| 
 | 
  1410 open(VCFIN, $input_file)
 | 
| 
 | 
  1411     or die "Cannot open $input_file for reading: $!\n";
 | 
| 
 | 
  1412 while(<VCFIN>){
 | 
| 
 | 
  1413     if(/^\s*(?:#|$)/){ # blank or hash comment lines
 | 
| 
 | 
  1414        next;
 | 
| 
 | 
  1415     }
 | 
| 
 | 
  1416     chomp;
 | 
| 
 | 
  1417     my @fields = split /\t/, $_;
 | 
| 
 | 
  1418  
 | 
| 
 | 
  1419     next unless exists $feature_range{$fields[0]};
 | 
| 
 | 
  1420     if(not $quiet and not exists $chr_read{$fields[0]}){
 | 
| 
 | 
  1421 	print STDERR " $fields[0]";
 | 
| 
 | 
  1422 	$chr_read{$fields[0]} = 1;
 | 
| 
 | 
  1423 	#print STDERR "(not in reference file!)" unless exists $feature_range{$fields[0]};
 | 
| 
 | 
  1424     }
 | 
| 
 | 
  1425 
 | 
| 
 | 
  1426     next if $fields[4] eq "<NON_REF>"; # GVCF background stuff
 | 
| 
 | 
  1427     next if $fields[9] eq "./." or $fields[9] eq "."; # no call
 | 
| 
 | 
  1428     my $chr = $fields[0];
 | 
| 
 | 
  1429     next if defined $which_chr and $chr ne $which_chr and $chr ne "chr$which_chr" and "chr$chr" ne $which_chr;
 | 
| 
 | 
  1430     print STDERR "passes chr and field # test" if grep /dataset_7684.dat/, @ARGV;
 | 
| 
 | 
  1431     $chr = "chr$chr" if $chr !~ /^chr/;
 | 
| 
 | 
  1432     $fields[8] =~ s/\s+$//;
 | 
| 
 | 
  1433     my @values = split /:/, $fields[9];
 | 
| 
 | 
  1434     #print STDERR join(" / ", @values), "\n" if $. == 84;
 | 
| 
 | 
  1435     my @keys = split /:/, $fields[8];
 | 
| 
 | 
  1436     my $zygosity = "n/a";
 | 
| 
 | 
  1437     my $quality = "n/a";
 | 
| 
 | 
  1438     my $read_depth = "n/a";
 | 
| 
 | 
  1439     my $variant_depths = "n/a";
 | 
| 
 | 
  1440     for(my $i = 0; $i <= $#keys and $i <= $#values; $i++){ # values max index check because some genotypers are nasty and don't provide as many fields as they say they will
 | 
| 
 | 
  1441 	if($keys[$i] eq "GT"){
 | 
| 
 | 
  1442             if($values[$i] =~ /\./ or $values[$i] =~ /0\/0/){ # one genotype not described
 | 
| 
 | 
  1443               $zygosity = "none";
 | 
| 
 | 
  1444               last;
 | 
| 
 | 
  1445             }
 | 
| 
 | 
  1446 	    else{ # genotypes described
 | 
| 
 | 
  1447               $zygosity = $values[$i] =~ /[02]/ ? "heterozygote" : "homozygote";
 | 
| 
 | 
  1448             }
 | 
| 
 | 
  1449 	}
 | 
| 
 | 
  1450         elsif($keys[$i] eq "DNM_CONFIG" and $zygosity eq "n/a"){ # hack
 | 
| 
 | 
  1451             $zygosity = $values[$i] =~ /^(.)\1/ ? "homozygote" : "heterozygote";
 | 
| 
 | 
  1452         }
 | 
| 
 | 
  1453     	elsif($keys[$i] eq "GQ" and $values[$i] ne "."){
 | 
| 
 | 
  1454             #print "Checking GQ (index $i) $values[$i] gq2p\n" if $. == 84;
 | 
| 
 | 
  1455 	    $quality = gq2p($values[$i]);
 | 
| 
 | 
  1456 	}
 | 
| 
 | 
  1457     	elsif($keys[$i] eq "RD"){ # from some tools like denovo variant finders
 | 
| 
 | 
  1458 	    $read_depth = $values[$i];
 | 
| 
 | 
  1459 	}
 | 
| 
 | 
  1460     	elsif($keys[$i] eq "DP"){
 | 
| 
 | 
  1461 	    $read_depth = $values[$i];
 | 
| 
 | 
  1462 	}
 | 
| 
 | 
  1463         # the frequency of the variant can go by many names, here we have freebayes (A* are new and old versions) and atlas2_indel
 | 
| 
 | 
  1464     	elsif($keys[$i] eq "AA" or $keys[$i] eq "VR" or $keys[$i] eq "AO"){
 | 
| 
 | 
  1465 	    $variant_depths = $values[$i];
 | 
| 
 | 
  1466 	}
 | 
| 
 | 
  1467         # here we have GATK variant freq of form ref#,var#
 | 
| 
 | 
  1468     	elsif($keys[$i] eq "AD"){
 | 
| 
 | 
  1469 	    $variant_depths = $values[$i];
 | 
| 
 | 
  1470             $variant_depths =~ s/^\d+,//;
 | 
| 
 | 
  1471 	}
 | 
| 
 | 
  1472         else{
 | 
| 
 | 
  1473             #print STDERR "Ignoring field $keys[$i]\n";
 | 
| 
 | 
  1474         }
 | 
| 
 | 
  1475     }
 | 
| 
 | 
  1476     next if $zygosity eq "none"; # GVCF non-ref for example or when multiple samples are reported simultaneously
 | 
| 
 | 
  1477     $quality = z2p($1) if $fields[7] =~ /Z=(\d+\.\d+)/;
 | 
| 
 | 
  1478     if($quality eq "n/a" and $fields[5] ne "."){
 | 
| 
 | 
  1479       $quality = gq2p($fields[5]);
 | 
| 
 | 
  1480     }
 | 
| 
 | 
  1481     if($fields[5] eq "0" and $fields[6] eq "PASS"){ # not qual and a PASS in the filter column
 | 
| 
 | 
  1482       $quality = 1;
 | 
| 
 | 
  1483     }
 | 
| 
 | 
  1484     elsif($quality ne "n/a" and $quality > $min_pvalue){ # p-value cutoff
 | 
| 
 | 
  1485       #print "Checking call quality $fields[5]  gq2p\n" if $. == 84;
 | 
| 
 | 
  1486       next unless gq2p($fields[5]) <= $min_pvalue; # in some cases, programs like FreeBayes give low genotype quality such as when a call is borderline homo/het
 | 
| 
 | 
  1487                                                    # in these cases it would be silly to reject the call if their are many supporting reads.
 | 
| 
 | 
  1488     }
 | 
| 
 | 
  1489 
 | 
| 
 | 
  1490     # Some tools like GATK don't report number of variant reads...infer from other data if possible
 | 
| 
 | 
  1491     if($variant_depths eq "n/a"){
 | 
| 
 | 
  1492       my @key_value_pairs = split /;/, $fields[7];
 | 
| 
 | 
  1493       for my $key_value_pair (@key_value_pairs){
 | 
| 
 | 
  1494         if($key_value_pair !~ /^(.*?)=(.*)$/){
 | 
| 
 | 
  1495           next;
 | 
| 
 | 
  1496           #next if $key_value_pair eq "INDEL"; # samtools peculiarity
 | 
| 
 | 
  1497           #die "Key-value pair field (column #8) does not have the format key=value for entry $key_value_pair (line #$. of ), please fix the VCF file\n"; 
 | 
| 
 | 
  1498         }
 | 
| 
 | 
  1499         if($1 eq "AB"){ # GATK: for het calls, AB is ref/(ref+var), only one variant reported per line
 | 
| 
 | 
  1500           $variant_depths = "";
 | 
| 
 | 
  1501           for my $ab (split /,/, $2){
 | 
| 
 | 
  1502             $variant_depths .= int((1-$ab)*$read_depth).","; 
 | 
| 
 | 
  1503           }
 | 
| 
 | 
  1504           chop $variant_depths;
 | 
| 
 | 
  1505         }
 | 
| 
 | 
  1506         elsif($1 eq "MLEAC"){
 | 
| 
 | 
  1507         }
 | 
| 
 | 
  1508         elsif($1 eq "DP4"){ # samtools: high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases
 | 
| 
 | 
  1509           my @rds = split /,/, $2;
 | 
| 
 | 
  1510           $variant_depths = $rds[2]+$rds[3]; 
 | 
| 
 | 
  1511           $read_depth = $rds[0]+$rds[1]+$variant_depths; 
 | 
| 
 | 
  1512           if($fields[4] =~ /,/){ # samtools doesn't break down compound het calls into individual frequencies
 | 
| 
 | 
  1513             my $num_alt_genotypes = $fields[4] =~ tr/,/,/;
 | 
| 
 | 
  1514             $num_alt_genotypes++;
 | 
| 
 | 
  1515             my $even_prop = sprintf "%.2f", $variant_depths/$read_depth/$num_alt_genotypes;
 | 
| 
 | 
  1516             $variant_depths = join(",", ($even_prop) x $num_alt_genotypes); 
 | 
| 
 | 
  1517             if(not exists $chr2caveats{"$chr:$fields[1]"}){
 | 
| 
 | 
  1518               $chr2caveats{"$chr:$fields[1]"} = "compound het var freq n/a in orig VCF file, auto set to $even_prop";
 | 
| 
 | 
  1519             }
 | 
| 
 | 
  1520             else{
 | 
| 
 | 
  1521               $chr2caveats{"$chr:$fields[1]"} .= "; compound het var freq n/a in orig VCF file, auto set to $even_prop";
 | 
| 
 | 
  1522             }
 | 
| 
 | 
  1523           }
 | 
| 
 | 
  1524         }
 | 
| 
 | 
  1525       }
 | 
| 
 | 
  1526     }
 | 
| 
 | 
  1527     if($variant_depths eq "n/a"){ # usually homo var calls, can only assume all reads are variant
 | 
| 
 | 
  1528       if($zygosity eq "homozygote"){
 | 
| 
 | 
  1529         $variant_depths = $read_depth;
 | 
| 
 | 
  1530         if($read_depth ne "n/a"){
 | 
| 
 | 
  1531           if(not exists $chr2caveats{"$chr:$fields[1]"}){
 | 
| 
 | 
  1532             $chr2caveats{"$chr:$fields[1]"} = "homo var freq n/a in orig VCF file, auto set to 1.0";
 | 
| 
 | 
  1533           }
 | 
| 
 | 
  1534           else{
 | 
| 
 | 
  1535             $chr2caveats{"$chr:$fields[1]"} = "; homo var freq n/a in orig VCF file, auto set to 1.0";
 | 
| 
 | 
  1536           }
 | 
| 
 | 
  1537         }
 | 
| 
 | 
  1538       }
 | 
| 
 | 
  1539       else{
 | 
| 
 | 
  1540         if($read_depth ne "n/a"){
 | 
| 
 | 
  1541           $variant_depths = int($read_depth/2);
 | 
| 
 | 
  1542           if(not exists $chr2caveats{"$chr:$fields[1]"}){
 | 
| 
 | 
  1543             $chr2caveats{"$chr:$fields[1]"} = "het var freq n/a in orig VCF file, auto set to 0.5";
 | 
| 
 | 
  1544           }
 | 
| 
 | 
  1545           else{
 | 
| 
 | 
  1546             $chr2caveats{"$chr:$fields[1]"} = "; het var freq n/a in orig VCF file, auto set to 0.5";
 | 
| 
 | 
  1547           }
 | 
| 
 | 
  1548         }
 | 
| 
 | 
  1549         else{
 | 
| 
 | 
  1550           $variant_depths = $read_depth;
 | 
| 
 | 
  1551         }
 | 
| 
 | 
  1552       }
 | 
| 
 | 
  1553     }
 | 
| 
 | 
  1554 
 | 
| 
 | 
  1555     my $target_parents = $feature_intervaltree{$chr}->fetch($fields[1]-$flanking_bases, $fields[1]+length($fields[3])+$flanking_bases);
 | 
| 
 | 
  1556     # Last ditch, if not in a gene model, is it at least in an enrichment region?
 | 
| 
 | 
  1557     if(not @$target_parents){
 | 
| 
 | 
  1558        next if not exists $enrichment_regions{$chr};
 | 
| 
 | 
  1559        my $regions_ref = $enrichment_regions{$chr};
 | 
| 
 | 
  1560        my $location = $fields[1];
 | 
| 
 | 
  1561        my $strand = "+"; # for lack of a better choice
 | 
| 
 | 
  1562        for(my $i = find_earliest_index($location-$flanking_bases, $regions_ref); 
 | 
| 
 | 
  1563            $i < $#$regions_ref and $location <= $regions_ref->[$i]->[1]+$flanking_bases; 
 | 
| 
 | 
  1564            $i++){
 | 
| 
 | 
  1565           next unless $regions_ref->[$i]->[0]-$flanking_bases <= $location and $regions_ref->[$i]->[1]+$flanking_bases >= $location;
 | 
| 
 | 
  1566  
 | 
| 
 | 
  1567           my $feature_name = "enrichment_target_$chr:".$regions_ref->[$i]->[0]."-".$regions_ref->[$i]->[1];
 | 
| 
 | 
  1568           $feature_type{$feature_name} = "misc_enrichment_kit_target";
 | 
| 
 | 
  1569           $feature_length{$feature_name} = $regions_ref->[$i]->[1]-$regions_ref->[$i]->[0]+1;
 | 
| 
 | 
  1570           my @variants = split /,/, $fields[4];
 | 
| 
 | 
  1571           my @variant_depths = split /,/, $variant_depths;
 | 
| 
 | 
  1572           my $ref = uc($fields[3]);
 | 
| 
 | 
  1573           for(my $j = 0; $j <= $#variants; $j++){
 | 
| 
 | 
  1574             my $variant = $variants[$j];
 | 
| 
 | 
  1575             next if $variant eq "<NON_REF>" or $variant_depths[$j] eq "0"; # GVCF stuff
 | 
| 
 | 
  1576             my $variant_depth = $variant_depths[$j];
 | 
| 
 | 
  1577             if($min_prop){
 | 
| 
 | 
  1578                next unless $variant_depth >= $min_depth and $read_depth ne "n/a" and $variant_depth/$read_depth >= $min_prop;
 | 
| 
 | 
  1579             }
 | 
| 
 | 
  1580             if(length($ref) == 1 and length($variant) == 1){ # SNP
 | 
| 
 | 
  1581               record_snv("$feature_name\tg.$location",
 | 
| 
 | 
  1582                          "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1583                          join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
 | 
| 
 | 
  1584             }
 | 
| 
 | 
  1585             elsif(length($ref) == 1 and length($variant) > 1){ # insertion
 | 
| 
 | 
  1586               record_snv("$feature_name\tg.$location\_",($location+1),
 | 
| 
 | 
  1587                          "ins",substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1588                          join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
 | 
| 
 | 
  1589             }
 | 
| 
 | 
  1590             elsif(length($variant) == 1 and length($ref) > 1){ # deletion
 | 
| 
 | 
  1591               record_snv("$feature_name\tg.$location\_",($location+length($ref)-1),
 | 
| 
 | 
  1592                          "del",substr($ref, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1593                          join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
 | 
| 
 | 
  1594             }
 | 
| 
 | 
  1595             else{ # indel
 | 
| 
 | 
  1596               record_snv("$feature_name\tg.$location\_",($location+length($ref)-1),
 | 
| 
 | 
  1597                          "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1598                          join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
 | 
| 
 | 
  1599             }
 | 
| 
 | 
  1600           } # end for variants
 | 
| 
 | 
  1601           next; # process next record, we've done all we can with a non-coding-gene SNP
 | 
| 
 | 
  1602        } 
 | 
| 
 | 
  1603     }
 | 
| 
 | 
  1604 
 | 
| 
 | 
  1605     for my $target_parent (@$target_parents){
 | 
| 
 | 
  1606 
 | 
| 
 | 
  1607         print STDERR "Checking parent $target_parent for on $chr:$fields[1] $fields[3] -> $fields[4]\n" if grep /dataset_7684.dat/, @ARGV;
 | 
| 
 | 
  1608     my @feature_ranges = @{$feature_range{$chr}->{$target_parent}};
 | 
| 
 | 
  1609     # Calculate the position of the change within the feature range position
 | 
| 
 | 
  1610     my $strand = $feature_strand{$target_parent};
 | 
| 
 | 
  1611     my $trans_table = exists $feature_transl_table{$target_parent} ? $feature_transl_table{$target_parent} : $default_transl_table;
 | 
| 
 | 
  1612     $fields[4]=~tr/"//d; # sometime strangely surroundsed by quotes
 | 
| 
 | 
  1613     my @variants = split /,/, $fields[4];
 | 
| 
 | 
  1614     my @variant_depths = split /,/, $variant_depths;
 | 
| 
 | 
  1615     my @refs = (uc($fields[3])) x scalar(@variants);
 | 
| 
 | 
  1616     my @locations = ($fields[1]) x scalar(@variants);
 | 
| 
 | 
  1617 
 | 
| 
 | 
  1618     for(my $j = 0; $j <= $#variants; $j++){
 | 
| 
 | 
  1619 	my $ref = $refs[$j];
 | 
| 
 | 
  1620 	my $location = $locations[$j];
 | 
| 
 | 
  1621 	my $feature_offset = 0;
 | 
| 
 | 
  1622 	my $feature_num = 0;
 | 
| 
 | 
  1623 	my $variant = uc($variants[$j]);
 | 
| 
 | 
  1624         next if $variant eq "<NON_REF>" or $variant_depths[$j] eq "0"; # GVCF stuff
 | 
| 
 | 
  1625 	my $variant_depth = $variant_depths[$j] || "n/a";
 | 
| 
 | 
  1626         #print STDERR "Evaluating target parent $target_parent for $chr:$fields[1]-".($fields[1]+length($fields[3]))." -> ",join("/", @$target_parents) , "\n" if $fields[1] == 982994;
 | 
| 
 | 
  1627 
 | 
| 
 | 
  1628         # Break down MNPs into individual SNPs that are phased (TODO: skip if it's an inversion? would require amalgamating SNPs for tools that call them individually, phased :-P)
 | 
| 
 | 
  1629         if(length($variant) > 1 and length($variant) == length($ref)){
 | 
| 
 | 
  1630            my @subvariants;
 | 
| 
 | 
  1631            my @subrefs;
 | 
| 
 | 
  1632            my @sublocations;
 | 
| 
 | 
  1633            my $phase_range = $location."-".($location+length($ref)-1);
 | 
| 
 | 
  1634            for(my $k = 0; $k < length($variant); $k++){ 
 | 
| 
 | 
  1635              my $r = substr($ref, $k, 1);
 | 
| 
 | 
  1636              my $v = substr($variant, $k, 1);
 | 
| 
 | 
  1637              if($r ne $v){
 | 
| 
 | 
  1638                push @subvariants, $v;
 | 
| 
 | 
  1639                push @subrefs, $r;
 | 
| 
 | 
  1640                push @sublocations, $location+$k;
 | 
| 
 | 
  1641              }
 | 
| 
 | 
  1642              elsif(@variants == 1){
 | 
| 
 | 
  1643                next; # homo ref call
 | 
| 
 | 
  1644              }
 | 
| 
 | 
  1645              if($zygosity eq "heterozygote"){
 | 
| 
 | 
  1646                # need to ignore case where a homozygous call (variant or ref) is included in a double non-ref het MNP
 | 
| 
 | 
  1647                if(@variants > 1){
 | 
| 
 | 
  1648                  my $homo = 1;
 | 
| 
 | 
  1649                  for(my $l = 0; $l <= $#variants; $l++){ # using loop in case we handle oligoploid genomes in the future
 | 
| 
 | 
  1650                    if(length($variants[$l]) <= $k or substr($variants[$l], $k, 1) ne $v){
 | 
| 
 | 
  1651                      $homo = 0;
 | 
| 
 | 
  1652                      last;
 | 
| 
 | 
  1653                    }
 | 
| 
 | 
  1654                  }
 | 
| 
 | 
  1655                  next if $homo;
 | 
| 
 | 
  1656                }
 | 
| 
 | 
  1657                my $phase_key = "$chr:".($location+$k).":$v";
 | 
| 
 | 
  1658                my $phase_label = "M$j-$chr:$phase_range";
 | 
| 
 | 
  1659                if(exists $chr2phase{$phase_key}){
 | 
| 
 | 
  1660                  if($chr2phase{$phase_key} !~ /$phase_label/){
 | 
| 
 | 
  1661                    $chr2phase{$phase_key} .= ",$phase_label";
 | 
| 
 | 
  1662                  }
 | 
| 
 | 
  1663                }
 | 
| 
 | 
  1664                else{
 | 
| 
 | 
  1665                  $chr2phase{$phase_key} = $phase_label;
 | 
| 
 | 
  1666                }
 | 
| 
 | 
  1667              }
 | 
| 
 | 
  1668            }
 | 
| 
 | 
  1669            # recycle this MNP variant loop to start processing the individual SNPs 
 | 
| 
 | 
  1670            splice(@refs, $j, 1, @subrefs);
 | 
| 
 | 
  1671            splice(@variants, $j, 1, @subvariants);
 | 
| 
 | 
  1672            splice(@locations, $j, 1, @sublocations);
 | 
| 
 | 
  1673            splice(@variant_depths, $j, 1, ($variant_depth) x scalar(@subvariants));
 | 
| 
 | 
  1674            $j--;
 | 
| 
 | 
  1675            next;
 | 
| 
 | 
  1676         }
 | 
| 
 | 
  1677 
 | 
| 
 | 
  1678         if($min_prop != 0 and $variant_depth eq "n/a" or $variant_depth eq "."){
 | 
| 
 | 
  1679           print STDERR "Could not parse variant depth from $_\n" unless $quiet;
 | 
| 
 | 
  1680           next;
 | 
| 
 | 
  1681         }
 | 
| 
 | 
  1682         next unless $min_prop == 0 or $min_prop and $variant_depth >= $min_depth and $read_depth ne "n/a" and $variant_depth/$read_depth >= $min_prop;
 | 
| 
 | 
  1683         if($zygosity eq "NA"){
 | 
| 
 | 
  1684           # make the call ourselves
 | 
| 
 | 
  1685           $zygosity = $variant_depths/$read_depth > 1-$min_prop ? "homozygote" : "heterozygote";
 | 
| 
 | 
  1686         }
 | 
| 
 | 
  1687 	# e.g. chr22   47857671   . CAAAG   AAGAT,AAAAG    29.04  .
 | 
| 
 | 
  1688 	if(length($variant) > 1 and length($variant) == length($ref)){	    
 | 
| 
 | 
  1689 	    for(my $k = 0; $k < length($variant); $k++){
 | 
| 
 | 
  1690 		my $fixed_variant = $variant;
 | 
| 
 | 
  1691 		substr($fixed_variant, $k, 1) = substr($ref, $k, 1);
 | 
| 
 | 
  1692 		if($fixed_variant eq $ref){ # single base difference at base k between the two
 | 
| 
 | 
  1693 		    $ref = substr($ref, $k, 1);
 | 
| 
 | 
  1694 		    $variant = substr($variant, $k, 1);
 | 
| 
 | 
  1695 		    $location += $k;
 | 
| 
 | 
  1696 		    last;
 | 
| 
 | 
  1697 		}
 | 
| 
 | 
  1698 	    }
 | 
| 
 | 
  1699 	}
 | 
| 
 | 
  1700 
 | 
| 
 | 
  1701         # samtools reports indels with long common tails, causing non-canonical HGVS reporting and a problem when looking up the variant in dbSNP etc.
 | 
| 
 | 
  1702         # remove common tails to variant calls in order to try to rectify this
 | 
| 
 | 
  1703         else{
 | 
| 
 | 
  1704             while(length($ref) > 1 and length($variant) > 1 and substr($ref, length($ref)-1) eq substr($variant, length($variant)-1)){
 | 
| 
 | 
  1705                 chop $ref; chop $variant;
 | 
| 
 | 
  1706             }
 | 
| 
 | 
  1707         }
 | 
| 
 | 
  1708 
 | 
| 
 | 
  1709         # See if a caveat should be added because the indel was in a polyhomomer region
 | 
| 
 | 
  1710         if(length($ref) > length($variant) and index($ref, $variant) == 0 and $fasta_index->fetch("$chr:".($location+1)."-".($location+length($ref)+1)) =~ /^([ACGT])\1+$/i){
 | 
| 
 | 
  1711           if(not exists $chr2caveats{"$chr:$location"}){
 | 
| 
 | 
  1712             $chr2caveats{"$chr:$location"} = "poly".uc($1)." region deletion";
 | 
| 
 | 
  1713           }
 | 
| 
 | 
  1714           elsif($chr2caveats{"$chr:$location"} !~ /poly/){
 | 
| 
 | 
  1715             $chr2caveats{"$chr:$location"} .= "; poly".uc($1)." region deletion";
 | 
| 
 | 
  1716           }
 | 
| 
 | 
  1717         }
 | 
| 
 | 
  1718         elsif(length($ref) < length($variant) and index($variant, $ref) == 0 and substr($variant, 1) =~ /^([ACGT])\1+$/i){
 | 
| 
 | 
  1719           if(not exists $chr2caveats{"$chr:$location"}){
 | 
| 
 | 
  1720             $chr2caveats{"$chr:$location"} .= "poly".uc($1)." region insertion";
 | 
| 
 | 
  1721           }
 | 
| 
 | 
  1722           elsif($chr2caveats{"$chr:$location"} !~ /poly/){
 | 
| 
 | 
  1723             $chr2caveats{"$chr:$location"} .= "; poly".uc($1)." region insertion";
 | 
| 
 | 
  1724           }
 | 
| 
 | 
  1725         }
 | 
| 
 | 
  1726 
 | 
| 
 | 
  1727         # Not a protein-coding gene?  Report genomic cooordinates for HGVS
 | 
| 
 | 
  1728         if(not exists $feature_cds_max{$target_parent} or not defined $feature_cds_max{$target_parent} or $feature_cds_max{$target_parent} eq ""){
 | 
| 
 | 
  1729             if(length($ref) == 1 and length($variant) == 1){ # SNP
 | 
| 
 | 
  1730               record_snv("$target_parent\tg.$location",
 | 
| 
 | 
  1731                          "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1732                          join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
 | 
| 
 | 
  1733             }
 | 
| 
 | 
  1734             elsif(length($ref) == 1 and length($variant) > 1){ # insertion
 | 
| 
 | 
  1735               record_snv("$target_parent\tg.$location\_",($location+1),
 | 
| 
 | 
  1736                          "ins",substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1737                          join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
 | 
| 
 | 
  1738             }
 | 
| 
 | 
  1739             elsif(length($variant) == 1 and length($ref) > 1){ # deletion
 | 
| 
 | 
  1740               record_snv("$target_parent\tg.$location\_",($location+length($ref)-1),
 | 
| 
 | 
  1741                          "del",substr($ref, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1742                          join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
 | 
| 
 | 
  1743             }
 | 
| 
 | 
  1744             else{ # indel
 | 
| 
 | 
  1745               record_snv("$target_parent\tg.$location\_",($location+length($ref)-1),
 | 
| 
 | 
  1746                          "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1747                          join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
 | 
| 
 | 
  1748             }
 | 
| 
 | 
  1749             next; # process next record, we've done all we can with a non-coding-gene SNP
 | 
| 
 | 
  1750         }
 | 
| 
 | 
  1751 
 | 
| 
 | 
  1752 	if($strand eq "-"){
 | 
| 
 | 
  1753 	    # set up utr offset for correct CDS coordinates
 | 
| 
 | 
  1754 	    for(my $i = $#feature_ranges; $i >= 0; $i--){
 | 
| 
 | 
  1755 		# exon is completely 5' of the start
 | 
| 
 | 
  1756 		if($feature_ranges[$i]->[0] > $feature_cds_max{$target_parent}){
 | 
| 
 | 
  1757 		    #print STDERR "Whole 5' UTR exon $i: ",$feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1,"\n";
 | 
| 
 | 
  1758 		    $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1;
 | 
| 
 | 
  1759 		}
 | 
| 
 | 
  1760 		# exon with the cds start
 | 
| 
 | 
  1761 		elsif($feature_ranges[$i]->[1] >= $feature_cds_max{$target_parent} and 
 | 
| 
 | 
  1762 		      $feature_ranges[$i]->[0] <= $feature_cds_max{$target_parent}){
 | 
| 
 | 
  1763 		    #print STDERR "Start codon in exon $i: ", $feature_cds_max{$target_parent} - $feature_ranges[$i]->[1],"\n";
 | 
| 
 | 
  1764 		    $feature_offset += $feature_cds_max{$target_parent} - $feature_ranges[$i]->[1];
 | 
| 
 | 
  1765 		    last;
 | 
| 
 | 
  1766 		}
 | 
| 
 | 
  1767 		else{
 | 
| 
 | 
  1768 		    die "The CDS for $target_parent (on negative strand) ends downstream ",
 | 
| 
 | 
  1769 		    "($feature_cds_max{$target_parent}) of the an exon",
 | 
| 
 | 
  1770 		    " (", $feature_ranges[$i]->[0], "), which is illogical.  Please revise the GFF file provided.\n";
 | 
| 
 | 
  1771 		}
 | 
| 
 | 
  1772 	    }
 | 
| 
 | 
  1773 	    for(my $i = $#feature_ranges; $i >= 0; $i--){
 | 
| 
 | 
  1774 		my $feature = $feature_ranges[$i];
 | 
| 
 | 
  1775 		# in the 3' UTR region of the gene
 | 
| 
 | 
  1776 		if($location < $feature_cds_min{$target_parent}){
 | 
| 
 | 
  1777 		    my $feature_exon = 0;
 | 
| 
 | 
  1778 		    $feature = $feature_ranges[$feature_exon];
 | 
| 
 | 
  1779 		    while($location > $feature->[1]+$flanking_bases and
 | 
| 
 | 
  1780 			  $feature_exon < $#feature_ranges){
 | 
| 
 | 
  1781 			$feature = $feature_ranges[++$feature_exon]; # find the 3' utr exon in which the variant is located
 | 
| 
 | 
  1782 		    }
 | 
| 
 | 
  1783 		    # utr exons passed entirely
 | 
| 
 | 
  1784 		    my $stop_exon = $feature_exon;
 | 
| 
 | 
  1785 		    while($feature_ranges[$stop_exon]->[1] < $feature_cds_min{$target_parent}){
 | 
| 
 | 
  1786 			$stop_exon++;
 | 
| 
 | 
  1787 		    }			
 | 
| 
 | 
  1788 		    my $post_offset = $feature_cds_min{$target_parent}-$feature_ranges[$stop_exon]->[0]; # offset from the stop codon in the final coding exon
 | 
| 
 | 
  1789 		    while($stop_exon > $feature_exon){
 | 
| 
 | 
  1790 			$post_offset += $feature_ranges[$stop_exon]->[1]-$feature_ranges[$stop_exon]->[0]+1;
 | 
| 
 | 
  1791 			$stop_exon--;
 | 
| 
 | 
  1792 		    }
 | 
| 
 | 
  1793 		    
 | 
| 
 | 
  1794 		    my $pos = $feature->[1]-$location+1+$post_offset;
 | 
| 
 | 
  1795                     my $junction_dist;
 | 
| 
 | 
  1796 		    if($location < $feature->[0]){ # after a UTR containing exon? set as .*DD+DD
 | 
| 
 | 
  1797                         $junction_dist = ($feature->[0]-$location);
 | 
| 
 | 
  1798 			$pos = ($post_offset+$feature->[1]-$feature->[0]+1)."+".$junction_dist;
 | 
| 
 | 
  1799 		    }
 | 
| 
 | 
  1800 		    elsif($location > $feature->[1]){ # before a total UTR exon? set as .*DD-DD
 | 
| 
 | 
  1801                         $junction_dist = -($location-$feature->[1]);
 | 
| 
 | 
  1802 			$pos = ($post_offset+1).$junction_dist;
 | 
| 
 | 
  1803 		    }
 | 
| 
 | 
  1804                     else{ # in the UTR exon
 | 
| 
 | 
  1805                         if($location - $feature->[0] < $feature->[1] - $location){ # location is closer to exon donor site
 | 
| 
 | 
  1806                             $junction_dist = -($location - $feature->[0]+1); # +1 for HGVS syntax compatibility (there is no position 0, direct skip from -1 to +1)
 | 
| 
 | 
  1807                         }
 | 
| 
 | 
  1808                         else{
 | 
| 
 | 
  1809                             $junction_dist = $feature->[1] - $location +1;
 | 
| 
 | 
  1810                         }
 | 
| 
 | 
  1811                     }
 | 
| 
 | 
  1812 		    if(length($ref) == 1 and length($variant) == 1){
 | 
| 
 | 
  1813                         my $rc = join("",map({$rc{$_}} split(//,reverse($variant))));
 | 
| 
 | 
  1814 			# 3' UTR SNP
 | 
| 
 | 
  1815 			record_snv("$target_parent\tc.*$pos",
 | 
| 
 | 
  1816 			"$rc{$ref}>$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1817 			#"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1818 			join("\t",prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n");
 | 
| 
 | 
  1819 		    }
 | 
| 
 | 
  1820 		    elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){
 | 
| 
 | 
  1821                         my $rc = join("",map({$rc{$_}} split(//,reverse(substr($variant,1)))));
 | 
| 
 | 
  1822 			# 3' UTR insertion
 | 
| 
 | 
  1823 			record_snv("$target_parent\tc.*",
 | 
| 
 | 
  1824 			hgvs_plus($pos,-1),"_*",$pos,
 | 
| 
 | 
  1825 			"ins$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1826 			#"ins",substr($variant,1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1827 			join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n");
 | 
| 
 | 
  1828 		    }
 | 
| 
 | 
  1829 		    elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){
 | 
| 
 | 
  1830                         my $rc = join("",map({$rc{$_}} split(//,reverse($ref))));
 | 
| 
 | 
  1831                         my $delBases = substr($rc,0,length($rc)-1);
 | 
| 
 | 
  1832 			if(length($ref) == 2){
 | 
| 
 | 
  1833 			    # 3' UTR single base deletion
 | 
| 
 | 
  1834 			    record_snv("$target_parent\tc.*",hgvs_plus($pos,-1),
 | 
| 
 | 
  1835 			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1836 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n");
 | 
| 
 | 
  1837 			}
 | 
| 
 | 
  1838 			else{
 | 
| 
 | 
  1839 			    # longer 3' UTR deletion
 | 
| 
 | 
  1840 			    record_snv("$target_parent\tc.*",
 | 
| 
 | 
  1841 			    hgvs_plus($pos,-length($ref)+1),"_*",hgvs_plus($pos, -1),
 | 
| 
 | 
  1842 			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1843 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n");
 | 
| 
 | 
  1844 			}
 | 
| 
 | 
  1845 		    }
 | 
| 
 | 
  1846 		    else{
 | 
| 
 | 
  1847 			my $rc = join("",map({$rc{$_}} split(//,reverse($variant))));
 | 
| 
 | 
  1848 			if($rc eq $ref and length($variant) > 3){
 | 
| 
 | 
  1849 			    # 3' UTR inversion
 | 
| 
 | 
  1850 			    record_snv("$target_parent\tc.*",
 | 
| 
 | 
  1851 			    hgvs_plus($pos,-length($ref)+1),"_*",$pos,
 | 
| 
 | 
  1852 			    "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1853 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n");
 | 
| 
 | 
  1854 			    last;
 | 
| 
 | 
  1855 			}
 | 
| 
 | 
  1856 
 | 
| 
 | 
  1857 			# complex substitution in 3' UTR
 | 
| 
 | 
  1858 			# Will break if stop codon is modified 
 | 
| 
 | 
  1859 			record_snv("$target_parent\tc.*",
 | 
| 
 | 
  1860 			hgvs_plus($pos,-length($ref)+1),"_*", $pos,
 | 
| 
 | 
  1861 			"delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1862 			join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n");
 | 
| 
 | 
  1863 		    }
 | 
| 
 | 
  1864 		    last;		
 | 
| 
 | 
  1865 		}
 | 
| 
 | 
  1866 		# in the feature	    
 | 
| 
 | 
  1867 		elsif($location >= $feature->[0] and $location <= $feature->[1]){
 | 
| 
 | 
  1868 		    my $pos = $feature_offset+$feature->[1]-$location+1;
 | 
| 
 | 
  1869 		    if($location > $feature_cds_max{$target_parent}){ #since there is no position 0, the pos is in UTR, subtract one
 | 
| 
 | 
  1870 			$pos = hgvs_plus($pos, -1);
 | 
| 
 | 
  1871 		    }
 | 
| 
 | 
  1872 		    my $first_exon_base = $feature_offset+1;
 | 
| 
 | 
  1873                     my $exon_edge_dist = $feature->[1]-$location+1; # equiv of HGVS +X or -X intron syntax, but for exons
 | 
| 
 | 
  1874                     $exon_edge_dist = $feature->[0]-$location-1 if abs($feature->[0]-$location-1) < $exon_edge_dist; # pick closer of donor and acceptor sites
 | 
| 
 | 
  1875                     #print STDERR "Got ", $feature->[1]-$location+1, "vs. ", $feature->[0]-$location-1, ": chose $exon_edge_dist\n";
 | 
| 
 | 
  1876 		    if(length($ref) == 1 and length($variant) == 1){
 | 
| 
 | 
  1877 			# SNP
 | 
| 
 | 
  1878 			record_snv("$target_parent\tc.",
 | 
| 
 | 
  1879 			$pos,
 | 
| 
 | 
  1880 			"$rc{$ref}>$rc{$variant}\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1881 			join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
 | 
| 
 | 
  1882                         ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
 | 
| 
 | 
  1883                         #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
 | 
| 
 | 
  1884 		    }
 | 
| 
 | 
  1885 		    elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){
 | 
| 
 | 
  1886 			my $rc = join("",map({$rc{$_}} split(//,reverse($variant))));
 | 
| 
 | 
  1887                         my $insBases = substr($rc,1);
 | 
| 
 | 
  1888 			# insertion
 | 
| 
 | 
  1889 			record_snv("$target_parent\tc.",
 | 
| 
 | 
  1890 			hgvs_plus_exon($pos, -1, $first_exon_base),"_",$pos,"ins$insBases",
 | 
| 
 | 
  1891 			"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1892 			join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
 | 
| 
 | 
  1893                         ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
 | 
| 
 | 
  1894                         #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
 | 
| 
 | 
  1895 		    }
 | 
| 
 | 
  1896 		    elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){
 | 
| 
 | 
  1897 			my $rc = join("",map({$rc{$_}} split(//,reverse($ref))));
 | 
| 
 | 
  1898                         my $delBases = substr($rc,0,length($rc)-1);
 | 
| 
 | 
  1899 			# single nucleotide deletion
 | 
| 
 | 
  1900 			if(length($ref) == 2){
 | 
| 
 | 
  1901 			    record_snv("$target_parent\tc.",
 | 
| 
 | 
  1902 			    hgvs_plus_exon($pos, -1, $first_exon_base),
 | 
| 
 | 
  1903 			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1904 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
 | 
| 
 | 
  1905                             ($pos-1 < 1 ? "NA" : $pos-1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
 | 
| 
 | 
  1906                             #($pos-1 < 1 ? "NA" : $pos-1 < $first_exon_base ? "p.?" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
 | 
| 
 | 
  1907 			}
 | 
| 
 | 
  1908 			# longer deletion
 | 
| 
 | 
  1909 			else{
 | 
| 
 | 
  1910                             $exon_edge_dist = $feature->[1]-$location-length($ref)+1 if abs($feature->[1]-$location-length($ref)+1) < $exon_edge_dist;
 | 
| 
 | 
  1911 			    record_snv("$target_parent\tc.",
 | 
| 
 | 
  1912 			    hgvs_plus_exon($pos, -length($ref)+1, $first_exon_base),"_",
 | 
| 
 | 
  1913 			    hgvs_plus_exon($pos, -1, $first_exon_base),
 | 
| 
 | 
  1914 			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1915 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
 | 
| 
 | 
  1916                             ($pos-1 < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
 | 
| 
 | 
  1917                             #($pos-1 < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
 | 
| 
 | 
  1918 			}
 | 
| 
 | 
  1919 		    }
 | 
| 
 | 
  1920 		    else{
 | 
| 
 | 
  1921 			# complex substitution
 | 
| 
 | 
  1922                         $exon_edge_dist = $feature->[1]-$location-length($ref)+1 if abs($feature->[1]-$location-length($ref)+1) < $exon_edge_dist;
 | 
| 
 | 
  1923 			my $rc = join("",map({$rc{$_}} split(//,reverse($variant))));
 | 
| 
 | 
  1924                         if($rc eq $variant and length($variant) > 3){
 | 
| 
 | 
  1925                             # inversion
 | 
| 
 | 
  1926                             record_snv("$target_parent\tc.",
 | 
| 
 | 
  1927                                        hgvs_plus_exon($pos,-length($ref)+1,$first_exon_base),"_",
 | 
| 
 | 
  1928                                        $pos,
 | 
| 
 | 
  1929                                        "inv",
 | 
| 
 | 
  1930                                        "\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1931                                        join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
 | 
| 
 | 
  1932                                        ($pos < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
 | 
| 
 | 
  1933 
 | 
| 
 | 
  1934                             last;
 | 
| 
 | 
  1935                         } 
 | 
| 
 | 
  1936 			record_snv("$target_parent\tc.",
 | 
| 
 | 
  1937 			hgvs_plus_exon($pos,-length($ref)+1,$first_exon_base),"_",
 | 
| 
 | 
  1938 			$pos,
 | 
| 
 | 
  1939 			"delins$rc",
 | 
| 
 | 
  1940 			"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1941 			join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
 | 
| 
 | 
  1942                         ($pos < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
 | 
| 
 | 
  1943                         #($pos < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
 | 
| 
 | 
  1944 		    }
 | 
| 
 | 
  1945 		    last;
 | 
| 
 | 
  1946 		}
 | 
| 
 | 
  1947 		# 5' of feature (on negative strand)
 | 
| 
 | 
  1948 		elsif($location > $feature->[1]){
 | 
| 
 | 
  1949 		    if(length($ref) == 1 and length($variant) == 1){
 | 
| 
 | 
  1950 			# intronic SNP
 | 
| 
 | 
  1951 			if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){
 | 
| 
 | 
  1952 			    # Closer to acceptor site
 | 
| 
 | 
  1953 			    record_snv("$target_parent\tc.",$feature_offset+1,
 | 
| 
 | 
  1954 			    ($feature->[1]-$location),
 | 
| 
 | 
  1955 			    "$rc{$ref}>$rc{$variant}\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1956 			    #"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1957 			    join("\t", prop_info_key($chr,$location,$ref,$variant, $feature->[1]-$location)),"\tNA\n");
 | 
| 
 | 
  1958 			}
 | 
| 
 | 
  1959 			else{
 | 
| 
 | 
  1960 			    # Closer to donor site
 | 
| 
 | 
  1961 			    record_snv("$target_parent\tc.",$feature_offset,"+",
 | 
| 
 | 
  1962 			    ($feature_ranges[$i+1]->[0]-$location),			
 | 
| 
 | 
  1963 			    "$rc{$ref}>$rc{$variant}\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1964 			    #"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1965 			    join("\t", prop_info_key($chr,$location,$ref,$variant, $feature_ranges[$i+1]->[0]-$location)),"\tNA\n");
 | 
| 
 | 
  1966 			}
 | 
| 
 | 
  1967 		    }
 | 
| 
 | 
  1968 		    elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){
 | 
| 
 | 
  1969                         my $rc = join("",map({$rc{$_}} split(//,reverse(substr($variant,1)))));
 | 
| 
 | 
  1970 			if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){
 | 
| 
 | 
  1971 			    # intronic insertion near acceptor
 | 
| 
 | 
  1972 			    my $pos = ($feature_offset+1).($feature->[1]-$location);
 | 
| 
 | 
  1973 			    record_snv("$target_parent\tc.",
 | 
| 
 | 
  1974 			    hgvs_plus($pos,-1),"_",$pos,
 | 
| 
 | 
  1975 			    "ins",
 | 
| 
 | 
  1976 			    $rc,"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1977 			    #substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1978 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$feature->[1]-$location-1)),"\tNA\n");
 | 
| 
 | 
  1979 			}
 | 
| 
 | 
  1980 			else{
 | 
| 
 | 
  1981 			    # intronic insertion near donor
 | 
| 
 | 
  1982 			    my $pos = $feature_offset."+".($feature_ranges[$i+1]->[0]-$location);
 | 
| 
 | 
  1983 			    record_snv("$target_parent\tc.",
 | 
| 
 | 
  1984 			    hgvs_plus($pos,-1),"_",$pos,
 | 
| 
 | 
  1985 			    "ins",
 | 
| 
 | 
  1986 			    $rc,"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1987 			    #substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  1988 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$feature_ranges[$i+1]->[0]-$location+1)),"\tNA\n");
 | 
| 
 | 
  1989 			}
 | 
| 
 | 
  1990 		    }
 | 
| 
 | 
  1991 		    elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){
 | 
| 
 | 
  1992 			# intronic deletion
 | 
| 
 | 
  1993 			# single nucleotide deletion
 | 
| 
 | 
  1994 			my $rc = reverse($ref); 
 | 
| 
 | 
  1995 			$rc=~tr/ACGT/TGCA/;
 | 
| 
 | 
  1996                         my $delBases = substr($rc, 0, length($rc)-1);
 | 
| 
 | 
  1997 			if(length($ref) == 2){
 | 
| 
 | 
  1998 			    # single intronic deletion near acceptor
 | 
| 
 | 
  1999 			    if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){
 | 
| 
 | 
  2000                                 my $off = $feature->[1]-$location-1;
 | 
| 
 | 
  2001 				record_snv("$target_parent\tc.",
 | 
| 
 | 
  2002 				($feature_offset+1),$off,
 | 
| 
 | 
  2003 				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2004 				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off >= -2 ? "p.?" : "NA"),"\n");
 | 
| 
 | 
  2005 			    }
 | 
| 
 | 
  2006 			    # single intronic deletion near donor
 | 
| 
 | 
  2007 			    else{
 | 
| 
 | 
  2008 				my $pos = $feature_offset;
 | 
| 
 | 
  2009                                 my $off = $feature_ranges[$i+1]->[0]-$location+1;
 | 
| 
 | 
  2010 				record_snv("$target_parent\tc.",
 | 
| 
 | 
  2011 				hgvs_plus_exon($pos, $off, $pos),
 | 
| 
 | 
  2012 				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2013 				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off <= 2 ? "p.?" : "NA"),"\n");
 | 
| 
 | 
  2014 			    }
 | 
| 
 | 
  2015 			}
 | 
| 
 | 
  2016 			# longer deletion
 | 
| 
 | 
  2017 			else{
 | 
| 
 | 
  2018 			    if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){
 | 
| 
 | 
  2019 				# long intronic deletion near acceptor
 | 
| 
 | 
  2020                                 my $off = $feature->[1]-$location-1;
 | 
| 
 | 
  2021 				my $pos = ($feature_offset+1).$off;
 | 
| 
 | 
  2022 				record_snv("$target_parent\tc.",
 | 
| 
 | 
  2023 				hgvs_plus($pos,-length($ref)+2),"_",$pos,
 | 
| 
 | 
  2024 				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2025 				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off >= -2 ? "p.?" : "NA"),"\n");
 | 
| 
 | 
  2026 			    }
 | 
| 
 | 
  2027 			    else{
 | 
| 
 | 
  2028 				# long intronic deletion near donor
 | 
| 
 | 
  2029                                 my $off = $feature_ranges[$i+1]->[0]-$location+1;
 | 
| 
 | 
  2030 				my $pos = ($feature_offset)."+".$off;
 | 
| 
 | 
  2031 				record_snv("$target_parent\tc.",
 | 
| 
 | 
  2032 				$pos,"_",hgvs_plus($pos,-length($ref)-1),
 | 
| 
 | 
  2033 				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2034 				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off-length($ref)+1 <= 2 ? "p.?" : "NA"),"\n");
 | 
| 
 | 
  2035 			    }
 | 
| 
 | 
  2036 			    last;
 | 
| 
 | 
  2037 			}
 | 
| 
 | 
  2038 		    }
 | 
| 
 | 
  2039 		    else{
 | 
| 
 | 
  2040 			my $rc = reverse($ref); 
 | 
| 
 | 
  2041 			$rc=~tr/ACGT/TGCA/;
 | 
| 
 | 
  2042 			if($rc eq $variant and length($variant) > 3){
 | 
| 
 | 
  2043 			    # intronic inversion near acceptor site
 | 
| 
 | 
  2044 			    if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){
 | 
| 
 | 
  2045 				my $pos = ($feature_offset+1).($feature->[1]-$location);
 | 
| 
 | 
  2046 				record_snv("$target_parent\tc.",
 | 
| 
 | 
  2047 				hgvs_plus($pos,-length($ref)+1),"_",$pos,
 | 
| 
 | 
  2048 				"inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2049 				join("\t", prop_info_key($chr,$location,$ref,$variant,$feature->[1]-$location)),"\tNA\n");
 | 
| 
 | 
  2050 			    }
 | 
| 
 | 
  2051 			    else{
 | 
| 
 | 
  2052 				my $pos = ($feature_offset)."+".($feature_ranges[$i+1]->[0]-$location);
 | 
| 
 | 
  2053 				record_snv("$target_parent\tc.",
 | 
| 
 | 
  2054 				$pos,"_",hgvs_plus($pos, length($ref)-1),
 | 
| 
 | 
  2055 				"inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2056 				join("\t", prop_info_key($chr,$location,$ref,$variant,$feature_ranges[$i+1]->[0]-$location)),"\tNA\n");
 | 
| 
 | 
  2057 			    }
 | 
| 
 | 
  2058 			    last;
 | 
| 
 | 
  2059 			}
 | 
| 
 | 
  2060                         $rc = reverse($variant);
 | 
| 
 | 
  2061                         $rc=~tr/ACGT/TGCA/;
 | 
| 
 | 
  2062 			# Intronic complex substitution
 | 
| 
 | 
  2063 			if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){
 | 
| 
 | 
  2064 			    # complex intronic substitution near acceptor site
 | 
| 
 | 
  2065 			    my $pos = ($feature_offset+1).($feature->[1]-$location);
 | 
| 
 | 
  2066 			    record_snv("$target_parent\tc.",
 | 
| 
 | 
  2067 			    hgvs_plus($pos, -length($ref)+1),"_",$pos,
 | 
| 
 | 
  2068 			    "delins$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2069 			    #"delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2070 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$feature->[1]-$location)),"\tNA\n");
 | 
| 
 | 
  2071 			}
 | 
| 
 | 
  2072 			else{
 | 
| 
 | 
  2073 			    # complex intronic substitution near donor site
 | 
| 
 | 
  2074 			    my $pos = $feature_offset."+".($feature_ranges[$i+1]->[0]-$location);
 | 
| 
 | 
  2075 			    record_snv("$target_parent\tc.",
 | 
| 
 | 
  2076 			    $pos,"_",hgvs_plus($pos, length($ref)-1),
 | 
| 
 | 
  2077 			    "delins$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2078 			    #"delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2079 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$feature_ranges[$i+1]->[0]-$location)),"\tNA\n");
 | 
| 
 | 
  2080 			}
 | 
| 
 | 
  2081 		    }
 | 
| 
 | 
  2082 		    last;
 | 
| 
 | 
  2083 		}
 | 
| 
 | 
  2084 		else{
 | 
| 
 | 
  2085 		    #print STDERR "Offset going from ", $feature_offset, " to ", $feature_offset+$feature->[1]-$feature->[0]+1,"\n";
 | 
| 
 | 
  2086 		    $feature_offset += $feature->[1]-$feature->[0]+1;
 | 
| 
 | 
  2087 		    #print STDERR "Set feature offset to  $feature_offset by adding ",$feature->[1],"-", $feature->[0],"+1\n";
 | 
| 
 | 
  2088 		}
 | 
| 
 | 
  2089 	    }
 | 
| 
 | 
  2090 	}
 | 
| 
 | 
  2091 	else{
 | 
| 
 | 
  2092 	    # forward strand
 | 
| 
 | 
  2093 
 | 
| 
 | 
  2094 	    # set up utr offset for correct CDS coordinates
 | 
| 
 | 
  2095 	    for(my $i = 0; $i <= $#feature_ranges; $i++){
 | 
| 
 | 
  2096 		# All 5' utr exon
 | 
| 
 | 
  2097 		if($feature_ranges[$i]->[1] < $feature_cds_min{$target_parent}){
 | 
| 
 | 
  2098 		    $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1;
 | 
| 
 | 
  2099 		}
 | 
| 
 | 
  2100 		# exon with the cds start
 | 
| 
 | 
  2101 		elsif($feature_ranges[$i]->[1] >= $feature_cds_min{$target_parent} and 
 | 
| 
 | 
  2102 		      $feature_ranges[$i]->[0] <= $feature_cds_min{$target_parent}){
 | 
| 
 | 
  2103 		    $feature_offset -= $feature_cds_min{$target_parent} - $feature_ranges[$i]->[0];
 | 
| 
 | 
  2104 		    last;
 | 
| 
 | 
  2105 		}
 | 
| 
 | 
  2106 		else{
 | 
| 
 | 
  2107 		    die "The CDS for $target_parent starts upstream ($feature_cds_max{$target_parent}) of the first exon",
 | 
| 
 | 
  2108 		    " (", $feature_ranges[$i]->[0], "), which is illogical.  Please revise the GFF file provided.\n";
 | 
| 
 | 
  2109 		}
 | 
| 
 | 
  2110 	    }
 | 
| 
 | 
  2111 	    for(my $i = 0; $i <= $#feature_ranges; $i++){
 | 
| 
 | 
  2112 		my $feature = $feature_ranges[$i];
 | 
| 
 | 
  2113 		# 3' of last coding position
 | 
| 
 | 
  2114 		if($location > $feature_cds_max{$target_parent}){
 | 
| 
 | 
  2115 		    # find the exon with the stop codon
 | 
| 
 | 
  2116 		    while($feature->[1] < $feature_cds_max{$target_parent}){
 | 
| 
 | 
  2117 			$feature = $feature_ranges[++$i];
 | 
| 
 | 
  2118 		    }
 | 
| 
 | 
  2119 		    my $post_offset = $feature->[0]-$feature_cds_max{$target_parent};
 | 
| 
 | 
  2120 		    while($location > $feature->[1]+$flanking_bases and 
 | 
| 
 | 
  2121 			  $i < $#feature_ranges){
 | 
| 
 | 
  2122 			$post_offset += $feature->[1]-$feature->[0]+1;
 | 
| 
 | 
  2123 			$feature = $feature_ranges[++$i]; # find the 3' utr exon in which the variant is located
 | 
| 
 | 
  2124 		    }
 | 
| 
 | 
  2125 		    my $pos = $location-$feature->[0]+$post_offset;
 | 
| 
 | 
  2126 		    #print STDERR "Positive strand: got $pos for $location, exon #$i of $#feature_ranges, post_offset is $post_offset\n" if $location-$feature->[1] > $flanking_bases;
 | 
| 
 | 
  2127                     my $off;
 | 
| 
 | 
  2128 		    if($location > $feature->[1]){ # after a UTR containing exon? set as .*DD+DD
 | 
| 
 | 
  2129                         $off = $location-$feature->[1];
 | 
| 
 | 
  2130 			$pos = ($post_offset+$feature->[1]-$feature->[0]+1)."+".$off;
 | 
| 
 | 
  2131 		    }
 | 
| 
 | 
  2132 		    elsif($location < $feature->[0]){ # before a total UTR exon? set as .*DD-DD
 | 
| 
 | 
  2133                         $off = -($feature->[0]-$location);
 | 
| 
 | 
  2134 			$pos = ($post_offset+1).$off;
 | 
| 
 | 
  2135 		    }
 | 
| 
 | 
  2136                     else{
 | 
| 
 | 
  2137                         if($location-$feature->[0] < $feature->[1]-$location){
 | 
| 
 | 
  2138                            $off = $location-$feature->[0]+1; # +1 since HGVS skips right from -1 to +1 at exon boundary
 | 
| 
 | 
  2139                         }
 | 
| 
 | 
  2140                         else{
 | 
| 
 | 
  2141                            $off = $location-$feature->[1]-1; # will be negative
 | 
| 
 | 
  2142                         }
 | 
| 
 | 
  2143                     }
 | 
| 
 | 
  2144 		    if(length($ref) == 1 and length($variant) == 1){
 | 
| 
 | 
  2145 			# 3' UTR SNP
 | 
| 
 | 
  2146 			record_snv("$target_parent\tc.*$pos",
 | 
| 
 | 
  2147 			"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2148 			join("\t", prop_info_key($chr,$location,$ref,$variant, $off)),"\tNA\n");
 | 
| 
 | 
  2149 		    }
 | 
| 
 | 
  2150 		    elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){
 | 
| 
 | 
  2151 			# 3' UTR insertion
 | 
| 
 | 
  2152 			record_snv("$target_parent\tc.*",
 | 
| 
 | 
  2153 			hgvs_plus($pos,1),"_*",hgvs_plus($pos,2),
 | 
| 
 | 
  2154 			"ins",substr($variant,1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2155 			join("\t", prop_info_key($chr,$location,$ref,$variant, $off)),"\tNA\n");
 | 
| 
 | 
  2156 		    }
 | 
| 
 | 
  2157 		    elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){
 | 
| 
 | 
  2158                         my $delBases = substr($ref, 1);
 | 
| 
 | 
  2159 			if(length($ref) == 2){
 | 
| 
 | 
  2160 			    # 3' UTR single base deletion
 | 
| 
 | 
  2161 			    record_snv("$target_parent\tc.*",hgvs_plus($pos,1),
 | 
| 
 | 
  2162 			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2163 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\tNA\n");
 | 
| 
 | 
  2164 			}
 | 
| 
 | 
  2165 			else{
 | 
| 
 | 
  2166 			    # longer 3' UTR deletion
 | 
| 
 | 
  2167 			    record_snv("$target_parent\tc.*",
 | 
| 
 | 
  2168 			    hgvs_plus($pos,1),"_*",hgvs_plus($pos,length($ref)-1),
 | 
| 
 | 
  2169 			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2170 			    join("\t", prop_info_key($chr,$location,$ref,$variant, $off)),"\tNA\n");
 | 
| 
 | 
  2171 			}
 | 
| 
 | 
  2172 		    }
 | 
| 
 | 
  2173 		    else{
 | 
| 
 | 
  2174 			my $rc = reverse($ref); 
 | 
| 
 | 
  2175 			$rc=~tr/ACGT/TGCA/; 
 | 
| 
 | 
  2176 			if($rc eq $variant and length($variant) > 3){
 | 
| 
 | 
  2177 			    # 3' UTR inversion
 | 
| 
 | 
  2178 			    record_snv("$target_parent\tc.*$pos",
 | 
| 
 | 
  2179 			    "_*",hgvs_plus($pos,length($ref)-1),
 | 
| 
 | 
  2180 			    "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2181 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\tNA\n");
 | 
| 
 | 
  2182 			    last;
 | 
| 
 | 
  2183 			}
 | 
| 
 | 
  2184 			# complex substitution in 3' UTR
 | 
| 
 | 
  2185 			record_snv("$target_parent\tc.*$pos",
 | 
| 
 | 
  2186 			"_*",hgvs_plus($pos,length($ref)-1),
 | 
| 
 | 
  2187 			"delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2188 			join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\tNA\n");
 | 
| 
 | 
  2189 		    }
 | 
| 
 | 
  2190 		    last;
 | 
| 
 | 
  2191 		}
 | 
| 
 | 
  2192 		# in the exon
 | 
| 
 | 
  2193 		elsif($location >= $feature->[0] and $location <= $feature->[1]){
 | 
| 
 | 
  2194 		    my $pos = $feature_offset+$location-$feature->[0]+1;
 | 
| 
 | 
  2195 		    my $last_exon_base = $feature_offset+$feature->[1]-$feature->[0]+1;
 | 
| 
 | 
  2196                     my $exon_edge_dist = $location-$feature->[0]+1; # equiv of HGVS +X or -X intron syntax, but for exons
 | 
| 
 | 
  2197                     $exon_edge_dist = $location-$feature->[1]-1 if abs($location-$feature->[1]-1) < $exon_edge_dist; # pick closer of donor and acceptor sites
 | 
| 
 | 
  2198                     #print STDERR "Got ", $location-$feature->[0]+1, "vs. ", $location-$feature->[1]-1, ": chose $exon_edge_dist\n";
 | 
| 
 | 
  2199 		    if($location < $feature_cds_min{$target_parent}){ #since there is no position 0, the pos is in UTR, subtract one
 | 
| 
 | 
  2200 			$pos = hgvs_plus($pos, -1);
 | 
| 
 | 
  2201 		    }
 | 
| 
 | 
  2202 		    if(length($ref) == 1 and length($variant) == 1){
 | 
| 
 | 
  2203 			# SNP
 | 
| 
 | 
  2204 			record_snv("$target_parent\tc.$pos",
 | 
| 
 | 
  2205 			"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2206 			join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
 | 
| 
 | 
  2207                         ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
 | 
| 
 | 
  2208                         #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
 | 
| 
 | 
  2209 		    }
 | 
| 
 | 
  2210 		    elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){
 | 
| 
 | 
  2211 			# insertion
 | 
| 
 | 
  2212 			record_snv("$target_parent\tc.$pos",
 | 
| 
 | 
  2213 			"_",hgvs_plus_exon($pos,1,$last_exon_base),"ins",
 | 
| 
 | 
  2214 			substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2215 			join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
 | 
| 
 | 
  2216                         ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
 | 
| 
 | 
  2217                         #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
 | 
| 
 | 
  2218 		    }
 | 
| 
 | 
  2219 		    elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){
 | 
| 
 | 
  2220                         my $delBases = substr($ref, 1);
 | 
| 
 | 
  2221 			# single nucleotide deletion
 | 
| 
 | 
  2222 			if(length($delBases) == 1){
 | 
| 
 | 
  2223 			    record_snv("$target_parent\tc.",
 | 
| 
 | 
  2224 			    hgvs_plus_exon($pos,1,$last_exon_base),
 | 
| 
 | 
  2225 			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2226 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
 | 
| 
 | 
  2227                             ($pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
 | 
| 
 | 
  2228                             #($pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
 | 
| 
 | 
  2229 			}
 | 
| 
 | 
  2230 			# longer deletion
 | 
| 
 | 
  2231 			else{
 | 
| 
 | 
  2232                             $exon_edge_dist = $feature->[1]-$location-length($ref)-1 if abs($feature->[1]-$location-length($ref)-1) < $exon_edge_dist; 
 | 
| 
 | 
  2233 			    record_snv("$target_parent\tc.",
 | 
| 
 | 
  2234 			    hgvs_plus_exon($pos,1,$last_exon_base),"_",
 | 
| 
 | 
  2235 			    hgvs_plus_exon($pos,length($ref)-1,$last_exon_base),
 | 
| 
 | 
  2236 			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2237 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
 | 
| 
 | 
  2238                             ($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
 | 
| 
 | 
  2239                             #($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
 | 
| 
 | 
  2240 			}
 | 
| 
 | 
  2241 		    }
 | 
| 
 | 
  2242 		    else{
 | 
| 
 | 
  2243                         $exon_edge_dist = $feature->[1]-$location-length($ref)-1 if abs($feature->[1]-$location-length($ref)-1) < $exon_edge_dist; 
 | 
| 
 | 
  2244 			my $rc = reverse($ref); 
 | 
| 
 | 
  2245 			$rc=~tr/ACGT/TGCA/; 
 | 
| 
 | 
  2246 			if($rc eq $variant and length($variant) > 3){
 | 
| 
 | 
  2247 			    # inversion
 | 
| 
 | 
  2248 			    record_snv("$target_parent\tc.$pos",
 | 
| 
 | 
  2249 			    "_",hgvs_plus_exon($pos,length($ref)-1, $last_exon_base),
 | 
| 
 | 
  2250 			    "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2251 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
 | 
| 
 | 
  2252                             ($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
 | 
| 
 | 
  2253                             #($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
 | 
| 
 | 
  2254 			    last;
 | 
| 
 | 
  2255 			}
 | 
| 
 | 
  2256 			# complex substitution
 | 
| 
 | 
  2257 			record_snv("$target_parent\tc.$pos",
 | 
| 
 | 
  2258 			"_",hgvs_plus_exon($pos, length($ref)-1, $last_exon_base),
 | 
| 
 | 
  2259 			"delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2260 			join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
 | 
| 
 | 
  2261                         ($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
 | 
| 
 | 
  2262                         #($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
 | 
| 
 | 
  2263 		    }
 | 
| 
 | 
  2264 		    last;
 | 
| 
 | 
  2265 		}
 | 
| 
 | 
  2266 		# 5' of feature
 | 
| 
 | 
  2267 		elsif($location < $feature->[0]){
 | 
| 
 | 
  2268 		    if(length($ref) == 1 and length($variant) == 1){
 | 
| 
 | 
  2269 			# intronic SNP
 | 
| 
 | 
  2270 			if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){
 | 
| 
 | 
  2271 			    # Closer to donor site
 | 
| 
 | 
  2272 			    record_snv("$target_parent\tc.",$feature_offset,"+",
 | 
| 
 | 
  2273 			    ($location-$feature_ranges[$i-1]->[1]),
 | 
| 
 | 
  2274 			    "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2275 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n");
 | 
| 
 | 
  2276 			}
 | 
| 
 | 
  2277 			else{
 | 
| 
 | 
  2278 			    # Closer to acceptor site
 | 
| 
 | 
  2279 			    record_snv("$target_parent\tc.",$feature_offset+1,
 | 
| 
 | 
  2280 			    ($location-$feature->[0]),
 | 
| 
 | 
  2281 			    "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2282 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n");
 | 
| 
 | 
  2283 			}
 | 
| 
 | 
  2284 		    }
 | 
| 
 | 
  2285 		    elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){
 | 
| 
 | 
  2286 			if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){
 | 
| 
 | 
  2287 			    # intronic insertion near donor	
 | 
| 
 | 
  2288 			    my $pos = $feature_offset."+".($location-$feature_ranges[$i-1]->[1]);
 | 
| 
 | 
  2289 			    record_snv("$target_parent\tc.",
 | 
| 
 | 
  2290 			    $pos,"_",hgvs_plus($pos,1),
 | 
| 
 | 
  2291 			    "ins",
 | 
| 
 | 
  2292 			    substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2293 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n");
 | 
| 
 | 
  2294 			}
 | 
| 
 | 
  2295 			else{
 | 
| 
 | 
  2296 			    # intronic insertion near acceptor
 | 
| 
 | 
  2297 			    my $pos = ($feature_offset+1).($location-$feature->[0]);
 | 
| 
 | 
  2298 			    record_snv("$target_parent\tc.",
 | 
| 
 | 
  2299 			    $pos,"_",hgvs_plus($pos,1),
 | 
| 
 | 
  2300 			    "ins",
 | 
| 
 | 
  2301 			    substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2302 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n");
 | 
| 
 | 
  2303 			}
 | 
| 
 | 
  2304 		    }
 | 
| 
 | 
  2305 		    elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){
 | 
| 
 | 
  2306 			# intronic deletion
 | 
| 
 | 
  2307 			# single nucleotide deletion
 | 
| 
 | 
  2308                         my $delBases = substr($ref, 1);
 | 
| 
 | 
  2309 			if(length($ref) == 2){
 | 
| 
 | 
  2310 			    # single intronic deletion near donor
 | 
| 
 | 
  2311 			    if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){
 | 
| 
 | 
  2312                                 my $off = $location-$feature_ranges[$i-1]->[1]+1;
 | 
| 
 | 
  2313 				record_snv("$target_parent\tc.",
 | 
| 
 | 
  2314 				$feature_offset,"+",$off,
 | 
| 
 | 
  2315 				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2316 				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off <= 2 ? "p.?" : "NA"),"\n");
 | 
| 
 | 
  2317 			    }
 | 
| 
 | 
  2318 			    # single intronic deletion near acceptor
 | 
| 
 | 
  2319 			    else{
 | 
| 
 | 
  2320 				my $pos = ($feature_offset+1);
 | 
| 
 | 
  2321                                 my $off = $location-$feature->[0];
 | 
| 
 | 
  2322 				record_snv("$target_parent\tc.",			    
 | 
| 
 | 
  2323 				hgvs_plus_exon($pos, $off, $pos),
 | 
| 
 | 
  2324 				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2325 				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off >= -2 ? "p.?" : "NA"),"\n");
 | 
| 
 | 
  2326 			    }
 | 
| 
 | 
  2327 			}
 | 
| 
 | 
  2328 			# longer deletion
 | 
| 
 | 
  2329 			else{
 | 
| 
 | 
  2330 			    if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){
 | 
| 
 | 
  2331 				# long intronic deletion near donor
 | 
| 
 | 
  2332                                 my $off = $location-$feature_ranges[$i-1]->[1]+1;
 | 
| 
 | 
  2333 				my $pos = $feature_offset."+".$off;
 | 
| 
 | 
  2334 				record_snv("$target_parent\tc.",
 | 
| 
 | 
  2335 				$pos,"_",hgvs_plus($pos,length($ref)-2),
 | 
| 
 | 
  2336 				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2337 				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off <= 2 ? "p.?" : "NA"),"\n");
 | 
| 
 | 
  2338 			    }
 | 
| 
 | 
  2339 			    else{
 | 
| 
 | 
  2340 				# long intronic deletion near acceptor
 | 
| 
 | 
  2341                                 my $off = $location-$feature->[0]+1;
 | 
| 
 | 
  2342 				my $pos = ($feature_offset+1).$off;
 | 
| 
 | 
  2343 				record_snv("$target_parent\tc.",
 | 
| 
 | 
  2344 				$pos,"_",hgvs_plus($pos,length($ref)-2),
 | 
| 
 | 
  2345 				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2346 				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off+length($ref)-2 >= -2 ? "p.?" : "NA"),"\n");
 | 
| 
 | 
  2347 			    }
 | 
| 
 | 
  2348 			}
 | 
| 
 | 
  2349 		    }
 | 
| 
 | 
  2350 		    else{
 | 
| 
 | 
  2351 			my $rc = reverse($ref); 
 | 
| 
 | 
  2352 			$rc=~tr/ACGT/TGCA/; 
 | 
| 
 | 
  2353 			if($rc eq $variant and length($variant) > 3){
 | 
| 
 | 
  2354 			    # intronic inversion near donor site
 | 
| 
 | 
  2355 			    if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){
 | 
| 
 | 
  2356 				my $pos = $feature_offset."+".($location-$feature_ranges[$i-1]->[1]);
 | 
| 
 | 
  2357 				record_snv("$target_parent\tc.",
 | 
| 
 | 
  2358 				$pos,"_",hgvs_plus($pos,length($ref)-1),
 | 
| 
 | 
  2359 				"inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2360 				join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n");
 | 
| 
 | 
  2361 			    }
 | 
| 
 | 
  2362 			    else{
 | 
| 
 | 
  2363 				my $pos = ($feature_offset+1).($location-$feature->[0]);
 | 
| 
 | 
  2364 				record_snv("$target_parent\tc.",
 | 
| 
 | 
  2365 				$pos,"_",hgvs_plus($pos, length($ref)-1),
 | 
| 
 | 
  2366 				"inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2367 				join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n");
 | 
| 
 | 
  2368 			    }
 | 
| 
 | 
  2369 			    last;
 | 
| 
 | 
  2370 			}
 | 
| 
 | 
  2371 			# Intronic complex substitution
 | 
| 
 | 
  2372 			# Note: sub maybe have  comma in it to denote two possible variants
 | 
| 
 | 
  2373 			if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){
 | 
| 
 | 
  2374 			    # complex intronic substitution near donor site
 | 
| 
 | 
  2375 			    my $pos = $feature_offset."+".($location-$feature_ranges[$i-1]->[1]);
 | 
| 
 | 
  2376 			    record_snv("$target_parent\tc.",
 | 
| 
 | 
  2377 			    $pos,"_",hgvs_plus($pos, length($ref)-1),
 | 
| 
 | 
  2378 			    "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2379 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n");
 | 
| 
 | 
  2380 			}
 | 
| 
 | 
  2381 			else{
 | 
| 
 | 
  2382 			    # complex intronic substitution near acceptor site
 | 
| 
 | 
  2383 			    my $pos = ($feature_offset+1).($location-$feature->[0]);
 | 
| 
 | 
  2384 			    record_snv("$target_parent\tc.",
 | 
| 
 | 
  2385 			    $pos,"_",hgvs_plus($pos, length($ref)-1),
 | 
| 
 | 
  2386 			    "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
 | 
| 
 | 
  2387 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n");
 | 
| 
 | 
  2388 			}
 | 
| 
 | 
  2389 		    }
 | 
| 
 | 
  2390 		    last;
 | 
| 
 | 
  2391 		}
 | 
| 
 | 
  2392 		else{
 | 
| 
 | 
  2393 		    # feature is past this exon
 | 
| 
 | 
  2394 		    $feature_offset += $feature->[1]-$feature->[0]+1;
 | 
| 
 | 
  2395 		}
 | 
| 
 | 
  2396 	    }
 | 
| 
 | 
  2397 	}
 | 
| 
 | 
  2398     }  # for each variant on the line
 | 
| 
 | 
  2399   } # for each  gene overlapping the site the VCF describes
 | 
| 
 | 
  2400 } # for each VCF line
 | 
| 
 | 
  2401 print STDERR "\n" unless $quiet;
 | 
| 
 | 
  2402 close(VCFIN);
 | 
| 
 | 
  2403 
 | 
| 
 | 
  2404 # Before we can start printing the variants, we need to look at the phase information and calculate the real haplotype HGVS changes
 | 
| 
 | 
  2405 #if(keys %chr2phase){
 | 
| 
 | 
  2406   # Note that we could have samtools read-backed haplotype info, MNPs in the VCF, and pre-existing haplotypes in the input VCF (e.g. imputed or based on Mendelian inheritance where trios exist)
 | 
| 
 | 
  2407   # We need to create new disjoint sets of phased blocks from the (consistent) union of these data.
 | 
| 
 | 
  2408 #  my $chr2phase2variants = combine_phase_data(\%chr2phase);
 | 
| 
 | 
  2409 
 | 
| 
 | 
  2410   # TODO: Calculate protein HGVS syntax for each variant, now that all phase data has been incorporated
 | 
| 
 | 
  2411   #for my $chr (keys %$chr2phase2variants){
 | 
| 
 | 
  2412   #  for my $phase (keys %{$chr2phase2variants{$chr}){
 | 
| 
 | 
  2413   #    # apply all phased changes to the reference chromosomal seq
 | 
| 
 | 
  2414   #    my $phased_seq = $seq{$chr}; #reference
 | 
| 
 | 
  2415   #    # sort the variants from 3' to 5' so that edits after indels don't need adjustment in their ref coordinate
 | 
| 
 | 
  2416   #    my @sorted_variants = sort {my($a_pos) = $a =~ /:(\d+):/; my($b_pos) = $b =~ /:(\d+):/; $b_pos <=> $a_pos} @{$chr2phase2variants{$chr}->{$phase}};
 | 
| 
 | 
  2417   #    for my $variant (@sorted_variants){
 | 
| 
 | 
  2418   #    }
 | 
| 
 | 
  2419   #  }
 | 
| 
 | 
  2420   #}
 | 
| 
 | 
  2421 #}
 | 
| 
 | 
  2422 
 | 
| 
 | 
  2423 # retrieve the MAF info en masse for each chromosome, as this is much more efficient
 | 
| 
 | 
  2424 my @out_lines;
 | 
| 
 | 
  2425 for my $snv (@snvs){
 | 
| 
 | 
  2426   chomp $snv;
 | 
| 
 | 
  2427   my @fields = split /\t/, $snv;
 | 
| 
 | 
  2428   # For CNVs, all the fields are already filled out
 | 
| 
 | 
  2429   if(@fields > 13){
 | 
| 
 | 
  2430     push @out_lines, join("\t", $feature_type{$fields[0]}, ($fields[0] =~ /\S/ ? $feature_length{$fields[0]} : "NA"), @fields);
 | 
| 
 | 
  2431     next;
 | 
| 
 | 
  2432   }
 | 
| 
 | 
  2433   my $variant_key = $fields[9];
 | 
| 
 | 
  2434   $fields[9] = join("\t", prop_info($dbsnp,$internal_snp,$variant_key));
 | 
| 
 | 
  2435   my $from = $fields[4];
 | 
| 
 | 
  2436   my $chr_pos_key = $fields[3].":".$from;
 | 
| 
 | 
  2437   my $to = $fields[4]; # true for SNPs and insertions
 | 
| 
 | 
  2438   my @variant_key = split /:/, $variant_key;
 | 
| 
 | 
  2439   # For deletions and complex variants, calculate the affected reference genome range and set the 'to'
 | 
| 
 | 
  2440   if(length($variant_key[2]) > 1){
 | 
| 
 | 
  2441     $to += length($variant_key[2])-1;
 | 
| 
 | 
  2442   }
 | 
| 
 | 
  2443   splice(@fields, 5, 0, $to);
 | 
| 
 | 
  2444   
 | 
| 
 | 
  2445   # Otherwise expand the key into the relevant MAF values
 | 
| 
 | 
  2446   $fields[0] =~ s/\/chr.*$//;  # some transcript ids are repeated... we expect "id/chr#" in the GTF file to distinguish these, but should get rid of them at reporting time
 | 
| 
 | 
  2447   # the offset from the nearest exon border if coding
 | 
| 
 | 
  2448   push @fields, ($#variant_key > 3 ? $variant_key[4] : "");
 | 
| 
 | 
  2449   # add gene name(s)
 | 
| 
 | 
  2450   push @fields, range2genes($fields[3], $from, $to+1);
 | 
| 
 | 
  2451   # add caveats
 | 
| 
 | 
  2452   my $c = $fields[3];
 | 
| 
 | 
  2453   if(not exists $chr2mappability{$c}){
 | 
| 
 | 
  2454     if($c =~ s/^chr//){
 | 
| 
 | 
  2455       # nothing more
 | 
| 
 | 
  2456     }
 | 
| 
 | 
  2457     else{
 | 
| 
 | 
  2458       $c = "chr$c";
 | 
| 
 | 
  2459     }
 | 
| 
 | 
  2460   }
 | 
| 
 | 
  2461   my $mappability_caveats = exists $chr2mappability{$c} ? $chr2mappability{$c}->fetch($fields[4], $fields[4]+1) : [];
 | 
| 
 | 
  2462   if(ref $mappability_caveats eq "ARRAY" and @$mappability_caveats){
 | 
| 
 | 
  2463     my %h;
 | 
| 
 | 
  2464     if(exists $chr2caveats{$chr_pos_key}){
 | 
| 
 | 
  2465       if($chr2caveats{$chr_pos_key} !~ /non-unique/){
 | 
| 
 | 
  2466          $chr2caveats{$chr_pos_key} = join("; ", grep {not $h{$_}++} @$mappability_caveats)."; ".$chr2caveats{$chr_pos_key};
 | 
| 
 | 
  2467       }
 | 
| 
 | 
  2468     }
 | 
| 
 | 
  2469     else{
 | 
| 
 | 
  2470       $chr2caveats{$chr_pos_key} = join("; ", grep {not $h{$_}++} @$mappability_caveats);
 | 
| 
 | 
  2471     }
 | 
| 
 | 
  2472   }
 | 
| 
 | 
  2473   push @fields, (exists $chr2caveats{$chr_pos_key} ? $chr2caveats{$chr_pos_key} : "");
 | 
| 
 | 
  2474   # add phase
 | 
| 
 | 
  2475   push @fields, find_phase($variant_key);
 | 
| 
 | 
  2476   push @out_lines, join("\t", $feature_type{$fields[0]}, $feature_length{$fields[0]}, @fields);
 | 
| 
 | 
  2477 }
 | 
| 
 | 
  2478 
 | 
| 
 | 
  2479 # Now tabulate the rare variant numbers
 | 
| 
 | 
  2480 my %gene2rares;
 | 
| 
 | 
  2481 my %gene2aa_rares;
 | 
| 
 | 
  2482 for my $line (@out_lines){
 | 
| 
 | 
  2483   my @F = split /\t/, $line, -1;
 | 
| 
 | 
  2484   if($F[15] eq "NA" or $F[15] < $rare_variant_prop and (!$internal_snp or $F[17] < $rare_variant_prop)){
 | 
| 
 | 
  2485     my $gene_list = $internal_snp ? $F[20] : $F[19];
 | 
| 
 | 
  2486     next unless defined $gene_list;
 | 
| 
 | 
  2487     for my $g (split /; /, $gene_list){
 | 
| 
 | 
  2488       $gene2rares{$g}++;
 | 
| 
 | 
  2489       # Check the cDNA HGVS syntax for relevance
 | 
| 
 | 
  2490       if($F[3] =~ /c.\d+/ or # coding
 | 
| 
 | 
  2491          $F[3] =~ /c.\d+.*-[12]/ or # splicing acceptor
 | 
| 
 | 
  2492          $F[3] =~ /c.\d+\+[12345]/){ # splicing donor
 | 
| 
 | 
  2493         $gene2aa_rares{$g}++;
 | 
| 
 | 
  2494       }
 | 
| 
 | 
  2495     }
 | 
| 
 | 
  2496   }
 | 
| 
 | 
  2497 }
 | 
| 
 | 
  2498 
 | 
| 
 | 
  2499 # Print the results
 | 
| 
 | 
  2500 for my $line (@out_lines){
 | 
| 
 | 
  2501   my @F = split /\t/, $line, -1;
 | 
| 
 | 
  2502   my $gene_list = $internal_snp ? $F[20] : $F[19];
 | 
| 
 | 
  2503   if(not defined $gene_list){
 | 
| 
 | 
  2504     print OUT join("\t", @F, "", ""), "\n"; next;
 | 
| 
 | 
  2505   }
 | 
| 
 | 
  2506 
 | 
| 
 | 
  2507   my $max_gene_rare = 0;
 | 
| 
 | 
  2508   my $max_gene_aa_rare = 0;
 | 
| 
 | 
  2509   for my $g (split /; /, $gene_list){
 | 
| 
 | 
  2510     next unless exists $gene2rares{$g};
 | 
| 
 | 
  2511     if($gene2rares{$g} > $max_gene_rare){
 | 
| 
 | 
  2512       $max_gene_rare = $gene2rares{$g};
 | 
| 
 | 
  2513     }
 | 
| 
 | 
  2514     next unless exists $gene2aa_rares{$g};
 | 
| 
 | 
  2515     if($gene2aa_rares{$g} > $max_gene_aa_rare){
 | 
| 
 | 
  2516       $max_gene_aa_rare = $gene2aa_rares{$g};
 | 
| 
 | 
  2517     }
 | 
| 
 | 
  2518   }
 | 
| 
 | 
  2519   print OUT join("\t", @F, $max_gene_rare, $max_gene_aa_rare), "\n";
 | 
| 
 | 
  2520 }
 | 
| 
 | 
  2521 close(OUT);
 | 
| 
 | 
  2522 
 | 
| 
 | 
  2523 sub range2genes{
 | 
| 
 | 
  2524   my ($c, $from, $to) = @_;
 | 
| 
 | 
  2525   if(not exists $gene_ids{$c}){
 | 
| 
 | 
  2526     if($c =~ s/^chr//){
 | 
| 
 | 
  2527       # nothing more
 | 
| 
 | 
  2528     }
 | 
| 
 | 
  2529     else{
 | 
| 
 | 
  2530       $c = "chr$c";
 | 
| 
 | 
  2531     }
 | 
| 
 | 
  2532   }
 | 
| 
 | 
  2533   if(exists $gene_ids{$c}){
 | 
| 
 | 
  2534     my %have;
 | 
| 
 | 
  2535     return join("; ", grep {not $have{$_}++} @{$gene_ids{$c}->fetch($from, $to+1)});
 | 
| 
 | 
  2536   }
 | 
| 
 | 
  2537   else{
 | 
| 
 | 
  2538     return "";
 | 
| 
 | 
  2539   }
 | 
| 
 | 
  2540 }
 | 
| 
 | 
  2541 sub combine_phase_data{
 | 
| 
 | 
  2542   my ($phases) = @_; # map from variant to its phase data
 | 
| 
 | 
  2543  
 | 
| 
 | 
  2544   # Create a reverse mapping from phase regions to their variants
 | 
| 
 | 
  2545   my %chr2phase_region2variants;
 | 
| 
 | 
  2546   my @variants = keys %$phases;
 | 
| 
 | 
  2547   for my $variant (@variants){
 | 
| 
 | 
  2548     my ($chr) = $variant =~ /^\S+?-(\S+):/;
 | 
| 
 | 
  2549     $chr2phase_region2variants{$chr} = {} if not exists $chr2phase_region2variants{$chr};
 | 
| 
 | 
  2550     for my $phase_region (split /,/, $phases->{$variant}){
 | 
| 
 | 
  2551       $chr2phase_region2variants{$chr}->{$phase_region} = [] if not exists $chr2phase_region2variants{$chr}->{$phase_region};
 | 
| 
 | 
  2552       push @{$chr2phase_region2variants{$phase_region}}, $variant;
 | 
| 
 | 
  2553     }
 | 
| 
 | 
  2554   }
 | 
| 
 | 
  2555 
 | 
| 
 | 
  2556   # Now for each phased block known so far, see if any variant in it is also part of another block
 | 
| 
 | 
  2557   # If so, do a union since phasing is both transitive and commutative.
 | 
| 
 | 
  2558   # The quickest way to do this is to check for overlapping intervals, then check for common members amongst those that do overlap
 | 
| 
 | 
  2559   for my $chr (keys %chr2phase_region2variants){
 | 
| 
 | 
  2560     my @ordered_phase_regions = sort {my($a_pos) = $a =~ /:(\d+)/; my($b_pos) = $b =~ /:(\d+)/; $a_pos <=> $b_pos} keys %{$chr2phase_region2variants{$chr}};
 | 
| 
 | 
  2561     my $sets = new DisjointSets(scalar(@ordered_phase_regions));
 | 
| 
 | 
  2562 
 | 
| 
 | 
  2563     for (my $i = 0; $i < $#ordered_phase_regions; $i++){
 | 
| 
 | 
  2564       my ($start, $stop, $variant) = $ordered_phase_regions[$i];
 | 
| 
 | 
  2565       for (my $j = $i+1; $j <= $#ordered_phase_regions; $j++){
 | 
| 
 | 
  2566         my ($start2, $stop2, $variant2) = $ordered_phase_regions[$j];
 | 
| 
 | 
  2567         if($start2 > $stop){ # won't overlap any regions after this in the sorted list
 | 
| 
 | 
  2568           last;
 | 
| 
 | 
  2569         }
 | 
| 
 | 
  2570         # If we get here, it is implicit that $stop >= $start2 and $start < $stop2, i.e. there is overlap
 | 
| 
 | 
  2571         # Now check if there is a shared variant (otherwise we might erroneously join blocks from different physical chromosomal arms) 
 | 
| 
 | 
  2572         my $have_shared_variant = 0;
 | 
| 
 | 
  2573         my $overlapping_phase_region = $ordered_phase_regions[$j];
 | 
| 
 | 
  2574         for my $variant (@{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$i]}}){
 | 
| 
 | 
  2575           if($phases->{$variant} =~ /\b$overlapping_phase_region\b/){
 | 
| 
 | 
  2576             $have_shared_variant = 1; last;
 | 
| 
 | 
  2577           }
 | 
| 
 | 
  2578         }
 | 
| 
 | 
  2579         # sanity check that there aren't conflicting variants in the new block (i.e. two different variants in the same position)
 | 
| 
 | 
  2580         my %pos2base;
 | 
| 
 | 
  2581         my $have_conflicting_variant = 0;
 | 
| 
 | 
  2582         for my $variant (@{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$i]}}, @{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$j]}}){
 | 
| 
 | 
  2583           my ($pos, $base) = $variant =~ /(\d+):(.+?)$/;
 | 
| 
 | 
  2584           if(exists $pos2base{$pos} and $pos2base{$pos} ne $base){
 | 
| 
 | 
  2585             # conflict, note with a caveat
 | 
| 
 | 
  2586             if(exists $chr2caveats{"$chr:$pos"}){
 | 
| 
 | 
  2587               $chr2caveats{"$chr:$pos"} .= "; inconsistent haplotype phasing" unless $chr2caveats{"$chr:$pos"} =~ /inconsistent haplotype phasing/;
 | 
| 
 | 
  2588             }
 | 
| 
 | 
  2589             else{
 | 
| 
 | 
  2590               $chr2caveats{"$chr:$pos"} = "inconsistent haplotype phasing";
 | 
| 
 | 
  2591             }
 | 
| 
 | 
  2592             $have_conflicting_variant ||= 1;
 | 
| 
 | 
  2593           }
 | 
| 
 | 
  2594           elsif(not exists $pos2base{$pos}){ 
 | 
| 
 | 
  2595             $pos2base{$pos} = $base;
 | 
| 
 | 
  2596           }
 | 
| 
 | 
  2597         }
 | 
| 
 | 
  2598 
 | 
| 
 | 
  2599         $sets->union($i+1, $j+1) if $have_shared_variant and not $have_conflicting_variant; # indexes are one-based for sets rather than 0-based
 | 
| 
 | 
  2600       }
 | 
| 
 | 
  2601     }
 | 
| 
 | 
  2602     my $phase_sets = $sets->sets; #disjoint haplotype sets
 | 
| 
 | 
  2603     my %region_counts;
 | 
| 
 | 
  2604     for my $phase_set (@$phase_sets){
 | 
| 
 | 
  2605       next if scalar(@$phase_set) == 1; # No change to existing phase region (is disjoint from all others)
 | 
| 
 | 
  2606       # Generate a new haploblock to replace the old ones that are being merged
 | 
| 
 | 
  2607       my $merged_start = 10000000000;
 | 
| 
 | 
  2608       my $merged_end = 0;
 | 
| 
 | 
  2609       for my $ordered_phase_region_index (@$phase_set){
 | 
| 
 | 
  2610         my ($start, $end) = $ordered_phase_regions[$ordered_phase_region_index-1] =~ /(\d+)-(\d+)$/;
 | 
| 
 | 
  2611         $merged_start = $start if $start < $merged_start;
 | 
| 
 | 
  2612         $merged_end = $end if $end > $merged_end;
 | 
| 
 | 
  2613       }
 | 
| 
 | 
  2614       # At the start of the region is a unique prefix so we can tell the arms apart if two haploblocks have the exact same boundary
 | 
| 
 | 
  2615       my $region_count = $region_counts{"$merged_start-$merged_end"}++;
 | 
| 
 | 
  2616       my $merged_haploblock_name = "Y$region_count-$chr:$merged_start-$merged_end";
 | 
| 
 | 
  2617       # Assign this new name to overwrite the premerge values for each variant contained within
 | 
| 
 | 
  2618       for my $ordered_phase_region_index (@$phase_set){
 | 
| 
 | 
  2619         for my $variant (@{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$ordered_phase_region_index-1]}}){ # incl. one-based set correction in 0-based array index
 | 
| 
 | 
  2620           print STDERR "Merging $variant from ", $phases->{$variant}, " into new block $merged_haploblock_name\n";
 | 
| 
 | 
  2621           $phases->{$variant} = $merged_haploblock_name;
 | 
| 
 | 
  2622         }
 | 
| 
 | 
  2623       }
 | 
| 
 | 
  2624     }
 | 
| 
 | 
  2625     # TODO: if there are overlapping phase blocks still, but with different variants in the same position, we can infer that they are on the opposite strands...
 | 
| 
 | 
  2626   }
 | 
| 
 | 
  2627 }
 | 
| 
 | 
  2628 
 | 
| 
 | 
  2629 # Sees if the positions of the variant are in the range of a phased haplotype, returning which allele it belongs to
 | 
| 
 | 
  2630 sub find_phase{
 | 
| 
 | 
  2631   my ($chr,$pos,$ref,$variant) = split /:/, $_[0];
 | 
| 
 | 
  2632   return "" if length($ref) != length($variant); # Can only deal with SNPs (and broken down MNPs) for now
 | 
| 
 | 
  2633   for(my $i = 0; $i < length($ref); $i++){
 | 
| 
 | 
  2634     my $key = "$chr:".($pos+$i).":".substr($variant, $i, 1);
 | 
| 
 | 
  2635     #print STDERR "Checking phase for $key\n" if $pos == 12907379;
 | 
| 
 | 
  2636     if(exists $chr2phase{$key}){
 | 
| 
 | 
  2637       #print STDERR "returning phase data $chr2phase{$key}\n";
 | 
| 
 | 
  2638       return $chr2phase{$key};
 | 
| 
 | 
  2639     }
 | 
| 
 | 
  2640     elsif(exists $chr2phase{"chr".$key}){
 | 
| 
 | 
  2641       #print STDERR "returning phase data ", $chr2phase{"chr".$key}, "\n";
 | 
| 
 | 
  2642       return $chr2phase{"chr".$key};
 | 
| 
 | 
  2643     }
 | 
| 
 | 
  2644   }
 | 
| 
 | 
  2645   return "";
 | 
| 
 | 
  2646 }
 | 
| 
 | 
  2647 
 | 
| 
 | 
  2648 sub find_earliest_index{
 | 
| 
 | 
  2649   # employs a binary search to find the smallest index that must be the starting point of a search of [start,end] elements sorted in an array by start
 | 
| 
 | 
  2650    my ($query, $array) = @_;
 | 
| 
 | 
  2651   
 | 
| 
 | 
  2652    return 0 if $query < $array->[0]->[0];
 | 
| 
 | 
  2653   
 | 
| 
 | 
  2654    my ($left, $right, $prevCenter) = (0, $#$array, -1);
 | 
| 
 | 
  2655   
 | 
| 
 | 
  2656    while(1){
 | 
| 
 | 
  2657       my $center = int (($left + $right)/2);
 | 
| 
 | 
  2658   
 | 
| 
 | 
  2659       my $cmp = $query <=> $array->[$center]->[0] || ($center == 0 || $query != $array->[$center-1]->[0] ? 0 : -1);
 | 
| 
 | 
  2660   
 | 
| 
 | 
  2661       return $center if $cmp == 0;
 | 
| 
 | 
  2662       if ($center == $prevCenter) {
 | 
| 
 | 
  2663          return $left;
 | 
| 
 | 
  2664       }
 | 
| 
 | 
  2665       $right = $center if $cmp < 0;
 | 
| 
 | 
  2666       $left  = $center if $cmp > 0;
 | 
| 
 | 
  2667       $prevCenter = $center;
 | 
| 
 | 
  2668    }
 | 
| 
 | 
  2669 }
 | 
| 
 | 
  2670 
 |