| 0 | 1 #!/usr/bin/env perl | 
|  | 2 | 
|  | 3 BEGIN{ | 
|  | 4   my $prog_dir = `dirname $0`; | 
|  | 5   chomp $prog_dir; | 
|  | 6   push @INC, $prog_dir; # so DisjointSets.pm can be found no matter the working directory | 
|  | 7 } | 
|  | 8 | 
|  | 9 use DisjointSets; # homebrew module | 
|  | 10 use Bio::DB::Sam; # for FastA reference pulls | 
|  | 11 use Bio::SeqUtils; | 
|  | 12 use Bio::Tools::CodonTable; | 
|  | 13 use Statistics::Zed; | 
|  | 14 use Getopt::Long; | 
|  | 15 use Set::IntervalTree; | 
|  | 16 use strict; | 
|  | 17 use warnings; | 
|  | 18 use vars qw($min_prop $zed $codonTable $default_transl_table %transl_except %internal_prop %dbsnp_info %chr2variant_locs %chr2dbsnp_vcf_lines %chr2internal_vcf_lines %chr2caveats %chr2phase @snvs $fasta_index $max_args $quiet); | 
|  | 19 | 
|  | 20 if(@ARGV == 1 and $ARGV[0] eq "-v"){ | 
|  | 21   print "Version 1.0\n"; | 
|  | 22   exit; | 
|  | 23 } | 
|  | 24 | 
|  | 25 #$max_args = `getconf ARG_MAX`; # largest number of args you can send to a system command (enviroment included, see limits.h) | 
|  | 26 #chomp $max_args; | 
|  | 27 $max_args = 4096; # if not defined $max_args or $max_args < 1; # the minimum since System V | 
|  | 28 $max_args -= 50; | 
|  | 29 | 
|  | 30 # find out if a variant appears in the user provided data | 
|  | 31 sub internal_prop($$$$){ | 
|  | 32     my ($chr,$pos,$ref,$variant) = @_; | 
|  | 33 | 
|  | 34     my $key = "$chr:$pos:$ref:$variant"; | 
|  | 35     if(exists $internal_prop{$key}){ | 
|  | 36         return $internal_prop{$key}; | 
|  | 37     } | 
|  | 38 | 
|  | 39     #print STDERR "Checking if internal_prop for $key exists: "; | 
|  | 40     if(exists $chr2internal_vcf_lines{$chr}->{$pos}){ | 
|  | 41       for(@{$chr2internal_vcf_lines{$chr}->{$pos}}){ | 
|  | 42         my @fields = split /\t/, $_; | 
|  | 43         if($pos == $fields[1] and length($fields[3]) == length($ref) and $fields[4] eq $variant){ | 
|  | 44           #print STDERR "yes\n"; | 
|  | 45           if(/MAF=(\d\.\d+)/){ | 
|  | 46             $internal_prop{$key} = $1; # change from percent to proportion | 
|  | 47             return $1; | 
|  | 48           } | 
|  | 49         } | 
|  | 50       } | 
|  | 51     } | 
|  | 52     else{ | 
|  | 53       #print STDERR "no\n"; | 
|  | 54     } | 
|  | 55 | 
|  | 56     $internal_prop{$key} = "NA"; | 
|  | 57     return "NA"; | 
|  | 58 } | 
|  | 59 | 
|  | 60 # find out if a variant appears in the NCBI's dbSNP | 
|  | 61 sub dbsnp_info($$$$){ | 
|  | 62     my ($chr,$pos,$ref,$variant) = @_; | 
|  | 63 | 
|  | 64     my $key = "$chr:$pos:$ref:$variant"; | 
|  | 65     if(exists $dbsnp_info{$key}){ | 
|  | 66         return @{$dbsnp_info{$key}}; | 
|  | 67     } | 
|  | 68 | 
|  | 69     if(exists $chr2dbsnp_vcf_lines{$chr}->{$pos}){ | 
|  | 70       #print STDERR "Checking existing SNP data for $chr:$pos -> ", join("\n", @{$chr2dbsnp_vcf_lines{$chr}->{$pos}}), "\n"; | 
|  | 71       for(@{$chr2dbsnp_vcf_lines{$chr}->{$pos}}){ | 
|  | 72 	my @fields = split /\t/, $_; | 
|  | 73 	for my $var (split /,/, $fields[4]){ | 
|  | 74             # Allows for different reference seqs between dbSNP and input, assuming patches only | 
|  | 75 	    if(length($fields[3]) == length($ref) and ($var eq $variant or $ref eq $var and $variant eq $fields[3])){ | 
|  | 76 		my ($freq, $subpop) = ("",""); | 
|  | 77 		$freq = $1 if $fields[7] =~ /(?:\A|;)MMAF=(0\.\d+)(?:;|\Z)/; | 
|  | 78 		$subpop = $1 if $fields[7] =~ /(?:\A|;)MMAF_SRC=(\S+?)(?:;|\Z)/; | 
|  | 79                 $dbsnp_info{$key} = [$subpop, $freq || "NA", $fields[2]]; | 
|  | 80 		return @{$dbsnp_info{$key}}; | 
|  | 81 	    } | 
|  | 82 	} | 
|  | 83       } | 
|  | 84     } | 
|  | 85     $dbsnp_info{$key} = ["novel", "NA", "NA"]; | 
|  | 86     return @{$dbsnp_info{$key}}; | 
|  | 87 } | 
|  | 88 | 
|  | 89 sub record_snv{ | 
|  | 90   my $line = join("", @_); | 
|  | 91   push @snvs, $line; | 
|  | 92 | 
|  | 93   my @fields = split /\t/, $line; | 
|  | 94   my $prop_info_key = $fields[9]; | 
|  | 95   my ($chr,$pos,$ref,$variant) = split /:/, $prop_info_key; | 
|  | 96   $chr2variant_locs{$chr} = {} unless exists $chr2variant_locs{$chr}; | 
|  | 97   return unless $ref; # ref not defined for CNVs | 
|  | 98   # Need to grab whole range for MNPs | 
|  | 99   for(my $i = 0; $i < length($ref); $i++){ | 
|  | 100     $chr2variant_locs{$chr}->{$pos+$i} = 1; | 
|  | 101   } | 
|  | 102 } | 
|  | 103 | 
|  | 104 sub retrieve_vcf_lines($$$){ | 
|  | 105   my ($dbsnp_file, $internal_snp_file, $chr) = @_; | 
|  | 106 | 
|  | 107   my (%dbsnp_lines, %internal_snp_lines); | 
|  | 108 | 
|  | 109   if(not defined $dbsnp_file or not exists $chr2variant_locs{$chr}){ | 
|  | 110     return ({}, {}, {}, {}); # no data requested for this chromosome | 
|  | 111   } | 
|  | 112 | 
|  | 113   # build up the request | 
|  | 114   my @tabix_regions; | 
|  | 115   my @var_locs = keys %{$chr2variant_locs{$chr}}; | 
|  | 116   # sort by variant start location | 
|  | 117   for my $var_loc (sort {$a <=> $b} @var_locs){ | 
|  | 118     push @tabix_regions, $chr.":".$var_loc."-".$var_loc; | 
|  | 119   } | 
|  | 120   for(my $i = 0; $i <= $#tabix_regions; $i += $max_args){ # chunkify tabix request if too many for the system to handle | 
|  | 121     my $end = $i + $max_args > $#tabix_regions ? $#tabix_regions : $i + $max_args; | 
|  | 122     my $regions = "'".join("' '", @tabix_regions[$i..$end])."'"; | 
|  | 123     # From file is very slow for some reason | 
|  | 124     #my $regions_file =  "/tmp/vcf2hgvs_$$.bed"; | 
|  | 125     #open(REQ_BED, ">$regions_file") | 
|  | 126     #  or die "Cannot open $regions_file for writing: $!\n"; | 
|  | 127     #print REQ_BED join("\n", @tabix_regions), "\n"; | 
|  | 128     #close(REQ_BED); | 
|  | 129 | 
|  | 130     # retrieve the data | 
|  | 131     die "Cannot find dbSNP VCF file $dbsnp_file\n" if not -e $dbsnp_file; | 
|  | 132 | 
|  | 133     open(VCF, "tabix $dbsnp_file $regions |") | 
|  | 134       or die "Cannot run tabix on $dbsnp_file (args ".substr($regions, 0, length($regions)>100? 100 : length($regions))."): $!\n"; | 
|  | 135     while(<VCF>){ | 
|  | 136       #if(/^(\S+\t(\d+)(?:\t\S+){6})/ and grep {$_ eq $2} @var_locs){ # take only main columns to save room, if possible | 
|  | 137       if(/^(\S+\t(\d+)(?:\t\S+){6})/ and exists $chr2variant_locs{$chr}->{$2}){ # take only main columns to save room, if possible | 
|  | 138         $dbsnp_lines{$2} = [] unless exists $dbsnp_lines{$2}; | 
|  | 139         push @{$dbsnp_lines{$2}}, $1; | 
|  | 140       } | 
|  | 141     } | 
|  | 142     close(VCF); | 
|  | 143 | 
|  | 144     if($internal_snp_file){ | 
|  | 145       die "Cannot find internal VCF file $internal_snp_file\n" if not -e $internal_snp_file; | 
|  | 146       open(VCF, "tabix $internal_snp_file $regions |") | 
|  | 147         or die "Cannot run tabix on $internal_snp_file: $!\n"; | 
|  | 148       while(<VCF>){ | 
|  | 149         #if(/^(\S+\t(\d+)(?:\t\S+){6})/ and grep {$_ eq $2} @var_locs){ # take only main columns to save room, if possible | 
|  | 150         if(/^(\S+\t(\d+)(?:\t\S+){5})/ and exists $chr2variant_locs{$chr}->{$2}){ # take only main columns to save room, if possible | 
|  | 151           $internal_snp_lines{$2} = [] unless exists $internal_snp_lines{$2}; | 
|  | 152           push @{$internal_snp_lines{$2}}, $1; | 
|  | 153         } | 
|  | 154       } | 
|  | 155       close(VCF); | 
|  | 156     } | 
|  | 157   } | 
|  | 158 | 
|  | 159   #unlink $regions_file; | 
|  | 160 | 
|  | 161   return (\%dbsnp_lines, \%internal_snp_lines); | 
|  | 162 } | 
|  | 163 | 
|  | 164 sub prop_info_key{ | 
|  | 165     my($chr,$pos,$ref,$variant,$exon_edge_dist) = @_; | 
|  | 166 | 
|  | 167     $chr =~ s/^chr//; | 
|  | 168     if($chr eq "M"){ | 
|  | 169       $chr = "MT"; # NCBI uses different name for mitochondrial chromosome | 
|  | 170       $pos-- if $pos >= 3107;  # also, doesn't keep the old positioning (historical) | 
|  | 171     } | 
|  | 172     return join(":", $chr,$pos,$ref,$variant, ($exon_edge_dist ? $exon_edge_dist : "")); | 
|  | 173 } | 
|  | 174 | 
|  | 175 sub prop_info($$$){ | 
|  | 176     my($snpfile,$internal_snps_file,$prop_info_key) = @_; | 
|  | 177 | 
|  | 178     my($chr,$pos,$ref,$variant) = split /:/, $prop_info_key; | 
|  | 179 | 
|  | 180     # is this the first call for this chromosome? If so, retrieve the VCF lines for it en masse | 
|  | 181     if(not exists $chr2dbsnp_vcf_lines{$chr}){ | 
|  | 182       ($chr2dbsnp_vcf_lines{$chr}, $chr2internal_vcf_lines{$chr}) = retrieve_vcf_lines($snpfile,$internal_snps_file,$chr); | 
|  | 183     } | 
|  | 184     my $internal_maf = 0; | 
|  | 185     if($internal_snps_file){ | 
|  | 186       $internal_maf = internal_prop($chr,$pos,$ref,$variant); | 
|  | 187       $internal_maf = 0 if $internal_maf eq "NA"; | 
|  | 188     } | 
|  | 189 | 
|  | 190     my @results = dbsnp_info($chr,$pos,$ref,$variant); | 
|  | 191 | 
|  | 192     # Not all entries have a proportion in dbSNP | 
|  | 193     return $internal_snps_file ? ($ref, $variant, @results, $internal_maf) : ($ref, $variant, @results); | 
|  | 194 } | 
|  | 195 | 
|  | 196 #offset a given HGVS nomenclature position (single position only) by a given number of bases | 
|  | 197 sub hgvs_plus($$){ | 
|  | 198    my ($hgvs, $offset) = @_; | 
|  | 199     if($hgvs =~ /^(\S+)(-\d+)(.*)/){ | 
|  | 200 	# all negative | 
|  | 201 	if($2+$offset<0){ | 
|  | 202 	    return $1.($2+$offset).$3; | 
|  | 203 	} | 
|  | 204 	# switches to positive, need to mod | 
|  | 205 	else{ | 
|  | 206 	    return $1+($2+$offset); | 
|  | 207 	} | 
|  | 208     } | 
|  | 209     elsif($hgvs =~ /^(\S+)\+(\d+)(.*)/){ | 
|  | 210 	# all positive | 
|  | 211 	if($2+$offset>0){ | 
|  | 212 	    return $1."+".($2+$offset).$3; | 
|  | 213 	} | 
|  | 214 	# switches to negative, need to mod | 
|  | 215 	else{ | 
|  | 216 	    return $1+($2+$offset); | 
|  | 217 	} | 
|  | 218     } | 
|  | 219     elsif($hgvs =~ /^(-?\d+)(.*)/){ | 
|  | 220 	# special case if offset spans -/+ since there is no position 0 | 
|  | 221 	if($1 < 0 and $1+$offset >= 0){ | 
|  | 222 	    $offset++; | 
|  | 223 	} | 
|  | 224 	elsif($1 > 0 and $1+$offset <= 0){ | 
|  | 225 	    $offset--; | 
|  | 226 	} | 
|  | 227 	return ($1+$offset).$2; | 
|  | 228     } | 
|  | 229     else{ | 
|  | 230 	die "Cannot convert $hgvs to a new offset ($offset), only single base position nomenclature is currently supported\n"; | 
|  | 231     } | 
|  | 232 } | 
|  | 233 | 
|  | 234 # offset a given position by a given number of bases, | 
|  | 235 # taking into account that if the new offset crosses the threshold in the last argument, | 
|  | 236 # HGVS boundary nomenclature has to be introduced | 
|  | 237 sub hgvs_plus_exon($$$){ | 
|  | 238     my ($pos, $offset, $boundary) = @_; | 
|  | 239 | 
|  | 240     # special case if offset spans -/+ since there is no position 0 | 
|  | 241     if($pos =~ /^(-?\d+)(.*)/){ | 
|  | 242       if($1 < 0 and $1+$offset >= 0){ | 
|  | 243          $offset++; | 
|  | 244       } | 
|  | 245       elsif($1 > 0 and $1+$offset <= 0){ | 
|  | 246          $offset--; | 
|  | 247       } | 
|  | 248     } | 
|  | 249     my $new_pos = $pos + $offset; | 
|  | 250     if($new_pos > $boundary and $pos <= $boundary){ | 
|  | 251 	# just moved into an intron 3' | 
|  | 252 	$new_pos = $boundary."+".($new_pos-$boundary); | 
|  | 253     } | 
|  | 254     elsif($new_pos < $boundary and $pos >= $boundary){ | 
|  | 255 	# just moved into an intron 5' | 
|  | 256 	$new_pos = $boundary.($new_pos-$boundary); | 
|  | 257     } | 
|  | 258     return $new_pos; | 
|  | 259 } | 
|  | 260 | 
|  | 261 # given a nucleotide position, calculates the AA there (assumes coding region) | 
|  | 262 sub getCodonFromSeq($$$$){ | 
|  | 263   my ($chr_ref, $location, $frame_offset, $strand) = @_; | 
|  | 264 | 
|  | 265   my $codon; | 
|  | 266   if($strand eq "+"){ | 
|  | 267     $codon = substr($$chr_ref, $location-1-$frame_offset, 3); | 
|  | 268   } | 
|  | 269   else{ | 
|  | 270     $codon = substr($$chr_ref, $location-3+$frame_offset, 3); | 
|  | 271     $codon = reverse($codon); | 
|  | 272     $codon =~ tr/ACGTacgt/TGCAtgca/; | 
|  | 273   } | 
|  | 274   return $codon; | 
|  | 275 } | 
|  | 276 | 
|  | 277 sub getCodonFromSeqIndex($$$$){ | 
|  | 278   my ($chr, $location, $frame_offset, $strand) = @_; | 
|  | 279 | 
|  | 280   my $codon; | 
|  | 281   if($strand eq "+"){ | 
|  | 282     $codon = $fasta_index->fetch($chr.":".($location-$frame_offset)."-".($location-$frame_offset+2)); | 
|  | 283   } | 
|  | 284   else{ | 
|  | 285     $codon = $fasta_index->fetch($chr.":".($location-2+$frame_offset)."-".($location+$frame_offset)); | 
|  | 286     $codon = reverse($codon); | 
|  | 287     $codon =~ tr/ACGTacgt/TGCAtgca/; | 
|  | 288   } | 
|  | 289   return $codon; | 
|  | 290 } | 
|  | 291 | 
|  | 292 sub getAAFromSeq($$$$$){ | 
|  | 293   return $_[4]->translate(getCodonFromSeq($_[0], $_[1], $_[2], $_[3])); | 
|  | 294 } | 
|  | 295 | 
|  | 296 sub getAAFromSeqIndex($$$$$){ | 
|  | 297   # convert codon to AA | 
|  | 298   if(exists $transl_except{"$_[0]:$_[1]"}){ | 
|  | 299     return $transl_except{"$_[0]:$_[1]"}; | 
|  | 300   } | 
|  | 301   else{ | 
|  | 302     return $_[4]->translate(getCodonFromSeqIndex($_[0], $_[1], $_[2], $_[3])); | 
|  | 303   } | 
|  | 304 } | 
|  | 305 | 
|  | 306 sub hgvs_protein{ | 
|  | 307   my ($chr, $location, $ref, $variant, $cdna_pos, $strand, $transl_table) = @_; | 
|  | 308 | 
|  | 309   if(substr($ref,0,1) eq substr($variant,0,1)){ | 
|  | 310     substr($ref,0,1) = ""; | 
|  | 311     substr($variant,0,1) = ""; | 
|  | 312     $location++; | 
|  | 313     if($strand eq "-"){ | 
|  | 314       $cdna_pos--; | 
|  | 315     } | 
|  | 316     else{ | 
|  | 317       $cdna_pos++; | 
|  | 318     } | 
|  | 319   } | 
|  | 320 | 
|  | 321   if($cdna_pos !~ /^\d+/){ | 
|  | 322     die "Aborting: got illegal cDNA position ($cdna_pos) for protein HGVS conversion of position ", | 
|  | 323         "$location, ref $ref, variant $variant. Please correct the program code.\n"; | 
|  | 324   } | 
|  | 325   # Get the correct frame for the protein translation, to know what codons are affected | 
|  | 326   my $aapos = int(($cdna_pos-1)/3)+1; | 
|  | 327 | 
|  | 328   # does it destroy the start codon? | 
|  | 329   if($cdna_pos < 4){ # assumes animal codon usage | 
|  | 330     return "p.0?"; # indicates start codon missing, unsure of effect | 
|  | 331   } | 
|  | 332 | 
|  | 333   my $table = $transl_table ne $default_transl_table ?  # non standard translation table requested | 
|  | 334     Bio::Tools::CodonTable->new(-id=>$transl_table) : $codonTable; | 
|  | 335 | 
|  | 336   my $frame_offset = ($cdna_pos-1)%3; | 
|  | 337   my $origAA = getAAFromSeqIndex($chr, $location, $frame_offset, $strand, $table); | 
|  | 338   # take 100000 bp on either side for translation context of variant seq | 
|  | 339   my $five_prime_buffer = $location < 10000 ? $location-1 : 10000; | 
|  | 340   my $mutSeq = $fasta_index->fetch($chr.":".($location-$five_prime_buffer)."-".($location+10000)); | 
|  | 341 | 
|  | 342   # substitute all of the immediately adjacent variants in phase with this one to get the correct local effect | 
|  | 343   substr($mutSeq, $five_prime_buffer, length($ref)) = $variant; | 
|  | 344 | 
|  | 345   # does it cause a frameshift? | 
|  | 346   my $length_diff = length($variant)-length($ref); | 
|  | 347   if($length_diff%3){ # insertion or deletion not a multiple of three | 
|  | 348     my $fs_codon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1, $frame_offset, $strand); | 
|  | 349     my $ext = 0; | 
|  | 350     my $newAA; | 
|  | 351     do{ | 
|  | 352        $ext++; | 
|  | 353        # The "NA"s below make it so that we don't pick up any translation exceptions from the original reference annotation | 
|  | 354        if($strand eq "+"){ | 
|  | 355          $newAA = getAAFromSeq(\$mutSeq, $five_prime_buffer+1+$ext*3, $frame_offset, $strand, $table); | 
|  | 356        } | 
|  | 357        else{ | 
|  | 358          $newAA = getAAFromSeq(\$mutSeq, $five_prime_buffer+1-$ext*3, $frame_offset, $strand, $table); | 
|  | 359        } | 
|  | 360      } while($newAA ne "*"); | 
|  | 361 | 
|  | 362     return "p.".$origAA.$aapos.$table->translate($fs_codon)."fs*$ext"; | 
|  | 363   } | 
|  | 364 | 
|  | 365   # does it cause a stop codon to be lost? | 
|  | 366   if($origAA eq "*"){ | 
|  | 367      my $stopChangeCodon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1, $frame_offset, $strand); | 
|  | 368      # still a stop after the mutation (ignore translation exceptions) | 
|  | 369      if($table->is_ter_codon($stopChangeCodon)){ | 
|  | 370         return "p.*$aapos="; | 
|  | 371      } | 
|  | 372      # calculate the new stop, assuming there aren't mutations downstream in candidate stop codons | 
|  | 373      my $ext = 0; | 
|  | 374      my $newCodon; | 
|  | 375      do{ | 
|  | 376        if($strand eq "+"){ | 
|  | 377          $newCodon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1+(++$ext*3), $frame_offset, $strand); | 
|  | 378        } | 
|  | 379        else{ | 
|  | 380          $newCodon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1-(++$ext*3), $frame_offset, $strand); | 
|  | 381        } | 
|  | 382      } while(not $table->is_ter_codon($newCodon)); | 
|  | 383 | 
|  | 384      return "p.*".$aapos.$table->translate($stopChangeCodon)."ext*".$ext; | 
|  | 385   } | 
|  | 386 | 
|  | 387   # if we get this far, it's a "regular" AA level change | 
|  | 388   my $origAAs = ""; | 
|  | 389   for(my $i = 0; $i < length($ref)+$frame_offset; $i+=3){ | 
|  | 390     my $oldAA = getAAFromSeqIndex($chr, $location+$i, $frame_offset, $strand, $table); | 
|  | 391     if($strand eq "+"){ | 
|  | 392       $origAAs .= $oldAA; | 
|  | 393     } | 
|  | 394     else{ | 
|  | 395       $origAAs = $oldAA . $origAAs; | 
|  | 396     } | 
|  | 397   } | 
|  | 398   my $newAAs = ""; | 
|  | 399   for(my $i = 0; $i < length($variant)+$frame_offset; $i+=3){ | 
|  | 400     # NA means we don't take translation exceptions from the original | 
|  | 401     my $newAA = getAAFromSeq(\$mutSeq, $five_prime_buffer+1+$i, $frame_offset, $strand, $table); | 
|  | 402     if($strand eq "+"){ | 
|  | 403       $newAAs .= $newAA; | 
|  | 404     } | 
|  | 405     else{ | 
|  | 406       $newAAs = $newAA . $newAAs; | 
|  | 407     } | 
|  | 408   } | 
|  | 409 | 
|  | 410   # silent | 
|  | 411   if($origAAs eq $newAAs){ | 
|  | 412       return "p.".$origAAs.$aapos."="; | 
|  | 413   } | 
|  | 414 | 
|  | 415   # minimize the difference if there are leading or trailing AAs the same | 
|  | 416   my $delLength = length($ref); | 
|  | 417   while(substr($newAAs, 0, 1) eq substr($origAAs, 0, 1)){ | 
|  | 418     $newAAs = substr($newAAs, 1); | 
|  | 419     $origAAs = substr($origAAs, 1); | 
|  | 420     $location+=3; | 
|  | 421     $delLength-=3; | 
|  | 422     $aapos++; | 
|  | 423   } | 
|  | 424   while(substr($newAAs, -1) eq substr($origAAs, -1)){ | 
|  | 425     $newAAs = substr($newAAs, 0, length($newAAs)-1); | 
|  | 426     $origAAs = substr($origAAs, 0, length($origAAs)-1); | 
|  | 427   } | 
|  | 428 | 
|  | 429   # insertion | 
|  | 430   if(length($origAAs) == 0){ | 
|  | 431     my $insAAs = getAAFromSeqIndex($chr,$location-3,$frame_offset,$strand,$table).($aapos-1)."_". | 
|  | 432                  getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table); | 
|  | 433     return "p.".$insAAs.$aapos."ins".$newAAs; | 
|  | 434   } | 
|  | 435   # deletion | 
|  | 436   elsif(length($newAAs) == 0){ | 
|  | 437     my $delAAs; | 
|  | 438     if(length($origAAs) == 1){ | 
|  | 439        $delAAs = getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table).$aapos; # single AA deletion | 
|  | 440     } | 
|  | 441     else{ # deleting a stretch | 
|  | 442        if($strand eq "+"){ | 
|  | 443          my $endPoint = $location+$delLength-1; | 
|  | 444          $delAAs = getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table).$aapos."_". | 
|  | 445                    getAAFromSeqIndex($chr,$endPoint,$frame_offset,$strand,$table).($aapos+int(($delLength-1)/3)); | 
|  | 446        } | 
|  | 447        else{ | 
|  | 448          my $endPoint = $location-$delLength+1; | 
|  | 449          $delAAs = getAAFromSeqIndex($chr,$endPoint,$frame_offset,$strand,$table).($aapos-int(($delLength-1)/3))."_". | 
|  | 450                    getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table).$aapos; | 
|  | 451        } | 
|  | 452     } | 
|  | 453     return "p.".$delAAs."del"; | 
|  | 454   } | 
|  | 455   else{ | 
|  | 456     # substitution | 
|  | 457     if(length($origAAs) == 1 and length($newAAs) == 1){ | 
|  | 458       return "p.".$origAAs.$aapos.$newAAs; | 
|  | 459     } | 
|  | 460     # indel | 
|  | 461     elsif(length($origAAs) != 1){ | 
|  | 462        # convert ref stretch into range syntax | 
|  | 463        if($strand eq "+"){ | 
|  | 464          $origAAs = substr($origAAs, 0, 1).$aapos."_".substr($origAAs, -1).($aapos+length($origAAs)-1); | 
|  | 465        } | 
|  | 466        else{ | 
|  | 467          $origAAs = substr($origAAs, 0, 1).($aapos-length($origAAs)+1)."_".substr($origAAs, -1).$aapos; | 
|  | 468        } | 
|  | 469     } | 
|  | 470     return "p.".$origAAs."delins".$newAAs; | 
|  | 471   } | 
|  | 472   return ("NA", ""); | 
|  | 473 } | 
|  | 474 | 
|  | 475 sub z2p{ | 
|  | 476   if(not defined $zed){ | 
|  | 477     $zed = new Statistics::Zed; | 
|  | 478   } | 
|  | 479   my $p = $zed->z2p(value => $_[0]); | 
|  | 480   return $p < 0.0000000001 ? 0 : $p; | 
|  | 481 } | 
|  | 482 sub gq2p{ | 
|  | 483     return $_[0] > 200 ? 0 : 10**($_[0]/-10); | 
|  | 484 } | 
|  | 485 | 
|  | 486 my ($multi_phased, $min_depth, $flanking_bases, $dbsnp, $internal_snp, $genename_bed_file, $dir_1000G, $dir_esp6500, $min_pvalue, $mappability_file, $reference_file, $samtools_phasing_file, $exons_file, $input_file, $output_file, $cnv_file, $dgv_file, $which_chr, $enrichment_regions_file, $rare_variant_prop); | 
|  | 487 $multi_phased = 0; | 
|  | 488 $min_depth = 2; | 
|  | 489 $flanking_bases = 30; | 
|  | 490 $min_pvalue = 0.01; | 
|  | 491 $min_prop = 0.14; | 
|  | 492 $rare_variant_prop = 0.05; | 
|  | 493 $input_file = "-"; # STDIN by default | 
|  | 494 $output_file = "-"; # STDOUT by default | 
|  | 495 $default_transl_table = "1"; # assumes NCBI 'Standard' table, unless it is an argument to the script... | 
|  | 496 &GetOptions("d=i"       => \$min_depth, | 
|  | 497             "f=i"       => \$flanking_bases, | 
|  | 498             "s=s"       => \$dbsnp, | 
|  | 499             "t=s"       => \$dir_1000G, | 
|  | 500             "n=s"       => \$dir_esp6500, | 
|  | 501             "u=s"       => \$internal_snp, | 
|  | 502             "q"         => \$quiet, | 
|  | 503             "p=f"       => \$min_pvalue, | 
|  | 504             "h=f"       => \$min_prop, | 
|  | 505             "m=s"       => \$mappability_file, | 
|  | 506             "r=s"       => \$reference_file, | 
|  | 507             "z=s"       => \$samtools_phasing_file, | 
|  | 508             "e=s"       => \$exons_file, | 
|  | 509             "i=s"       => \$input_file, | 
|  | 510             "c=s"       => \$cnv_file, | 
|  | 511             "g=s"       => \$dgv_file, | 
|  | 512             "b=s"       => \$genename_bed_file, | 
|  | 513             "w=s"       => \$which_chr, | 
|  | 514             "o=s"       => \$output_file, | 
|  | 515             "a=i"       => \$default_transl_table, | 
|  | 516             "v=f"       => \$rare_variant_prop, | 
|  | 517             "x=s"       => \$enrichment_regions_file); # if enrichment regions are specified, variants without a transcript model but in these ranges will be reported | 
|  | 518 | 
|  | 519 if(($input_file ne "/dev/null" and not defined $reference_file) or | 
|  | 520    not defined $exons_file or | 
|  | 521    (defined $cnv_file and not defined $dgv_file)){ | 
|  | 522   die "Usage: $0 [-v(ersion)] [-q(uiet)] [-w(hich) contig_to_report (default is all)] [-d(epth of variant reads req'd) #] [-v(ariant max freq to count as rare)] [-f(lanking exon bases to report) #] [-p(robability of random genotype, maximum to report) 0.#]\n", | 
|  | 523       "  [-h(et proportion of variant reads, minimum to report) 0.#] [-c(opy number) variants_file.bed -g(enomic structural) variants_control_db.txt.gz] [-z file_containing_samtools_phase_output.txt]\n", | 
|  | 524       "  [-t(housand) genomes_integrated_vcfs_gz_dir] [-n ESP6500_dir] [-u(ser) specified_population.vcf.gz] [-m(appability) crg_file.bed]\n", | 
|  | 525       "  [-x enrichment_regions_file.bed] [-a(mino) acid translation table number from NCBI]\n", | 
|  | 526       "  [-i(nput) genotypes.vcf <-r(eference) sequence_file.fasta>] [-o(utput) hgvs_file.tsv] [-s(np) database_from_ncbi.vcf.gz]\n", | 
|  | 527       "  <-b(ed) file of named gene regions.bed> <-e(xons) file.gtf>\n\n", | 
|  | 528       "Input gz files must be indexed with Tabix.\nDefault input is STDIN, default output is STDOUT.  Note: if -c is specified, polyploidies are are assume to be proximal. Other defaults: -d 2, -v 0.05, -f 30, -p 0.01, -h 0.14 -a 1\nReference sequence is not strictly necessary if only CNV are being annotated.\n"; | 
|  | 529 } | 
|  | 530 | 
|  | 531 print STDERR "Considering $flanking_bases flanking bases for variants as well\n" unless $quiet; | 
|  | 532 | 
|  | 533 $codonTable = new Bio::Tools::CodonTable(id => $default_transl_table); | 
|  | 534 | 
|  | 535 my %enrichment_regions; | 
|  | 536 # Note, we assume the regions are non-overlapping | 
|  | 537 if(defined $enrichment_regions_file){ | 
|  | 538   print STDERR "Loading enrichment regions...\n" unless $quiet; | 
|  | 539   open(BED, $enrichment_regions_file) | 
|  | 540     or die "Cannot open $enrichment_regions_file for reading: $!\n"; | 
|  | 541   while(<BED>){ | 
|  | 542     chomp; | 
|  | 543     my @F = split /\t/, $_; | 
|  | 544     $enrichment_regions{$F[0]} = [] if not exists $enrichment_regions{$F[0]}; | 
|  | 545     push @{$enrichment_regions{$F[0]}}, [$F[1], $F[2]]; | 
|  | 546   } | 
|  | 547   close(BED); | 
|  | 548 } | 
|  | 549 for my $chr (keys %enrichment_regions){ # sort by start | 
|  | 550   $enrichment_regions{$chr} = [sort {$a->[0] <=> $b->[0]} @{$enrichment_regions{$chr}}]; | 
|  | 551 } | 
|  | 552 | 
|  | 553 if(defined $reference_file){ | 
|  | 554   print STDERR "Scanning reference FastA info\n" unless $quiet; | 
|  | 555   if(not -e $reference_file){ | 
|  | 556     die "Reference FastA file ($reference_file) does not exist.\n"; | 
|  | 557   } | 
|  | 558   if(not -e $reference_file.".fai" and not -w dirname($reference_file)){ | 
|  | 559     die "Reference FastA file ($reference_file) is not indexed, and the directory is not writable.\n"; | 
|  | 560   } | 
|  | 561   $fasta_index = Bio::DB::Sam::Fai->load($reference_file); | 
|  | 562 } | 
|  | 563 | 
|  | 564 my %chr2mappability; | 
|  | 565 if(defined $mappability_file){ | 
|  | 566   print STDERR "Reading in mappability data\n" unless $quiet; | 
|  | 567   my ($nmer) = $mappability_file =~ /(\d+).*?$/; | 
|  | 568   die "Cannot determine nmer from nmer file name $mappability_file, aborting\n" unless $nmer; | 
|  | 569   open(MAP, $mappability_file) | 
|  | 570     or die "Cannot open mappability data file $mappability_file for reading: $!\n"; | 
|  | 571   <MAP>; # header | 
|  | 572   while(<MAP>){ | 
|  | 573     next if /^#/; | 
|  | 574     chomp; | 
|  | 575     my @F = split /\t/, $_; | 
|  | 576     my $x = int(1/$F[3]+0.5); | 
|  | 577     $chr2mappability{$F[0]} = Set::IntervalTree->new() if not exists $chr2mappability{$F[0]}; | 
|  | 578     $chr2mappability{$F[0]}->insert("non-unique mapping region (x$x)", $F[1], $F[2]+$nmer-1); | 
|  | 579   } | 
|  | 580   close(MAP); | 
|  | 581 } | 
|  | 582 | 
|  | 583 # Is phasing data provided? | 
|  | 584 if(defined $samtools_phasing_file){ | 
|  | 585   print STDERR "Reading in phasing data\n" unless $quiet; | 
|  | 586   open(PHASE, $samtools_phasing_file) | 
|  | 587     or die "Cannot open phasing data file $samtools_phasing_file for reading: $!\n"; | 
|  | 588   my $phase_range; | 
|  | 589   while(<PHASE>){ | 
|  | 590     if(/^PS/){ | 
|  | 591       chomp; | 
|  | 592       my @F = split /\t/, $_; | 
|  | 593       $phase_range = "$F[2]-$F[3]"; | 
|  | 594     } | 
|  | 595     if(/^M[12]/){ | 
|  | 596       chomp; | 
|  | 597       my @F = split /\t/, $_; | 
|  | 598       #ignore strange cases where haplotype reference has no cases (weird samtools call) | 
|  | 599       next if $F[9] == 0 or $F[7] == 0; | 
|  | 600       my $chr = $F[1]; | 
|  | 601       next if defined $which_chr and not $chr eq $which_chr; | 
|  | 602       my $pos = $F[3]; | 
|  | 603       #print STDERR "Recording phase for $chr:$pos:$F[4] , $chr:$pos:$F[5] as A-$chr:$phase_range and B-$chr:$phase_range\n" if $pos == 12907379; | 
|  | 604       if(($F[10]+$F[8])/($F[9]+$F[7]) >= $min_prop){ # error meets reporting threshold | 
|  | 605         $chr2caveats{"$chr:$pos"} .= "; " if exists $chr2caveats{"$chr:$pos"}; | 
|  | 606         $chr2caveats{"$chr:$pos"} .= "inconsistent haplotype phasing"; | 
|  | 607       } | 
|  | 608       else{ # appears to be a genuine phasing | 
|  | 609         $chr2phase{"$chr:$pos:$F[4]"} = "A-$chr:$phase_range"; # grouping for haplotype | 
|  | 610         $chr2phase{"$chr:$pos:$F[5]"} = "B-$chr:$phase_range"; # grouping for haplotype | 
|  | 611       } | 
|  | 612     } | 
|  | 613   } | 
|  | 614   close(PHASE); | 
|  | 615 } | 
|  | 616 | 
|  | 617 # Check the VCF file to see if contains phase data | 
|  | 618 open(VCFIN, $input_file) | 
|  | 619     or die "Cannot open $input_file for reading: $!\n"; | 
|  | 620 my $phase_chr = ""; | 
|  | 621 my @phase_dataA; | 
|  | 622 my @phase_dataB; | 
|  | 623 while(<VCFIN>){ | 
|  | 624     if(/^\s*(?:#|$)/){ # blank or hash comment | 
|  | 625        next; | 
|  | 626     } | 
|  | 627     my @F = split /\t/, $_; | 
|  | 628     next if exists $chr2caveats{"$F[0]:$F[1]"} and $chr2caveats{"$F[0]:$F[1]"} =~ /inconsistent haplotype phasing/; | 
|  | 629     # | indicates phased | 
|  | 630     if($F[8] =~ m(^(\d+)\|(\d+):)){ | 
|  | 631       next if $1 eq $2;  # not useful to us (actually would mess up phase combining later on), but is provided sometimes | 
|  | 632       # start of a phasing block | 
|  | 633       if($phase_chr eq ""){ | 
|  | 634         $phase_chr = $F[0]; | 
|  | 635       } | 
|  | 636       my @vars = split /,/, $F[4]; | 
|  | 637       if($1 > @vars){ | 
|  | 638         die "Invalid VCF file (line #$.): First haplotype listed as $1, but only ", scalar(@vars), " variants were provided (", join(",", @vars), "\n"; | 
|  | 639       } | 
|  | 640       if($2 > @vars){ | 
|  | 641         die "Invalid VCF file (line #$.): Second haplotype listed as $1, but only ", scalar(@vars), " variants were provided (", join(",", @vars), "\n"; | 
|  | 642       } | 
|  | 643       unshift @vars, $F[3]; | 
|  | 644       push @phase_dataA, [$F[1], $vars[$1]]; | 
|  | 645       push @phase_dataB, [$F[1], $vars[$2]]; | 
|  | 646     } | 
|  | 647     # non phased het call, ends any phasing block there might be | 
|  | 648     elsif($F[8] =~ m(^0/1)){ | 
|  | 649       # Did we just finish a phased block? If so, output it. | 
|  | 650       if(@phase_dataA > 1){ | 
|  | 651         my $phase_def = "G-$phase_chr:".$phase_dataA[0]->[0]."-".$phase_dataA[$#phase_dataA]->[0]; | 
|  | 652         for my $d (@phase_dataA){ | 
|  | 653           my ($p, $v) = @$d; | 
|  | 654           if(exists $chr2phase{"$phase_chr:$p:$v"}){ | 
|  | 655             $chr2phase{"$phase_chr:$p:$v"} .= ",$phase_def"; | 
|  | 656             $multi_phased ||= 1; | 
|  | 657           } | 
|  | 658           else{ | 
|  | 659             $chr2phase{"$phase_chr:$p:$v"} = $phase_def; | 
|  | 660           } | 
|  | 661         } | 
|  | 662         $phase_def = "H-$phase_chr:".$phase_dataB[0]->[0]."-".$phase_dataB[$#phase_dataB]->[0]; | 
|  | 663         for my $d (@phase_dataB){ | 
|  | 664           my ($p, $v) = @$d; | 
|  | 665           if(exists $chr2phase{"$phase_chr:$p:$v"}){ | 
|  | 666             $chr2phase{"$phase_chr:$p:$v"} = ",$phase_def"; | 
|  | 667             $multi_phased ||= 1; | 
|  | 668           } | 
|  | 669           else{ | 
|  | 670             $chr2phase{"$phase_chr:$p:$v"} = $phase_def; | 
|  | 671           } | 
|  | 672         } | 
|  | 673       } | 
|  | 674       if($phase_chr ne ""){ | 
|  | 675         $phase_chr = ""; | 
|  | 676         @phase_dataA = (); | 
|  | 677         @phase_dataB = (); | 
|  | 678       } | 
|  | 679     } | 
|  | 680 } | 
|  | 681 | 
|  | 682 print STDERR "Reading in feature GTF data..." unless $quiet; | 
|  | 683 my %feature_range; # chr => transcript_id => [[genomic_exon_start,genomic_exon_end,cdna_start_pos],...] | 
|  | 684 my %feature_intervaltree; # chr => transcript_id => [[genomic_exon_start,genomic_exon_end,cdna_start_pos],...] | 
|  | 685 my %feature_strand; # transcript_id => +|- | 
|  | 686 my $feature_count = 0; | 
|  | 687 my %feature_min; | 
|  | 688 my %feature_max; | 
|  | 689 my %feature_cds_min; | 
|  | 690 my %feature_cds_max; | 
|  | 691 my %feature_contig; | 
|  | 692 my %feature_length; | 
|  | 693 my %feature_type; | 
|  | 694 my %feature_transl_table; # note alternate translation table usage | 
|  | 695 my %chr_read; | 
|  | 696 open(GTF, $exons_file) | 
|  | 697     or die "Cannot open $exons_file for reading: $!\n"; | 
|  | 698 while(<GTF>){ | 
|  | 699     next if /^\s*#/; | 
|  | 700     my @fields = split /\t/, $_; | 
|  | 701     next if defined $which_chr and $fields[0] ne $which_chr and "chr$fields[0]" ne $which_chr and $fields[0] ne "chr$which_chr"; | 
|  | 702 | 
|  | 703     if($fields[2] eq "exon" or $fields[2] eq "CDS"){ | 
|  | 704 	next unless $fields[$#fields] =~ /transcript_id \"(.*?)\"/o; | 
|  | 705 	my $parent = $1; | 
|  | 706 	if(not $quiet and not exists $chr_read{$fields[0]}){ | 
|  | 707 	    print STDERR " $fields[0]"; | 
|  | 708 	    $chr_read{$fields[0]} = 1; | 
|  | 709 	} | 
|  | 710 	if(not exists $feature_strand{$parent}){ | 
|  | 711 	    $feature_strand{$parent} = $fields[6]; | 
|  | 712 	    $feature_contig{$parent} = $fields[0]; | 
|  | 713             if($fields[$#fields] =~ /transcript_type \"(.*?)\"/){ | 
|  | 714               $feature_type{$parent} = $1; | 
|  | 715             } | 
|  | 716             else{ | 
|  | 717               $feature_type{$parent} = "NA"; | 
|  | 718             } | 
|  | 719 	} | 
|  | 720 	if($fields[2] eq "CDS"){ | 
|  | 721 	    #print STDERR "CDS value for $parent is $fields[2]..$fields[3]\n"; | 
|  | 722 	    if(not exists $feature_cds_min{$parent} or $fields[3] < $feature_cds_min{$parent}){ | 
|  | 723 		$feature_cds_min{$parent} = $fields[3]; | 
|  | 724 	    } | 
|  | 725 	    if(not exists $feature_cds_max{$parent} or $fields[4] > $feature_cds_max{$parent}){ | 
|  | 726 		$feature_cds_max{$parent} = $fields[4]; | 
|  | 727 	    } | 
|  | 728             if($fields[$#fields] =~ /transl_table \"(\d+)\"/){ | 
|  | 729                 $feature_transl_table{$parent} = $1; #assume one translation table per CDS, which should be reasonable | 
|  | 730             } | 
|  | 731             while($fields[$#fields] =~ /transl_except \"pos:(\S+?),aa:(\S+?)\"/g){ | 
|  | 732                 my $pos = $1; | 
|  | 733                 my $new_aa = $2; # needs to change from three letter code to 1 | 
|  | 734                 if($new_aa =~ /^ter/i){ # can be funny so have special case (allows TERM, etc.) | 
|  | 735                   $new_aa = "*"; | 
|  | 736                 } | 
|  | 737                 elsif(length($new_aa) == 3){ | 
|  | 738                   $new_aa = Bio::SeqUtils->new()->seq3in($new_aa); | 
|  | 739                 } | 
|  | 740                 if($pos =~ /^(\d+)\.\.(\d+)/){ | 
|  | 741                   for my $p ($1..$2){ | 
|  | 742                     $transl_except{"$fields[0]:$p"} = $new_aa; | 
|  | 743                   } | 
|  | 744                 } | 
|  | 745                 else{ | 
|  | 746                   $transl_except{"$fields[0]:$pos"} = $new_aa; | 
|  | 747                 } | 
|  | 748             } | 
|  | 749 	    next; | 
|  | 750 	} | 
|  | 751 	if(not exists $feature_min{$parent} or $fields[3] < $feature_min{$parent}){ | 
|  | 752 	    $feature_min{$parent} = $fields[3]; | 
|  | 753 	} | 
|  | 754 	if(not exists $feature_max{$parent} or $fields[4] > $feature_max{$parent}){ | 
|  | 755 	    $feature_max{$parent} = $fields[4]; | 
|  | 756 	} | 
|  | 757 | 
|  | 758 	$feature_count++; | 
|  | 759 	if(not exists $feature_range{$fields[0]}){ | 
|  | 760 	    $feature_range{$fields[0]} = {}; # Chr => {parentID => [start,stop]} | 
|  | 761             $feature_intervaltree{$fields[0]} = Set::IntervalTree->new(); | 
|  | 762 	} | 
|  | 763 	if(not exists $feature_range{$fields[0]}->{$parent}){ | 
|  | 764 	    $feature_range{$fields[0]}->{$parent} = []; | 
|  | 765 	} | 
|  | 766         push @{$feature_range{$fields[0]}->{$parent}}, [$fields[3],$fields[4]]; | 
|  | 767         $feature_intervaltree{$fields[0]}->insert($parent, $fields[3], $fields[4]+1); # ranges need to have positive length for module to work properly | 
|  | 768         $feature_length{$parent} += $fields[4]-$fields[3]+1; | 
|  | 769     } | 
|  | 770 } | 
|  | 771 close(GTF); | 
|  | 772 print STDERR "\nFound $feature_count exons on ", scalar(keys %feature_range), " contigs in the GTF file\n" unless $quiet; | 
|  | 773 | 
|  | 774 for my $contig (keys %feature_range){ | 
|  | 775     for my $parent (keys %{$feature_range{$contig}}){ | 
|  | 776 	# sort by subrange start | 
|  | 777 	my @feature_ranges = sort {$a->[0] <=> $b->[0]} @{$feature_range{$contig}->{$parent}}; | 
|  | 778 	$feature_range{$contig}->{$parent} = \@feature_ranges; | 
|  | 779 	$feature_range{"chr".$contig}->{$parent} = \@feature_ranges if not $contig =~ /^chr/; | 
|  | 780 	$feature_range{$1}->{$parent} = \@feature_ranges if $contig =~ /^chr(\S+)/; | 
|  | 781     } | 
|  | 782 } | 
|  | 783 | 
|  | 784 # Calculate the cDNA position of the leftmost (reference strand) base for each exon | 
|  | 785 for my $contig (keys %feature_range){ | 
|  | 786     for my $parent (keys %{$feature_range{$contig}}){ | 
|  | 787         my @feature_ranges = @{$feature_range{$contig}->{$parent}}; | 
|  | 788         if($feature_strand{$parent} eq "-"){ | 
|  | 789             # set up utr offset for correct CDS coordinates | 
|  | 790             my $feature_offset = 0; | 
|  | 791             for(my $i = $#feature_ranges; $i >= 0; $i--){ | 
|  | 792                 last if not $feature_cds_max{$parent}; | 
|  | 793                 # exon is completely 5' of the start | 
|  | 794                 if($feature_ranges[$i]->[0] > $feature_cds_max{$parent}){ | 
|  | 795                     $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1; | 
|  | 796                 } | 
|  | 797                 # exon with the cds start | 
|  | 798                 elsif($feature_ranges[$i]->[1] >= $feature_cds_max{$parent} and | 
|  | 799                       $feature_ranges[$i]->[0] <= $feature_cds_max{$parent}){ | 
|  | 800                     $feature_offset += $feature_cds_max{$parent} - $feature_ranges[$i]->[1]; | 
|  | 801                     last; | 
|  | 802                 } | 
|  | 803                 else{ | 
|  | 804                     die "The CDS for $parent (on negative strand) ends downstream ", | 
|  | 805                     "($feature_cds_max{$parent}) of the an exon", | 
|  | 806                     " (", $feature_ranges[$i]->[0], "), which is illogical.  Please revise the GFF file provided.\n"; | 
|  | 807                 } | 
|  | 808             } | 
|  | 809             for(my $i = $#feature_ranges; $i >= 0; $i--){ | 
|  | 810               $feature_offset += $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1; | 
|  | 811               $feature_ranges[$i]->[2] = $feature_offset-1; | 
|  | 812             } | 
|  | 813        } | 
|  | 814        else{ # positive strand | 
|  | 815             # set up utr offset for correct CDS coordinates | 
|  | 816             my $feature_offset = 0; | 
|  | 817             for(my $i = 0; $i <= $#feature_ranges; $i++){ | 
|  | 818                 last if not $feature_cds_min{$parent}; | 
|  | 819                 # All 5' utr exon | 
|  | 820                 if($feature_ranges[$i]->[1] < $feature_cds_min{$parent}){ | 
|  | 821                     $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1; | 
|  | 822                 } | 
|  | 823                 # exon with the cds start | 
|  | 824                 elsif($feature_ranges[$i]->[1] >= $feature_cds_min{$parent} and | 
|  | 825                       $feature_ranges[$i]->[0] <= $feature_cds_min{$parent}){ | 
|  | 826                     $feature_offset -= $feature_cds_min{$parent} - $feature_ranges[$i]->[0]; | 
|  | 827                     last; | 
|  | 828                 } | 
|  | 829                 else{ | 
|  | 830                     die "The CDS for $parent starts upstream ($feature_cds_min{$parent}) of the first exon", | 
|  | 831                     " (", $feature_ranges[$i]->[0], "), which is illogical.  Please revise the GFF file provided.\n"; | 
|  | 832                 } | 
|  | 833             } | 
|  | 834             # assign cDNA coords for each exon to the third array element | 
|  | 835             for(my $i = 0; $i <= $#feature_ranges; $i++){ | 
|  | 836               $feature_ranges[$i]->[2] = $feature_offset; | 
|  | 837               $feature_offset += $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1; | 
|  | 838             } | 
|  | 839        } | 
|  | 840     } | 
|  | 841 } | 
|  | 842 | 
|  | 843 print STDERR "Reading in gene name definitions...\n" unless $quiet; | 
|  | 844 die "Data file $genename_bed_file does not exist, aborting.\n" if not -e $genename_bed_file; | 
|  | 845 my %gene_ids; | 
|  | 846 open(TAB, $genename_bed_file) | 
|  | 847   or die "Cannot open gene name BED file $genename_bed_file for reading: $!\n"; | 
|  | 848 while(<TAB>){ | 
|  | 849   chomp; | 
|  | 850   # format should be "chr start stop gene_name ..." | 
|  | 851   my @fields = split /\t/, $_; | 
|  | 852   next if $#fields < 3; | 
|  | 853   my $c = $fields[0]; | 
|  | 854   if(not exists $gene_ids{$c}){ | 
|  | 855     $gene_ids{$c} = Set::IntervalTree->new(); | 
|  | 856   } | 
|  | 857   $gene_ids{$c}->insert($fields[3], $fields[1], $fields[2]); | 
|  | 858 } | 
|  | 859 | 
|  | 860 # Print output header | 
|  | 861 open(OUT, ">$output_file") | 
|  | 862   or die "Cannot open $output_file for writing: $!\n"; | 
|  | 863 | 
|  | 864 print OUT join("\t", "Feature type", "Transcript length", "Selected transcript", "Transcript HGVS", "Strand", "Chr", "DNA From", "DNA To", "Zygosity", "P-value", "Variant Reads", "Total Reads", | 
|  | 865 	       "Ref base", "Obs base", "Pop. freq. source", "Pop. freq.", "Variant DB ID"), "\t", | 
|  | 866                ($internal_snp ? "Internal pop. freq.\t" : ""), | 
|  | 867                join("\t", "Protein HGVS", "Closest exon junction (AA coding variants)", "Gene Name", "Caveats", "Phase", "Num rare variants in gene (MAF <= $rare_variant_prop)", "Num rare coding and splice site variants in gene (MAF <= $rare_variant_prop)"),"\n"; | 
|  | 868 | 
|  | 869 # If there is CNV data, load it. | 
|  | 870 # BED columns should be chr start stop caveats ploidy . ignored ignored r,g,b | 
|  | 871 # The dot means the strand doesn't matter. | 
|  | 872 # where the first five fields are required, others optional | 
|  | 873 # where r,g,b is overloaded with father,mother ploidies and "b" is integer representing affected status logical AND (father bit mask 1, mother bit mask 2) | 
|  | 874 if(defined $cnv_file){ | 
|  | 875   print STDERR "Reading in CNV data...\n" unless $quiet; | 
|  | 876   open(CNV, $cnv_file) | 
|  | 877     or die "Cannot open $cnv_file for reading: $!\n"; | 
|  | 878   while(<CNV>){ | 
|  | 879     chomp; | 
|  | 880     my @F = split /\t/,  $_, -1; | 
|  | 881     if(@F < 5){ | 
|  | 882       print STDERR "Skipping unparseable line ($cnv_file #$.): $_\n"; | 
|  | 883       next; | 
|  | 884     } | 
|  | 885     my $ploidy = $F[4]; | 
|  | 886     my $cnv_chr = $F[0]; | 
|  | 887     next if defined $which_chr and $cnv_chr ne $which_chr and "chr$cnv_chr" ne $which_chr and $cnv_chr ne "chr$which_chr"; | 
|  | 888     my $cnv_start = $F[1]; | 
|  | 889     my $cnv_end = $F[2]; | 
|  | 890     my $p_value = "NA"; | 
|  | 891     if($F[3] =~ s/p-value=(\S+?)(?:;|$)//){ | 
|  | 892       $p_value = $1; | 
|  | 893       next if $min_pvalue < $p_value; | 
|  | 894     } | 
|  | 895 | 
|  | 896     # Report a variant line for each gene that is found in this CNV | 
|  | 897     my $target_parents = $feature_intervaltree{$cnv_chr}->fetch($cnv_start, $cnv_end+1); | 
|  | 898 | 
|  | 899     my $caveats = ""; | 
|  | 900     if(@F == 9){ | 
|  | 901       my @parents_ploidy = split /,/, $F[8]; | 
|  | 902       if($parents_ploidy[2] == 0){ # neither parent affected | 
|  | 903         if($ploidy < $parents_ploidy[0] and $ploidy < $parents_ploidy[1]){ | 
|  | 904           if($ploidy > 2){ | 
|  | 905             $caveats = "Polyploidy is less severe than in either unaffected parents"; | 
|  | 906           } | 
|  | 907           # else: no caveats, this offspring has fewer copies than normally observed, or in unaffected parents | 
|  | 908           elsif($ploidy < 2){ | 
|  | 909             if($parents_ploidy[0] == 2 and $parents_ploidy[1] == 2){ | 
|  | 910               $caveats = "De novo copy loss, unaffected parents are diploid"; | 
|  | 911             } | 
|  | 912             else{ | 
|  | 913               $caveats = "Copy loss is greater than in either unaffected parent"; | 
|  | 914             } | 
|  | 915           } | 
|  | 916         } | 
|  | 917         elsif($ploidy >= $parents_ploidy[0] and $ploidy <= $parents_ploidy[1] or | 
|  | 918            $ploidy >= $parents_ploidy[1] and $ploidy <= $parents_ploidy[0]){ | 
|  | 919           $caveats = "Aneuploidy likely inherited from an unaffected parent"; | 
|  | 920         } | 
|  | 921         elsif($ploidy > $parents_ploidy[0] and $ploidy > $parents_ploidy[1]){ | 
|  | 922           if($parents_ploidy[0] > 2){ | 
|  | 923             if($parents_ploidy[1] > 2){ | 
|  | 924               $caveats = "Lower polyploidy already exists in both unaffected parents"; | 
|  | 925             } | 
|  | 926             else{ | 
|  | 927               $caveats = "Lower polyploidy already exists in unaffected father"; | 
|  | 928             } | 
|  | 929           } | 
|  | 930           else{ | 
|  | 931             if($parents_ploidy[1] > 2){ | 
|  | 932               $caveats = "Lower polyploidy already exists in unaffected mother"; | 
|  | 933             } | 
|  | 934             # else no caveats, because both parents are "normal", yet we have polyploidy in the offspring | 
|  | 935             else{ | 
|  | 936               $caveats = "De novo polyploidy, unaffected parents are diploid"; | 
|  | 937             } | 
|  | 938           } | 
|  | 939         } | 
|  | 940         # else | 
|  | 941         else{ | 
|  | 942           die "Oops! Error in program logic...how did we get here (unaffected parents)? $_"; | 
|  | 943         } | 
|  | 944       } | 
|  | 945       elsif($parents_ploidy[2] == 1){ # father affected | 
|  | 946           if($ploidy == $parents_ploidy[1]){ # just like unaffected Mom | 
|  | 947             if($ploidy > 2){ | 
|  | 948               if($ploidy == $parents_ploidy[0]){ | 
|  | 949                 $caveats = "Same polyploidy present in both affected and unaffected parents"; | 
|  | 950               } | 
|  | 951               else{ | 
|  | 952                 $caveats = "Polyploidy inherited from unaffected mother"; | 
|  | 953               } | 
|  | 954             } | 
|  | 955             elsif($ploidy < 2){ | 
|  | 956               if($ploidy == $parents_ploidy[0]){ | 
|  | 957                 $caveats = "Same copy loss in both affected and unaffected parents"; | 
|  | 958               } | 
|  | 959               else{ | 
|  | 960                 $caveats = "Copy loss is shared with unaffected mother"; | 
|  | 961               } | 
|  | 962             } | 
|  | 963             else{ | 
|  | 964               if($ploidy == $parents_ploidy[0]){ | 
|  | 965                 # Why was this even reported? parents and child have diploid status... | 
|  | 966                 next; | 
|  | 967               } | 
|  | 968               $caveats = "Diploidy is shared with unaffected mother"; | 
|  | 969             } | 
|  | 970           } | 
|  | 971           elsif($ploidy > 2){ # polyploidy | 
|  | 972             if($parents_ploidy[0] == 2){ | 
|  | 973               if($parents_ploidy[1] > 2){ | 
|  | 974                 $caveats = "Unaffected mother has polyploidy (".$parents_ploidy[1]."x), but affected father is diploid"; | 
|  | 975               } | 
|  | 976               elsif($parents_ploidy[1] == 2){ | 
|  | 977                 $caveats = "Both unaffected mother and affected father are diploid"; | 
|  | 978               } | 
|  | 979               else{ | 
|  | 980                 $caveats = "Affected father is diploid, unaffected mother has copy loss (".$parents_ploidy[1]."x)"; | 
|  | 981               } | 
|  | 982             } | 
|  | 983             elsif($parents_ploidy[0] < 2){ | 
|  | 984               $caveats = "Polyploidy found, but affected father had copy loss (".$parents_ploidy[0]."x)"; | 
|  | 985             } | 
|  | 986             elsif($ploidy < $parents_ploidy[1]){ | 
|  | 987               $caveats = "Polyploidy is less severe than in unaffected mother (".$parents_ploidy[1]."x), or affected father (".$parents_ploidy[0]."x)"; | 
|  | 988             } | 
|  | 989             # past here the ploidy is great than in the unaffected mother | 
|  | 990             elsif($parents_ploidy[1] < 2){ | 
|  | 991               $caveats = "Polyploidy is also severe in affected father (".$parents_ploidy[0]."x), but unaffected mother actually had copy loss (". $parents_ploidy[1]. "x)"; | 
|  | 992             } | 
|  | 993             elsif($parents_ploidy[1] == 2){ | 
|  | 994               $caveats = "Polyploidy is also severe in affected father (".$parents_ploidy[0]."x), and mother is diploid"; | 
|  | 995             } | 
|  | 996             elsif($ploidy < $parents_ploidy[0]){ | 
|  | 997               $caveats = "Polyploidy is less severe than in affected father (".$parents_ploidy[0]."x), but more severe than unaffected mother (". $parents_ploidy[1]. "x)"; | 
|  | 998             } | 
|  | 999             elsif($ploidy > $parents_ploidy[0]){ | 
|  | 1000               $caveats = "Polyploidy is more severe than in affected father (".$parents_ploidy[0]."x)"; | 
|  | 1001             } | 
|  | 1002             else{ | 
|  | 1003               $caveats = "Polyploidy is as severe as in affected father"; | 
|  | 1004             } | 
|  | 1005           } | 
|  | 1006           elsif($ploidy == 2){ | 
|  | 1007             # Don't report diploid status, any funny recombination should show up in large indel analysis | 
|  | 1008             next; | 
|  | 1009           } | 
|  | 1010           else{ # copies < 2 | 
|  | 1011             if($ploidy == $parents_ploidy[0]){ | 
|  | 1012               if($ploidy > $parents_ploidy[1]){ | 
|  | 1013                 $caveats = "Copy loss is the same as affected father, but less than unaffected mother (". $parents_ploidy[1]. "x)"; | 
|  | 1014               } | 
|  | 1015               else{ | 
|  | 1016                 $caveats = "Copy loss is as severe as in affected father"; | 
|  | 1017               } | 
|  | 1018             } | 
|  | 1019             elsif($ploidy > $parents_ploidy[0]){ | 
|  | 1020               if($ploidy > $parents_ploidy[1]){ | 
|  | 1021                 if($parents_ploidy[1] == 0 and $parents_ploidy[0] == 0){ | 
|  | 1022                   $caveats = "Poor mapping, or Mendelian inheritence violation is severe: no copies of region in either parent, but present in offspring"; | 
|  | 1023                 } | 
|  | 1024                 elsif($ploidy == 2){ | 
|  | 1025                   next; # child got best of both parents, ignore from CNV standpoint (may still have SNPs of course, or translocation, etc.) | 
|  | 1026                 } | 
|  | 1027                 else{ | 
|  | 1028                   $caveats = "Copy loss is less severe than in unaffected mother (".$parents_ploidy[1]."x), or affected father (".$parents_ploidy[0]."x)"; | 
|  | 1029                 } | 
|  | 1030               } | 
|  | 1031               # else: child has less copies than unaffected mom, but more than affected Dad | 
|  | 1032               else{ | 
|  | 1033                 if($parents_ploidy[1] > 2){ | 
|  | 1034                   $caveats = "Copy loss was more severe in affected father (".$parents_ploidy[0]."x), but unaffected mother had polyploidy (".$parents_ploidy[1]."x)"; | 
|  | 1035                 } | 
|  | 1036                 elsif($parents_ploidy[1] == 2){ | 
|  | 1037                   $caveats = "Copy loss was more severe in affected father (".$parents_ploidy[0]."x), but unaffected mother was diploid"; | 
|  | 1038                 } | 
|  | 1039                 else{ # unaffected has loss | 
|  | 1040                   $caveats = "Copy loss is more severe than unaffect mother (".$parents_ploidy[1]."x), but less severe than affected father (".$parents_ploidy[0]."x)"; | 
|  | 1041                 } | 
|  | 1042               } | 
|  | 1043             } | 
|  | 1044             # past here, ploidy is less than affected father | 
|  | 1045             elsif($parents_ploidy[1] > 2){ | 
|  | 1046               $caveats = "Copy loss is more severe than affected father (".$parents_ploidy[0]."x), and unaffected mother had polyploidy (".$parents_ploidy[1]."x)"; | 
|  | 1047             } | 
|  | 1048             elsif($parents_ploidy[1] == 2){ | 
|  | 1049               $caveats = "Copy loss is more severe than in affected father (".$parents_ploidy[0]."x)"; | 
|  | 1050             } | 
|  | 1051             else{ | 
|  | 1052               $caveats = "Copy loss is more severe than in both unaffect mother (".$parents_ploidy[1]."x), and affected father (".$parents_ploidy[0]."x)"; | 
|  | 1053             } | 
|  | 1054           } | 
|  | 1055       } | 
|  | 1056       elsif($parents_ploidy[2] == 2){ # mother affected | 
|  | 1057           if($ploidy == $parents_ploidy[0]){ # just like unaffected Dad | 
|  | 1058             if($ploidy > 2){ | 
|  | 1059               if($ploidy == $parents_ploidy[1]){ | 
|  | 1060                 $caveats = "Same polyploidy present in both affected and unaffected parents"; | 
|  | 1061               } | 
|  | 1062               else{ | 
|  | 1063                 $caveats = "Polyploidy inherited from unaffected father"; | 
|  | 1064               } | 
|  | 1065             } | 
|  | 1066             elsif($ploidy < 2){ | 
|  | 1067               if($ploidy == $parents_ploidy[1]){ | 
|  | 1068                 $caveats = "Same copy loss in both affected and unaffected parents"; | 
|  | 1069               } | 
|  | 1070               else{ | 
|  | 1071                 $caveats = "Copy loss is shared with unaffected father"; | 
|  | 1072               } | 
|  | 1073             } | 
|  | 1074             else{ | 
|  | 1075               if($ploidy == $parents_ploidy[1]){ | 
|  | 1076                 # Why was this even reported? parents and child have diploid status... | 
|  | 1077                 next; | 
|  | 1078               } | 
|  | 1079               $caveats = "Diploidy is shared with unaffected father"; | 
|  | 1080             } | 
|  | 1081           } | 
|  | 1082           elsif($ploidy > 2){ # polyploidy | 
|  | 1083             if($parents_ploidy[1] == 2){ | 
|  | 1084               if($parents_ploidy[0] > 2){ | 
|  | 1085                 $caveats = "Unaffected father has polyploidy (".$parents_ploidy[0]."x), but affected mother is diploid"; | 
|  | 1086               } | 
|  | 1087               elsif($parents_ploidy[0] == 2){ | 
|  | 1088                 $caveats = "Both unaffected father and affected mother are diploid"; | 
|  | 1089               } | 
|  | 1090               else{ | 
|  | 1091                 $caveats = "Affected mother is diploid, unaffected father has copy loss (".$parents_ploidy[1]."x)"; | 
|  | 1092               } | 
|  | 1093             } | 
|  | 1094             elsif($parents_ploidy[1] < 2){ | 
|  | 1095               $caveats = "Polyploidy found, but affected mother had copy loss (".$parents_ploidy[1]."x)"; | 
|  | 1096             } | 
|  | 1097             elsif($ploidy < $parents_ploidy[0]){ | 
|  | 1098               $caveats = "Polyploidy is less severe than in unaffected father (".$parents_ploidy[0]."x), or affected mother (".$parents_ploidy[1]."x)"; | 
|  | 1099             } | 
|  | 1100             # past here the ploidy is great than in the unaffected father | 
|  | 1101             elsif($parents_ploidy[0] < 2){ | 
|  | 1102               $caveats = "Polyploidy is also severe in affected mother (".$parents_ploidy[1]."x), but unaffected father actually had copy loss (". $parents_ploidy[0]. "x)"; | 
|  | 1103             } | 
|  | 1104             elsif($parents_ploidy[0] == 2){ | 
|  | 1105               $caveats = "Polyploidy is also severe in affected mother (".$parents_ploidy[1]."x), and unaffected father is diploid"; | 
|  | 1106             } | 
|  | 1107             elsif($ploidy < $parents_ploidy[1]){ | 
|  | 1108               $caveats = "Polyploidy is less severe than in affected mother (".$parents_ploidy[1]."x), but more severe than unaffected father (". $parents_ploidy[0]. "x)"; | 
|  | 1109             } | 
|  | 1110             elsif($ploidy > $parents_ploidy[1]){ | 
|  | 1111               $caveats = "Polyploidy is more severe than in affected mother (".$parents_ploidy[1]."x)"; | 
|  | 1112             } | 
|  | 1113             else{ | 
|  | 1114               $caveats = "Polyploidy is as severe as in affected mother"; | 
|  | 1115             } | 
|  | 1116           } | 
|  | 1117           elsif($ploidy == 2){ | 
|  | 1118             # Don't report diploid status, any funny recombination should show up in large indel analysis | 
|  | 1119             next; | 
|  | 1120           } | 
|  | 1121           else{ # copies < 2 | 
|  | 1122             if($ploidy == $parents_ploidy[1]){ | 
|  | 1123               if($ploidy > $parents_ploidy[0]){ | 
|  | 1124                 $caveats = "Copy loss is the same as affected mother, but less than unaffected father (". $parents_ploidy[0]. "x)"; | 
|  | 1125               } | 
|  | 1126               else{ | 
|  | 1127                 $caveats = "Copy loss is as severe as in affected mother"; | 
|  | 1128               } | 
|  | 1129             } | 
|  | 1130             elsif($ploidy > $parents_ploidy[1]){ | 
|  | 1131               if($ploidy > $parents_ploidy[0]){ | 
|  | 1132                 if($parents_ploidy[1] == 0 and $parents_ploidy[0] == 0){ | 
|  | 1133                   $caveats = "Poor mapping, or Mendelian inheritence violation is severe: no copies of region in either parent, but present in offspring"; | 
|  | 1134                 } | 
|  | 1135                 elsif($ploidy == 2){ | 
|  | 1136                   next; # child got best of both parents, ignore from CNV standpoint (may still have SNPs of course, or translocation, etc.) | 
|  | 1137                 } | 
|  | 1138                 else{ | 
|  | 1139                   $caveats = "Copy loss is less severe than in unaffected father (".$parents_ploidy[0]."x), or affected mother (".$parents_ploidy[1]."x)"; | 
|  | 1140                 } | 
|  | 1141               } | 
|  | 1142               # else: child has less copies than unaffected Dad, but more than affected Mom | 
|  | 1143               else{ | 
|  | 1144                 if($parents_ploidy[0] > 2){ | 
|  | 1145                   $caveats = "Copy loss was more severe in affected mother (".$parents_ploidy[1]."x), but unaffected father had polyploidy (".$parents_ploidy[0]."x)"; | 
|  | 1146                 } | 
|  | 1147                 elsif($parents_ploidy[0] == 2){ | 
|  | 1148                   $caveats = "Copy loss was more severe in affected mother (".$parents_ploidy[1]."x), but unaffected father was diploid"; | 
|  | 1149                 } | 
|  | 1150                 else{ # unaffected has loss | 
|  | 1151                   $caveats = "Copy loss is more severe than unaffect father (".$parents_ploidy[0]."x), but less severe than affected mother (".$parents_ploidy[1]."x)"; | 
|  | 1152                 } | 
|  | 1153               } | 
|  | 1154             } | 
|  | 1155             # past here, ploidy is less than affected mother | 
|  | 1156             elsif($parents_ploidy[0] > 2){ | 
|  | 1157               $caveats = "Copy loss is more severe than affected mother (".$parents_ploidy[1]."x), and unaffected father had polyploidy (".$parents_ploidy[0]."x)"; | 
|  | 1158             } | 
|  | 1159             elsif($parents_ploidy[0] == 2){ | 
|  | 1160               $caveats = "Copy loss is more severe than in affected mother (".$parents_ploidy[1]."x)"; | 
|  | 1161             } | 
|  | 1162             else{ | 
|  | 1163               $caveats = "Copy loss is more severe than in both unaffect father (".$parents_ploidy[0]."x), and affected mother (".$parents_ploidy[1]."x)"; | 
|  | 1164             } | 
|  | 1165           } | 
|  | 1166         } | 
|  | 1167 | 
|  | 1168     } | 
|  | 1169     if($F[3] and $F[3] ne "-"){ # prexisting caveat from CNV caller | 
|  | 1170       if(defined $caveats){ | 
|  | 1171         $caveats .= "; $F[3]" unless $caveats =~ /\b$F[3]\b/; | 
|  | 1172       } | 
|  | 1173       else{ | 
|  | 1174        $caveats  = $F[3]; | 
|  | 1175       } | 
|  | 1176     } | 
|  | 1177 | 
|  | 1178     # Sort by start for consistency | 
|  | 1179     my @target_parents = sort {$feature_range{$cnv_chr}->{$a}->[0]->[0] <=> $feature_range{$cnv_chr}->{$b}->[0]->[0]} @$target_parents; | 
|  | 1180 | 
|  | 1181     for my $target_parent (@target_parents){ | 
|  | 1182       my $target_caveats = $caveats; | 
|  | 1183       my $strand = $feature_strand{$target_parent}; | 
|  | 1184       # report the gain/loss of each gene separately, for simplicity in downstream analysis | 
|  | 1185       my $cnv_exon_start = 10000000000; # genomic coords | 
|  | 1186       my $cnv_exon_end = 0; | 
|  | 1187       my $cnv_cdna_start = 0; # cDNA coords | 
|  | 1188       my $cnv_cdna_end = 0; | 
|  | 1189       my $off5 = 0; # border outside the exon? | 
|  | 1190       my $off3 = 0; | 
|  | 1191       my @feature_ranges = @{$feature_range{$cnv_chr}->{$target_parent}}; | 
|  | 1192       # find the first and last exons in the gene that are inside the CNV | 
|  | 1193       for my $subregion (@feature_ranges){ | 
|  | 1194         # exon overlaps CNV? | 
|  | 1195         if($subregion->[0] <= $cnv_end and $subregion->[1] >= $cnv_start){ | 
|  | 1196           if($cnv_exon_start > $subregion->[0]){ | 
|  | 1197             if($cnv_start < $subregion->[0]){ | 
|  | 1198               $cnv_exon_start = $subregion->[0]; $off5 = 1; | 
|  | 1199               $cnv_cdna_start = $subregion->[2]; | 
|  | 1200             } | 
|  | 1201             else{ | 
|  | 1202               $cnv_exon_start = $cnv_start; $off5 = 0; | 
|  | 1203               $cnv_cdna_start = $subregion->[2]+($strand eq "-" ? $subregion->[0]-$cnv_start: $cnv_start-$subregion->[0]); | 
|  | 1204             } | 
|  | 1205           } | 
|  | 1206           if($cnv_exon_end < $subregion->[1]){ | 
|  | 1207             if($cnv_end > $subregion->[1]){ | 
|  | 1208               $cnv_exon_end = $subregion->[1]; $off3 = 1; | 
|  | 1209               $cnv_cdna_end = $subregion->[2]+($strand eq "-" ? $subregion->[0]-$subregion->[1] : $subregion->[1]-$subregion->[0]); | 
|  | 1210             } | 
|  | 1211             else{ | 
|  | 1212               $cnv_exon_end = $cnv_end; $off3 = 0; | 
|  | 1213               $cnv_cdna_end = $subregion->[2]+($strand eq "-" ? $subregion->[0]-$cnv_end : $cnv_end-$subregion->[0]); | 
|  | 1214             } | 
|  | 1215           } | 
|  | 1216         } | 
|  | 1217       } | 
|  | 1218 | 
|  | 1219     my $ends_internally = 0; | 
|  | 1220     if($cnv_exon_end == 0){ # ends inside the exon | 
|  | 1221       $cnv_exon_end = $cnv_end; | 
|  | 1222       $ends_internally = 1; | 
|  | 1223     } | 
|  | 1224     # See if it's in the structural variant database | 
|  | 1225     my @gain_coverage; $#gain_coverage = $cnv_exon_end-$cnv_exon_start; # preallocate blanks | 
|  | 1226     my @loss_coverage; $#loss_coverage = $cnv_exon_end-$cnv_exon_start; # preallocate blanks | 
|  | 1227     my $dgv_loss_id; # report the DGV entry that covers most of the observed structural variant | 
|  | 1228     my $dgv_loss_length = 0; # report the DGV entry that covers most of the observed structural variant | 
|  | 1229     my $dgv_gain_id; # report the DGV entry that covers most of the observed structural variant | 
|  | 1230     my $dgv_gain_length = 0; # report the DGV entry that covers most of the observed structural variant | 
|  | 1231     my $gains; | 
|  | 1232     my $losses; | 
|  | 1233     my $dgv_chr = $cnv_chr; | 
|  | 1234     $dgv_chr =~ s/^chr//; # no prefix in DGV | 
|  | 1235     #open(DGV, "tabix $dgv_file $dgv_chr:$cnv_exon_start-$cnv_exon_end |") # check out CNV in this gene model region | 
|  | 1236     #  or die "Cannot run tabix: $!\n"; | 
|  | 1237     open(DGV, "/dev/null"); | 
|  | 1238     while(<DGV>){ | 
|  | 1239       my @C = split /\t/, $_; | 
|  | 1240       next if $C[4] ne "CNV"; # todo: handle indels? | 
|  | 1241       my $dgv_start = $C[2]; | 
|  | 1242       my $dgv_end = $C[3]; | 
|  | 1243       my $dgv_direction = $C[5]; | 
|  | 1244       my $gain_cov_count = 0; | 
|  | 1245       my $loss_cov_count = 0; | 
|  | 1246       if($dgv_direction eq "Gain"){ | 
|  | 1247         for(my $i = ($dgv_start < $cnv_exon_start ? $cnv_exon_start : $dgv_start); $i <= $dgv_end and $i <= $cnv_exon_end; $i++){ | 
|  | 1248           $gain_coverage[$i-$cnv_exon_start] = 1 unless defined $gain_coverage[$i-$cnv_exon_start]; | 
|  | 1249           $gain_cov_count++; | 
|  | 1250         } | 
|  | 1251       } | 
|  | 1252       elsif($dgv_direction eq "Loss"){ | 
|  | 1253         for(my $i = ($dgv_start < $cnv_exon_start ? $cnv_exon_start : $dgv_start); $i <= $dgv_end and $i <= $cnv_exon_end; $i++){ | 
|  | 1254           $loss_coverage[$i-$cnv_exon_start] = 1 unless defined $loss_coverage[$i-$cnv_exon_start]; | 
|  | 1255           $loss_cov_count++; | 
|  | 1256         } | 
|  | 1257       } | 
|  | 1258       if($dgv_direction eq "Gain" and $gain_cov_count > $dgv_gain_length){ | 
|  | 1259         $dgv_gain_id = $C[0]; | 
|  | 1260         $dgv_gain_length = $gain_cov_count; | 
|  | 1261       } | 
|  | 1262       if($dgv_direction eq "Loss" and $loss_cov_count > $dgv_loss_length){ | 
|  | 1263         $dgv_loss_id = $C[0]; | 
|  | 1264         $dgv_loss_length = $loss_cov_count; | 
|  | 1265       } | 
|  | 1266     } | 
|  | 1267     close(DGV); | 
|  | 1268 | 
|  | 1269     my $gain_coverage = 0; | 
|  | 1270     for my $count (@gain_coverage){ | 
|  | 1271       $gain_coverage++ if defined $count; | 
|  | 1272     } | 
|  | 1273     $gain_coverage = sprintf "%.3f", $gain_coverage/($cnv_exon_end-$cnv_exon_start+1); # make it a proportion | 
|  | 1274     my $loss_coverage = 0; | 
|  | 1275     for my $count (@loss_coverage){ | 
|  | 1276       $loss_coverage++ if defined $count; | 
|  | 1277     } | 
|  | 1278     $loss_coverage = sprintf "%.3f", $loss_coverage/($cnv_exon_end-$cnv_exon_start+1); # make it a proportion | 
|  | 1279 | 
|  | 1280     my $src = "DGV"; | 
|  | 1281     my $dgv_id = "NA"; | 
|  | 1282     my $dgv_caveat; | 
|  | 1283     my $dgv_coverage; | 
|  | 1284     if($ploidy > 2){ | 
|  | 1285       if(not defined $dgv_gain_id){ | 
|  | 1286         if(defined $dgv_loss_id){ | 
|  | 1287           $dgv_id = sprintf "%s/%.3f", $dgv_loss_id, $dgv_loss_length/($cnv_exon_end-$cnv_exon_start+1); | 
|  | 1288           $dgv_caveat = "; No gains are known in healthy controls, the DGV2 ID reported is for a loss in the same area"; | 
|  | 1289           $dgv_coverage = $loss_coverage; | 
|  | 1290         } | 
|  | 1291         else{ | 
|  | 1292           $dgv_id = "novel"; | 
|  | 1293           $dgv_coverage = "NA"; | 
|  | 1294           $src = "NA"; | 
|  | 1295         } | 
|  | 1296       } | 
|  | 1297       else{ | 
|  | 1298         $dgv_id = sprintf "%s/%.3f", $dgv_gain_id, $dgv_gain_length/($cnv_exon_end-$cnv_exon_start+1); | 
|  | 1299         $dgv_coverage = $gain_coverage; | 
|  | 1300       } | 
|  | 1301     } | 
|  | 1302     elsif($ploidy < 2){ | 
|  | 1303       if(not defined $dgv_loss_id){ | 
|  | 1304         if(defined $dgv_gain_id){ | 
|  | 1305           $dgv_id = sprintf "%s/%.3f", $dgv_gain_id, $dgv_gain_length/($cnv_exon_end-$cnv_exon_start+1); | 
|  | 1306           $dgv_caveat = "; No losses are known in healthy controls, the DGV2 ID reported is for a gain in the same area"; | 
|  | 1307           $dgv_coverage = $gain_coverage; | 
|  | 1308         } | 
|  | 1309         else{ | 
|  | 1310           $dgv_id = "novel"; | 
|  | 1311           $dgv_coverage = "NA"; | 
|  | 1312           $src = "NA"; | 
|  | 1313         } | 
|  | 1314       } | 
|  | 1315       else{ | 
|  | 1316         $dgv_id = sprintf "%s/%.3f", $dgv_loss_id, $dgv_loss_length/($cnv_exon_end-$cnv_exon_start+1); | 
|  | 1317         $dgv_coverage = $loss_coverage; | 
|  | 1318       } | 
|  | 1319     } | 
|  | 1320 | 
|  | 1321     my $non_coding = 0; | 
|  | 1322     if(not exists $feature_cds_max{$target_parent} or not defined $feature_cds_max{$target_parent} or $feature_cds_max{$target_parent} eq ""){ | 
|  | 1323         $non_coding = 1; | 
|  | 1324     } | 
|  | 1325     $target_caveats .= $dgv_caveat if defined $dgv_caveat and $dgv_id ne "novel" and $target_caveats !~ /\Q$dgv_caveat\E/; | 
|  | 1326     #print "Recorded $cnv_chr:$cnv_start caveat $caveats\n"; | 
|  | 1327       # if it doesn't overlap an exon, we need to find out which two exons it's between | 
|  | 1328     if($ends_internally){ | 
|  | 1329         my $intron_found = 0; | 
|  | 1330         for(my $i = 0; $i < $#feature_ranges; $i++){ | 
|  | 1331           if($feature_ranges[$i]->[1] < $cnv_start and $feature_ranges[$i+1]->[0] > $cnv_end){ | 
|  | 1332             if($ploidy > 2){ # gain | 
|  | 1333               if($strand eq "-"){ | 
|  | 1334                 record_snv("$target_parent\t", | 
|  | 1335                    ($non_coding ? "g.$cnv_start\_$cnv_end" : | 
|  | 1336                     "c.".($feature_ranges[$i+1]->[2])."+".($feature_ranges[$i+1]->[0]-$cnv_end)."_".($feature_ranges[$i+1]->[2]+1)."-".($cnv_start-$feature_ranges[$i]->[1])), | 
|  | 1337                 # pos	Zygosity	P-value	Variant Reads	Total Reads	Ref Bases	Var Bases	Population Frequency Source	Pop. freq. or DGV2 gain/loss coverage	dbSNP or DGV2 ID | 
|  | 1338                    "[".($ploidy-1)."]\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\tNA\t$p_value\tNA\tNA\t", | 
|  | 1339                    "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n"); | 
|  | 1340               } | 
|  | 1341               else{ | 
|  | 1342                 record_snv("$target_parent\t", | 
|  | 1343                    ($non_coding ? "g.$cnv_start\_$cnv_end" : | 
|  | 1344                    "c.".($feature_ranges[$i+1]->[2]-1)."+".($cnv_start-$feature_ranges[$i]->[1])."_".$feature_ranges[$i+1]->[2]."-".($feature_ranges[$i+1]->[0]-$cnv_end)), | 
|  | 1345                    "[".($ploidy-1)."]\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\tNA\t$p_value\tNA\tNA\t", | 
|  | 1346                    "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n"); | 
|  | 1347               } | 
|  | 1348             } | 
|  | 1349             else{ # loss | 
|  | 1350               if($strand eq "-"){ | 
|  | 1351                 record_snv("$target_parent\t", | 
|  | 1352                    ($non_coding ? "g.$cnv_start\_$cnv_end" : | 
|  | 1353                    "c.".($feature_ranges[$i+1]->[2])."+".($feature_ranges[$i+1]->[0]-$cnv_end)."_".($feature_ranges[$i+1]->[2]+1)."-".($cnv_start-$feature_ranges[$i]->[1])), | 
|  | 1354                    "del\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\t", ($ploidy == 1 ? "heterozygote" : "homozygote"), "\t$p_value\tNA\tNA\t", | 
|  | 1355                    "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n"); | 
|  | 1356               } | 
|  | 1357               else{ | 
|  | 1358                 record_snv("$target_parent\t", | 
|  | 1359                    ($non_coding ? "g.$cnv_start\_$cnv_end" : | 
|  | 1360                    "c.".($feature_ranges[$i+1]->[2]-1)."+".($cnv_start-$feature_ranges[$i]->[1])."_".$feature_ranges[$i+1]->[2]."-".($feature_ranges[$i+1]->[0]-$cnv_end)), | 
|  | 1361                    "del\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\t", ($ploidy == 1 ? "heterozygote" : "homozygote"), "\t$p_value\tNA\tNA\t", | 
|  | 1362                    "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n"); | 
|  | 1363               } | 
|  | 1364             } | 
|  | 1365             $intron_found = 1; last; | 
|  | 1366           } | 
|  | 1367         } | 
|  | 1368         warn "Logic error: CNV overlaps a gene ($target_parent), but is neither intronic nor exonic. Offending CNV was $_\n" unless $intron_found; | 
|  | 1369         next; | 
|  | 1370       } | 
|  | 1371       if($strand eq "-"){ | 
|  | 1372         my $tmp = $cnv_cdna_start; | 
|  | 1373         $cnv_cdna_start = $cnv_cdna_end; | 
|  | 1374         $cnv_cdna_end = $tmp; | 
|  | 1375       } | 
|  | 1376       # Make the location label pretty descriptive | 
|  | 1377       my $cnv_phase = ""; | 
|  | 1378       if($cnv_exon_start > $cnv_start or $cnv_exon_end < $cnv_end){ | 
|  | 1379         $cnv_phase = "CNV-$cnv_chr:$cnv_start-$cnv_end"; # Use phase to note that it's part of a bigger CNV than just the range of this feature | 
|  | 1380       } | 
|  | 1381       # if we get here, we're in a gained/deleted exon category | 
|  | 1382       # CNVs are fuzzy-edged (as opposed to large indels), so produce HGVS syntax that reflect this | 
|  | 1383       if($ploidy > 2){ # gain | 
|  | 1384         # find the exons encompassed by the CNV. NOTE that we assume that polyploidies are proximal | 
|  | 1385         record_snv("$target_parent\t", | 
|  | 1386                    ($non_coding ? "g.".($cnv_exon_start > $cnv_start ? "$cnv_exon_start-?" : $cnv_start)."_".($cnv_exon_end < $cnv_end ? "$cnv_exon_end+?" : $cnv_end) : | 
|  | 1387                    "c.$cnv_cdna_start".($off5?"-?":"")."_$cnv_cdna_end".($off3?"+?":"")), | 
|  | 1388                    "[".($ploidy-1)."]\t$strand\t$cnv_chr\t$cnv_exon_start\t$cnv_exon_end\tNA\t$p_value\tNA\tNA\t", | 
|  | 1389                    "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t$cnv_phase\n"); | 
|  | 1390       } | 
|  | 1391       else{ # loss | 
|  | 1392         #translate genome coordinates into cDNA coordinates | 
|  | 1393         record_snv("$target_parent\t", | 
|  | 1394                    ($non_coding ? "g.".($cnv_exon_start > $cnv_start ? "$cnv_exon_start-?" : $cnv_start)."_".($cnv_exon_end < $cnv_end ? "$cnv_exon_end+?" : $cnv_end) : | 
|  | 1395                    "c.$cnv_cdna_start".($off5?"-?":"")."_$cnv_cdna_end".($off3?"+?":"")), | 
|  | 1396                    "del\t$strand\t$cnv_chr\t$cnv_exon_start\t$cnv_exon_end\t", ($ploidy == 1 ? "heterozygote" : "homozygote"), "\t$p_value\tNA\tNA\t", | 
|  | 1397                    "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t$cnv_phase\n"); | 
|  | 1398       } | 
|  | 1399     } | 
|  | 1400   } | 
|  | 1401   close(CNV); | 
|  | 1402 | 
|  | 1403 } | 
|  | 1404 | 
|  | 1405 | 
|  | 1406 #sort genes by start, then longest if tied | 
|  | 1407 my %rc = qw(A T T A G C C G N N S W W S K M M K Y R R Y X X); | 
|  | 1408 print STDERR "Processing variant calls..." unless $quiet; | 
|  | 1409 %chr_read = (); | 
|  | 1410 open(VCFIN, $input_file) | 
|  | 1411     or die "Cannot open $input_file for reading: $!\n"; | 
|  | 1412 while(<VCFIN>){ | 
|  | 1413     if(/^\s*(?:#|$)/){ # blank or hash comment lines | 
|  | 1414        next; | 
|  | 1415     } | 
|  | 1416     chomp; | 
|  | 1417     my @fields = split /\t/, $_; | 
|  | 1418 | 
|  | 1419     next unless exists $feature_range{$fields[0]}; | 
|  | 1420     if(not $quiet and not exists $chr_read{$fields[0]}){ | 
|  | 1421 	print STDERR " $fields[0]"; | 
|  | 1422 	$chr_read{$fields[0]} = 1; | 
|  | 1423 	#print STDERR "(not in reference file!)" unless exists $feature_range{$fields[0]}; | 
|  | 1424     } | 
|  | 1425 | 
|  | 1426     next if $fields[4] eq "<NON_REF>"; # GVCF background stuff | 
|  | 1427     next if $fields[9] eq "./." or $fields[9] eq "."; # no call | 
|  | 1428     my $chr = $fields[0]; | 
|  | 1429     next if defined $which_chr and $chr ne $which_chr and $chr ne "chr$which_chr" and "chr$chr" ne $which_chr; | 
|  | 1430     print STDERR "passes chr and field # test" if grep /dataset_7684.dat/, @ARGV; | 
|  | 1431     $chr = "chr$chr" if $chr !~ /^chr/; | 
|  | 1432     $fields[8] =~ s/\s+$//; | 
|  | 1433     my @values = split /:/, $fields[9]; | 
|  | 1434     #print STDERR join(" / ", @values), "\n" if $. == 84; | 
|  | 1435     my @keys = split /:/, $fields[8]; | 
|  | 1436     my $zygosity = "n/a"; | 
|  | 1437     my $quality = "n/a"; | 
|  | 1438     my $read_depth = "n/a"; | 
|  | 1439     my $variant_depths = "n/a"; | 
|  | 1440     for(my $i = 0; $i <= $#keys and $i <= $#values; $i++){ # values max index check because some genotypers are nasty and don't provide as many fields as they say they will | 
|  | 1441 	if($keys[$i] eq "GT"){ | 
|  | 1442             if($values[$i] =~ /\./ or $values[$i] =~ /0\/0/){ # one genotype not described | 
|  | 1443               $zygosity = "none"; | 
|  | 1444               last; | 
|  | 1445             } | 
|  | 1446 	    else{ # genotypes described | 
|  | 1447               $zygosity = $values[$i] =~ /[02]/ ? "heterozygote" : "homozygote"; | 
|  | 1448             } | 
|  | 1449 	} | 
|  | 1450         elsif($keys[$i] eq "DNM_CONFIG" and $zygosity eq "n/a"){ # hack | 
|  | 1451             $zygosity = $values[$i] =~ /^(.)\1/ ? "homozygote" : "heterozygote"; | 
|  | 1452         } | 
|  | 1453     	elsif($keys[$i] eq "GQ" and $values[$i] ne "."){ | 
|  | 1454             #print "Checking GQ (index $i) $values[$i] gq2p\n" if $. == 84; | 
|  | 1455 	    $quality = gq2p($values[$i]); | 
|  | 1456 	} | 
|  | 1457     	elsif($keys[$i] eq "RD"){ # from some tools like denovo variant finders | 
|  | 1458 	    $read_depth = $values[$i]; | 
|  | 1459 	} | 
|  | 1460     	elsif($keys[$i] eq "DP"){ | 
|  | 1461 	    $read_depth = $values[$i]; | 
|  | 1462 	} | 
|  | 1463         # the frequency of the variant can go by many names, here we have freebayes (A* are new and old versions) and atlas2_indel | 
|  | 1464     	elsif($keys[$i] eq "AA" or $keys[$i] eq "VR" or $keys[$i] eq "AO"){ | 
|  | 1465 	    $variant_depths = $values[$i]; | 
|  | 1466 	} | 
|  | 1467         # here we have GATK variant freq of form ref#,var# | 
|  | 1468     	elsif($keys[$i] eq "AD"){ | 
|  | 1469 	    $variant_depths = $values[$i]; | 
|  | 1470             $variant_depths =~ s/^\d+,//; | 
|  | 1471 	} | 
|  | 1472         else{ | 
|  | 1473             #print STDERR "Ignoring field $keys[$i]\n"; | 
|  | 1474         } | 
|  | 1475     } | 
|  | 1476     next if $zygosity eq "none"; # GVCF non-ref for example or when multiple samples are reported simultaneously | 
|  | 1477     $quality = z2p($1) if $fields[7] =~ /Z=(\d+\.\d+)/; | 
|  | 1478     if($quality eq "n/a" and $fields[5] ne "."){ | 
|  | 1479       $quality = gq2p($fields[5]); | 
|  | 1480     } | 
|  | 1481     if($fields[5] eq "0" and $fields[6] eq "PASS"){ # not qual and a PASS in the filter column | 
|  | 1482       $quality = 1; | 
|  | 1483     } | 
|  | 1484     elsif($quality ne "n/a" and $quality > $min_pvalue){ # p-value cutoff | 
|  | 1485       #print "Checking call quality $fields[5]  gq2p\n" if $. == 84; | 
|  | 1486       next unless gq2p($fields[5]) <= $min_pvalue; # in some cases, programs like FreeBayes give low genotype quality such as when a call is borderline homo/het | 
|  | 1487                                                    # in these cases it would be silly to reject the call if their are many supporting reads. | 
|  | 1488     } | 
|  | 1489 | 
|  | 1490     # Some tools like GATK don't report number of variant reads...infer from other data if possible | 
|  | 1491     if($variant_depths eq "n/a"){ | 
|  | 1492       my @key_value_pairs = split /;/, $fields[7]; | 
|  | 1493       for my $key_value_pair (@key_value_pairs){ | 
|  | 1494         if($key_value_pair !~ /^(.*?)=(.*)$/){ | 
|  | 1495           next; | 
|  | 1496           #next if $key_value_pair eq "INDEL"; # samtools peculiarity | 
|  | 1497           #die "Key-value pair field (column #8) does not have the format key=value for entry $key_value_pair (line #$. of ), please fix the VCF file\n"; | 
|  | 1498         } | 
|  | 1499         if($1 eq "AB"){ # GATK: for het calls, AB is ref/(ref+var), only one variant reported per line | 
|  | 1500           $variant_depths = ""; | 
|  | 1501           for my $ab (split /,/, $2){ | 
|  | 1502             $variant_depths .= int((1-$ab)*$read_depth).","; | 
|  | 1503           } | 
|  | 1504           chop $variant_depths; | 
|  | 1505         } | 
|  | 1506         elsif($1 eq "MLEAC"){ | 
|  | 1507         } | 
|  | 1508         elsif($1 eq "DP4"){ # samtools: high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases | 
|  | 1509           my @rds = split /,/, $2; | 
|  | 1510           $variant_depths = $rds[2]+$rds[3]; | 
|  | 1511           $read_depth = $rds[0]+$rds[1]+$variant_depths; | 
|  | 1512           if($fields[4] =~ /,/){ # samtools doesn't break down compound het calls into individual frequencies | 
|  | 1513             my $num_alt_genotypes = $fields[4] =~ tr/,/,/; | 
|  | 1514             $num_alt_genotypes++; | 
|  | 1515             my $even_prop = sprintf "%.2f", $variant_depths/$read_depth/$num_alt_genotypes; | 
|  | 1516             $variant_depths = join(",", ($even_prop) x $num_alt_genotypes); | 
|  | 1517             if(not exists $chr2caveats{"$chr:$fields[1]"}){ | 
|  | 1518               $chr2caveats{"$chr:$fields[1]"} = "compound het var freq n/a in orig VCF file, auto set to $even_prop"; | 
|  | 1519             } | 
|  | 1520             else{ | 
|  | 1521               $chr2caveats{"$chr:$fields[1]"} .= "; compound het var freq n/a in orig VCF file, auto set to $even_prop"; | 
|  | 1522             } | 
|  | 1523           } | 
|  | 1524         } | 
|  | 1525       } | 
|  | 1526     } | 
|  | 1527     if($variant_depths eq "n/a"){ # usually homo var calls, can only assume all reads are variant | 
|  | 1528       if($zygosity eq "homozygote"){ | 
|  | 1529         $variant_depths = $read_depth; | 
|  | 1530         if($read_depth ne "n/a"){ | 
|  | 1531           if(not exists $chr2caveats{"$chr:$fields[1]"}){ | 
|  | 1532             $chr2caveats{"$chr:$fields[1]"} = "homo var freq n/a in orig VCF file, auto set to 1.0"; | 
|  | 1533           } | 
|  | 1534           else{ | 
|  | 1535             $chr2caveats{"$chr:$fields[1]"} = "; homo var freq n/a in orig VCF file, auto set to 1.0"; | 
|  | 1536           } | 
|  | 1537         } | 
|  | 1538       } | 
|  | 1539       else{ | 
|  | 1540         if($read_depth ne "n/a"){ | 
|  | 1541           $variant_depths = int($read_depth/2); | 
|  | 1542           if(not exists $chr2caveats{"$chr:$fields[1]"}){ | 
|  | 1543             $chr2caveats{"$chr:$fields[1]"} = "het var freq n/a in orig VCF file, auto set to 0.5"; | 
|  | 1544           } | 
|  | 1545           else{ | 
|  | 1546             $chr2caveats{"$chr:$fields[1]"} = "; het var freq n/a in orig VCF file, auto set to 0.5"; | 
|  | 1547           } | 
|  | 1548         } | 
|  | 1549         else{ | 
|  | 1550           $variant_depths = $read_depth; | 
|  | 1551         } | 
|  | 1552       } | 
|  | 1553     } | 
|  | 1554 | 
|  | 1555     my $target_parents = $feature_intervaltree{$chr}->fetch($fields[1]-$flanking_bases, $fields[1]+length($fields[3])+$flanking_bases); | 
|  | 1556     # Last ditch, if not in a gene model, is it at least in an enrichment region? | 
|  | 1557     if(not @$target_parents){ | 
|  | 1558        next if not exists $enrichment_regions{$chr}; | 
|  | 1559        my $regions_ref = $enrichment_regions{$chr}; | 
|  | 1560        my $location = $fields[1]; | 
|  | 1561        my $strand = "+"; # for lack of a better choice | 
|  | 1562        for(my $i = find_earliest_index($location-$flanking_bases, $regions_ref); | 
|  | 1563            $i < $#$regions_ref and $location <= $regions_ref->[$i]->[1]+$flanking_bases; | 
|  | 1564            $i++){ | 
|  | 1565           next unless $regions_ref->[$i]->[0]-$flanking_bases <= $location and $regions_ref->[$i]->[1]+$flanking_bases >= $location; | 
|  | 1566 | 
|  | 1567           my $feature_name = "enrichment_target_$chr:".$regions_ref->[$i]->[0]."-".$regions_ref->[$i]->[1]; | 
|  | 1568           $feature_type{$feature_name} = "misc_enrichment_kit_target"; | 
|  | 1569           $feature_length{$feature_name} = $regions_ref->[$i]->[1]-$regions_ref->[$i]->[0]+1; | 
|  | 1570           my @variants = split /,/, $fields[4]; | 
|  | 1571           my @variant_depths = split /,/, $variant_depths; | 
|  | 1572           my $ref = uc($fields[3]); | 
|  | 1573           for(my $j = 0; $j <= $#variants; $j++){ | 
|  | 1574             my $variant = $variants[$j]; | 
|  | 1575             next if $variant eq "<NON_REF>" or $variant_depths[$j] eq "0"; # GVCF stuff | 
|  | 1576             my $variant_depth = $variant_depths[$j]; | 
|  | 1577             if($min_prop){ | 
|  | 1578                next unless $variant_depth >= $min_depth and $read_depth ne "n/a" and $variant_depth/$read_depth >= $min_prop; | 
|  | 1579             } | 
|  | 1580             if(length($ref) == 1 and length($variant) == 1){ # SNP | 
|  | 1581               record_snv("$feature_name\tg.$location", | 
|  | 1582                          "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1583                          join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n"); | 
|  | 1584             } | 
|  | 1585             elsif(length($ref) == 1 and length($variant) > 1){ # insertion | 
|  | 1586               record_snv("$feature_name\tg.$location\_",($location+1), | 
|  | 1587                          "ins",substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1588                          join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n"); | 
|  | 1589             } | 
|  | 1590             elsif(length($variant) == 1 and length($ref) > 1){ # deletion | 
|  | 1591               record_snv("$feature_name\tg.$location\_",($location+length($ref)-1), | 
|  | 1592                          "del",substr($ref, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1593                          join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n"); | 
|  | 1594             } | 
|  | 1595             else{ # indel | 
|  | 1596               record_snv("$feature_name\tg.$location\_",($location+length($ref)-1), | 
|  | 1597                          "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1598                          join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n"); | 
|  | 1599             } | 
|  | 1600           } # end for variants | 
|  | 1601           next; # process next record, we've done all we can with a non-coding-gene SNP | 
|  | 1602        } | 
|  | 1603     } | 
|  | 1604 | 
|  | 1605     for my $target_parent (@$target_parents){ | 
|  | 1606 | 
|  | 1607         print STDERR "Checking parent $target_parent for on $chr:$fields[1] $fields[3] -> $fields[4]\n" if grep /dataset_7684.dat/, @ARGV; | 
|  | 1608     my @feature_ranges = @{$feature_range{$chr}->{$target_parent}}; | 
|  | 1609     # Calculate the position of the change within the feature range position | 
|  | 1610     my $strand = $feature_strand{$target_parent}; | 
|  | 1611     my $trans_table = exists $feature_transl_table{$target_parent} ? $feature_transl_table{$target_parent} : $default_transl_table; | 
|  | 1612     $fields[4]=~tr/"//d; # sometime strangely surroundsed by quotes | 
|  | 1613     my @variants = split /,/, $fields[4]; | 
|  | 1614     my @variant_depths = split /,/, $variant_depths; | 
|  | 1615     my @refs = (uc($fields[3])) x scalar(@variants); | 
|  | 1616     my @locations = ($fields[1]) x scalar(@variants); | 
|  | 1617 | 
|  | 1618     for(my $j = 0; $j <= $#variants; $j++){ | 
|  | 1619 	my $ref = $refs[$j]; | 
|  | 1620 	my $location = $locations[$j]; | 
|  | 1621 	my $feature_offset = 0; | 
|  | 1622 	my $feature_num = 0; | 
|  | 1623 	my $variant = uc($variants[$j]); | 
|  | 1624         next if $variant eq "<NON_REF>" or $variant_depths[$j] eq "0"; # GVCF stuff | 
|  | 1625 	my $variant_depth = $variant_depths[$j] || "n/a"; | 
|  | 1626         #print STDERR "Evaluating target parent $target_parent for $chr:$fields[1]-".($fields[1]+length($fields[3]))." -> ",join("/", @$target_parents) , "\n" if $fields[1] == 982994; | 
|  | 1627 | 
|  | 1628         # Break down MNPs into individual SNPs that are phased (TODO: skip if it's an inversion? would require amalgamating SNPs for tools that call them individually, phased :-P) | 
|  | 1629         if(length($variant) > 1 and length($variant) == length($ref)){ | 
|  | 1630            my @subvariants; | 
|  | 1631            my @subrefs; | 
|  | 1632            my @sublocations; | 
|  | 1633            my $phase_range = $location."-".($location+length($ref)-1); | 
|  | 1634            for(my $k = 0; $k < length($variant); $k++){ | 
|  | 1635              my $r = substr($ref, $k, 1); | 
|  | 1636              my $v = substr($variant, $k, 1); | 
|  | 1637              if($r ne $v){ | 
|  | 1638                push @subvariants, $v; | 
|  | 1639                push @subrefs, $r; | 
|  | 1640                push @sublocations, $location+$k; | 
|  | 1641              } | 
|  | 1642              elsif(@variants == 1){ | 
|  | 1643                next; # homo ref call | 
|  | 1644              } | 
|  | 1645              if($zygosity eq "heterozygote"){ | 
|  | 1646                # need to ignore case where a homozygous call (variant or ref) is included in a double non-ref het MNP | 
|  | 1647                if(@variants > 1){ | 
|  | 1648                  my $homo = 1; | 
|  | 1649                  for(my $l = 0; $l <= $#variants; $l++){ # using loop in case we handle oligoploid genomes in the future | 
|  | 1650                    if(length($variants[$l]) <= $k or substr($variants[$l], $k, 1) ne $v){ | 
|  | 1651                      $homo = 0; | 
|  | 1652                      last; | 
|  | 1653                    } | 
|  | 1654                  } | 
|  | 1655                  next if $homo; | 
|  | 1656                } | 
|  | 1657                my $phase_key = "$chr:".($location+$k).":$v"; | 
|  | 1658                my $phase_label = "M$j-$chr:$phase_range"; | 
|  | 1659                if(exists $chr2phase{$phase_key}){ | 
|  | 1660                  if($chr2phase{$phase_key} !~ /$phase_label/){ | 
|  | 1661                    $chr2phase{$phase_key} .= ",$phase_label"; | 
|  | 1662                  } | 
|  | 1663                } | 
|  | 1664                else{ | 
|  | 1665                  $chr2phase{$phase_key} = $phase_label; | 
|  | 1666                } | 
|  | 1667              } | 
|  | 1668            } | 
|  | 1669            # recycle this MNP variant loop to start processing the individual SNPs | 
|  | 1670            splice(@refs, $j, 1, @subrefs); | 
|  | 1671            splice(@variants, $j, 1, @subvariants); | 
|  | 1672            splice(@locations, $j, 1, @sublocations); | 
|  | 1673            splice(@variant_depths, $j, 1, ($variant_depth) x scalar(@subvariants)); | 
|  | 1674            $j--; | 
|  | 1675            next; | 
|  | 1676         } | 
|  | 1677 | 
|  | 1678         if($min_prop != 0 and $variant_depth eq "n/a" or $variant_depth eq "."){ | 
|  | 1679           print STDERR "Could not parse variant depth from $_\n" unless $quiet; | 
|  | 1680           next; | 
|  | 1681         } | 
|  | 1682         next unless $min_prop == 0 or $min_prop and $variant_depth >= $min_depth and $read_depth ne "n/a" and $variant_depth/$read_depth >= $min_prop; | 
|  | 1683         if($zygosity eq "NA"){ | 
|  | 1684           # make the call ourselves | 
|  | 1685           $zygosity = $variant_depths/$read_depth > 1-$min_prop ? "homozygote" : "heterozygote"; | 
|  | 1686         } | 
|  | 1687 	# e.g. chr22   47857671   . CAAAG   AAGAT,AAAAG    29.04  . | 
|  | 1688 	if(length($variant) > 1 and length($variant) == length($ref)){ | 
|  | 1689 	    for(my $k = 0; $k < length($variant); $k++){ | 
|  | 1690 		my $fixed_variant = $variant; | 
|  | 1691 		substr($fixed_variant, $k, 1) = substr($ref, $k, 1); | 
|  | 1692 		if($fixed_variant eq $ref){ # single base difference at base k between the two | 
|  | 1693 		    $ref = substr($ref, $k, 1); | 
|  | 1694 		    $variant = substr($variant, $k, 1); | 
|  | 1695 		    $location += $k; | 
|  | 1696 		    last; | 
|  | 1697 		} | 
|  | 1698 	    } | 
|  | 1699 	} | 
|  | 1700 | 
|  | 1701         # samtools reports indels with long common tails, causing non-canonical HGVS reporting and a problem when looking up the variant in dbSNP etc. | 
|  | 1702         # remove common tails to variant calls in order to try to rectify this | 
|  | 1703         else{ | 
|  | 1704             while(length($ref) > 1 and length($variant) > 1 and substr($ref, length($ref)-1) eq substr($variant, length($variant)-1)){ | 
|  | 1705                 chop $ref; chop $variant; | 
|  | 1706             } | 
|  | 1707         } | 
|  | 1708 | 
|  | 1709         # See if a caveat should be added because the indel was in a polyhomomer region | 
|  | 1710         if(length($ref) > length($variant) and index($ref, $variant) == 0 and $fasta_index->fetch("$chr:".($location+1)."-".($location+length($ref)+1)) =~ /^([ACGT])\1+$/i){ | 
|  | 1711           if(not exists $chr2caveats{"$chr:$location"}){ | 
|  | 1712             $chr2caveats{"$chr:$location"} = "poly".uc($1)." region deletion"; | 
|  | 1713           } | 
|  | 1714           elsif($chr2caveats{"$chr:$location"} !~ /poly/){ | 
|  | 1715             $chr2caveats{"$chr:$location"} .= "; poly".uc($1)." region deletion"; | 
|  | 1716           } | 
|  | 1717         } | 
|  | 1718         elsif(length($ref) < length($variant) and index($variant, $ref) == 0 and substr($variant, 1) =~ /^([ACGT])\1+$/i){ | 
|  | 1719           if(not exists $chr2caveats{"$chr:$location"}){ | 
|  | 1720             $chr2caveats{"$chr:$location"} .= "poly".uc($1)." region insertion"; | 
|  | 1721           } | 
|  | 1722           elsif($chr2caveats{"$chr:$location"} !~ /poly/){ | 
|  | 1723             $chr2caveats{"$chr:$location"} .= "; poly".uc($1)." region insertion"; | 
|  | 1724           } | 
|  | 1725         } | 
|  | 1726 | 
|  | 1727         # Not a protein-coding gene?  Report genomic cooordinates for HGVS | 
|  | 1728         if(not exists $feature_cds_max{$target_parent} or not defined $feature_cds_max{$target_parent} or $feature_cds_max{$target_parent} eq ""){ | 
|  | 1729             if(length($ref) == 1 and length($variant) == 1){ # SNP | 
|  | 1730               record_snv("$target_parent\tg.$location", | 
|  | 1731                          "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1732                          join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n"); | 
|  | 1733             } | 
|  | 1734             elsif(length($ref) == 1 and length($variant) > 1){ # insertion | 
|  | 1735               record_snv("$target_parent\tg.$location\_",($location+1), | 
|  | 1736                          "ins",substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1737                          join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n"); | 
|  | 1738             } | 
|  | 1739             elsif(length($variant) == 1 and length($ref) > 1){ # deletion | 
|  | 1740               record_snv("$target_parent\tg.$location\_",($location+length($ref)-1), | 
|  | 1741                          "del",substr($ref, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1742                          join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n"); | 
|  | 1743             } | 
|  | 1744             else{ # indel | 
|  | 1745               record_snv("$target_parent\tg.$location\_",($location+length($ref)-1), | 
|  | 1746                          "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1747                          join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n"); | 
|  | 1748             } | 
|  | 1749             next; # process next record, we've done all we can with a non-coding-gene SNP | 
|  | 1750         } | 
|  | 1751 | 
|  | 1752 	if($strand eq "-"){ | 
|  | 1753 	    # set up utr offset for correct CDS coordinates | 
|  | 1754 	    for(my $i = $#feature_ranges; $i >= 0; $i--){ | 
|  | 1755 		# exon is completely 5' of the start | 
|  | 1756 		if($feature_ranges[$i]->[0] > $feature_cds_max{$target_parent}){ | 
|  | 1757 		    #print STDERR "Whole 5' UTR exon $i: ",$feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1,"\n"; | 
|  | 1758 		    $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1; | 
|  | 1759 		} | 
|  | 1760 		# exon with the cds start | 
|  | 1761 		elsif($feature_ranges[$i]->[1] >= $feature_cds_max{$target_parent} and | 
|  | 1762 		      $feature_ranges[$i]->[0] <= $feature_cds_max{$target_parent}){ | 
|  | 1763 		    #print STDERR "Start codon in exon $i: ", $feature_cds_max{$target_parent} - $feature_ranges[$i]->[1],"\n"; | 
|  | 1764 		    $feature_offset += $feature_cds_max{$target_parent} - $feature_ranges[$i]->[1]; | 
|  | 1765 		    last; | 
|  | 1766 		} | 
|  | 1767 		else{ | 
|  | 1768 		    die "The CDS for $target_parent (on negative strand) ends downstream ", | 
|  | 1769 		    "($feature_cds_max{$target_parent}) of the an exon", | 
|  | 1770 		    " (", $feature_ranges[$i]->[0], "), which is illogical.  Please revise the GFF file provided.\n"; | 
|  | 1771 		} | 
|  | 1772 	    } | 
|  | 1773 	    for(my $i = $#feature_ranges; $i >= 0; $i--){ | 
|  | 1774 		my $feature = $feature_ranges[$i]; | 
|  | 1775 		# in the 3' UTR region of the gene | 
|  | 1776 		if($location < $feature_cds_min{$target_parent}){ | 
|  | 1777 		    my $feature_exon = 0; | 
|  | 1778 		    $feature = $feature_ranges[$feature_exon]; | 
|  | 1779 		    while($location > $feature->[1]+$flanking_bases and | 
|  | 1780 			  $feature_exon < $#feature_ranges){ | 
|  | 1781 			$feature = $feature_ranges[++$feature_exon]; # find the 3' utr exon in which the variant is located | 
|  | 1782 		    } | 
|  | 1783 		    # utr exons passed entirely | 
|  | 1784 		    my $stop_exon = $feature_exon; | 
|  | 1785 		    while($feature_ranges[$stop_exon]->[1] < $feature_cds_min{$target_parent}){ | 
|  | 1786 			$stop_exon++; | 
|  | 1787 		    } | 
|  | 1788 		    my $post_offset = $feature_cds_min{$target_parent}-$feature_ranges[$stop_exon]->[0]; # offset from the stop codon in the final coding exon | 
|  | 1789 		    while($stop_exon > $feature_exon){ | 
|  | 1790 			$post_offset += $feature_ranges[$stop_exon]->[1]-$feature_ranges[$stop_exon]->[0]+1; | 
|  | 1791 			$stop_exon--; | 
|  | 1792 		    } | 
|  | 1793 | 
|  | 1794 		    my $pos = $feature->[1]-$location+1+$post_offset; | 
|  | 1795                     my $junction_dist; | 
|  | 1796 		    if($location < $feature->[0]){ # after a UTR containing exon? set as .*DD+DD | 
|  | 1797                         $junction_dist = ($feature->[0]-$location); | 
|  | 1798 			$pos = ($post_offset+$feature->[1]-$feature->[0]+1)."+".$junction_dist; | 
|  | 1799 		    } | 
|  | 1800 		    elsif($location > $feature->[1]){ # before a total UTR exon? set as .*DD-DD | 
|  | 1801                         $junction_dist = -($location-$feature->[1]); | 
|  | 1802 			$pos = ($post_offset+1).$junction_dist; | 
|  | 1803 		    } | 
|  | 1804                     else{ # in the UTR exon | 
|  | 1805                         if($location - $feature->[0] < $feature->[1] - $location){ # location is closer to exon donor site | 
|  | 1806                             $junction_dist = -($location - $feature->[0]+1); # +1 for HGVS syntax compatibility (there is no position 0, direct skip from -1 to +1) | 
|  | 1807                         } | 
|  | 1808                         else{ | 
|  | 1809                             $junction_dist = $feature->[1] - $location +1; | 
|  | 1810                         } | 
|  | 1811                     } | 
|  | 1812 		    if(length($ref) == 1 and length($variant) == 1){ | 
|  | 1813                         my $rc = join("",map({$rc{$_}} split(//,reverse($variant)))); | 
|  | 1814 			# 3' UTR SNP | 
|  | 1815 			record_snv("$target_parent\tc.*$pos", | 
|  | 1816 			"$rc{$ref}>$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1817 			#"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1818 			join("\t",prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n"); | 
|  | 1819 		    } | 
|  | 1820 		    elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){ | 
|  | 1821                         my $rc = join("",map({$rc{$_}} split(//,reverse(substr($variant,1))))); | 
|  | 1822 			# 3' UTR insertion | 
|  | 1823 			record_snv("$target_parent\tc.*", | 
|  | 1824 			hgvs_plus($pos,-1),"_*",$pos, | 
|  | 1825 			"ins$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1826 			#"ins",substr($variant,1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1827 			join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n"); | 
|  | 1828 		    } | 
|  | 1829 		    elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){ | 
|  | 1830                         my $rc = join("",map({$rc{$_}} split(//,reverse($ref)))); | 
|  | 1831                         my $delBases = substr($rc,0,length($rc)-1); | 
|  | 1832 			if(length($ref) == 2){ | 
|  | 1833 			    # 3' UTR single base deletion | 
|  | 1834 			    record_snv("$target_parent\tc.*",hgvs_plus($pos,-1), | 
|  | 1835 			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1836 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n"); | 
|  | 1837 			} | 
|  | 1838 			else{ | 
|  | 1839 			    # longer 3' UTR deletion | 
|  | 1840 			    record_snv("$target_parent\tc.*", | 
|  | 1841 			    hgvs_plus($pos,-length($ref)+1),"_*",hgvs_plus($pos, -1), | 
|  | 1842 			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1843 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n"); | 
|  | 1844 			} | 
|  | 1845 		    } | 
|  | 1846 		    else{ | 
|  | 1847 			my $rc = join("",map({$rc{$_}} split(//,reverse($variant)))); | 
|  | 1848 			if($rc eq $ref and length($variant) > 3){ | 
|  | 1849 			    # 3' UTR inversion | 
|  | 1850 			    record_snv("$target_parent\tc.*", | 
|  | 1851 			    hgvs_plus($pos,-length($ref)+1),"_*",$pos, | 
|  | 1852 			    "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1853 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n"); | 
|  | 1854 			    last; | 
|  | 1855 			} | 
|  | 1856 | 
|  | 1857 			# complex substitution in 3' UTR | 
|  | 1858 			# Will break if stop codon is modified | 
|  | 1859 			record_snv("$target_parent\tc.*", | 
|  | 1860 			hgvs_plus($pos,-length($ref)+1),"_*", $pos, | 
|  | 1861 			"delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1862 			join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n"); | 
|  | 1863 		    } | 
|  | 1864 		    last; | 
|  | 1865 		} | 
|  | 1866 		# in the feature | 
|  | 1867 		elsif($location >= $feature->[0] and $location <= $feature->[1]){ | 
|  | 1868 		    my $pos = $feature_offset+$feature->[1]-$location+1; | 
|  | 1869 		    if($location > $feature_cds_max{$target_parent}){ #since there is no position 0, the pos is in UTR, subtract one | 
|  | 1870 			$pos = hgvs_plus($pos, -1); | 
|  | 1871 		    } | 
|  | 1872 		    my $first_exon_base = $feature_offset+1; | 
|  | 1873                     my $exon_edge_dist = $feature->[1]-$location+1; # equiv of HGVS +X or -X intron syntax, but for exons | 
|  | 1874                     $exon_edge_dist = $feature->[0]-$location-1 if abs($feature->[0]-$location-1) < $exon_edge_dist; # pick closer of donor and acceptor sites | 
|  | 1875                     #print STDERR "Got ", $feature->[1]-$location+1, "vs. ", $feature->[0]-$location-1, ": chose $exon_edge_dist\n"; | 
|  | 1876 		    if(length($ref) == 1 and length($variant) == 1){ | 
|  | 1877 			# SNP | 
|  | 1878 			record_snv("$target_parent\tc.", | 
|  | 1879 			$pos, | 
|  | 1880 			"$rc{$ref}>$rc{$variant}\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1881 			join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", | 
|  | 1882                         ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | 
|  | 1883                         #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | 
|  | 1884 		    } | 
|  | 1885 		    elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){ | 
|  | 1886 			my $rc = join("",map({$rc{$_}} split(//,reverse($variant)))); | 
|  | 1887                         my $insBases = substr($rc,1); | 
|  | 1888 			# insertion | 
|  | 1889 			record_snv("$target_parent\tc.", | 
|  | 1890 			hgvs_plus_exon($pos, -1, $first_exon_base),"_",$pos,"ins$insBases", | 
|  | 1891 			"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1892 			join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", | 
|  | 1893                         ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | 
|  | 1894                         #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | 
|  | 1895 		    } | 
|  | 1896 		    elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){ | 
|  | 1897 			my $rc = join("",map({$rc{$_}} split(//,reverse($ref)))); | 
|  | 1898                         my $delBases = substr($rc,0,length($rc)-1); | 
|  | 1899 			# single nucleotide deletion | 
|  | 1900 			if(length($ref) == 2){ | 
|  | 1901 			    record_snv("$target_parent\tc.", | 
|  | 1902 			    hgvs_plus_exon($pos, -1, $first_exon_base), | 
|  | 1903 			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1904 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", | 
|  | 1905                             ($pos-1 < 1 ? "NA" : $pos-1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | 
|  | 1906                             #($pos-1 < 1 ? "NA" : $pos-1 < $first_exon_base ? "p.?" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | 
|  | 1907 			} | 
|  | 1908 			# longer deletion | 
|  | 1909 			else{ | 
|  | 1910                             $exon_edge_dist = $feature->[1]-$location-length($ref)+1 if abs($feature->[1]-$location-length($ref)+1) < $exon_edge_dist; | 
|  | 1911 			    record_snv("$target_parent\tc.", | 
|  | 1912 			    hgvs_plus_exon($pos, -length($ref)+1, $first_exon_base),"_", | 
|  | 1913 			    hgvs_plus_exon($pos, -1, $first_exon_base), | 
|  | 1914 			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1915 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", | 
|  | 1916                             ($pos-1 < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | 
|  | 1917                             #($pos-1 < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | 
|  | 1918 			} | 
|  | 1919 		    } | 
|  | 1920 		    else{ | 
|  | 1921 			# complex substitution | 
|  | 1922                         $exon_edge_dist = $feature->[1]-$location-length($ref)+1 if abs($feature->[1]-$location-length($ref)+1) < $exon_edge_dist; | 
|  | 1923 			my $rc = join("",map({$rc{$_}} split(//,reverse($variant)))); | 
|  | 1924                         if($rc eq $variant and length($variant) > 3){ | 
|  | 1925                             # inversion | 
|  | 1926                             record_snv("$target_parent\tc.", | 
|  | 1927                                        hgvs_plus_exon($pos,-length($ref)+1,$first_exon_base),"_", | 
|  | 1928                                        $pos, | 
|  | 1929                                        "inv", | 
|  | 1930                                        "\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1931                                        join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", | 
|  | 1932                                        ($pos < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | 
|  | 1933 | 
|  | 1934                             last; | 
|  | 1935                         } | 
|  | 1936 			record_snv("$target_parent\tc.", | 
|  | 1937 			hgvs_plus_exon($pos,-length($ref)+1,$first_exon_base),"_", | 
|  | 1938 			$pos, | 
|  | 1939 			"delins$rc", | 
|  | 1940 			"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1941 			join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", | 
|  | 1942                         ($pos < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | 
|  | 1943                         #($pos < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | 
|  | 1944 		    } | 
|  | 1945 		    last; | 
|  | 1946 		} | 
|  | 1947 		# 5' of feature (on negative strand) | 
|  | 1948 		elsif($location > $feature->[1]){ | 
|  | 1949 		    if(length($ref) == 1 and length($variant) == 1){ | 
|  | 1950 			# intronic SNP | 
|  | 1951 			if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){ | 
|  | 1952 			    # Closer to acceptor site | 
|  | 1953 			    record_snv("$target_parent\tc.",$feature_offset+1, | 
|  | 1954 			    ($feature->[1]-$location), | 
|  | 1955 			    "$rc{$ref}>$rc{$variant}\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1956 			    #"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1957 			    join("\t", prop_info_key($chr,$location,$ref,$variant, $feature->[1]-$location)),"\tNA\n"); | 
|  | 1958 			} | 
|  | 1959 			else{ | 
|  | 1960 			    # Closer to donor site | 
|  | 1961 			    record_snv("$target_parent\tc.",$feature_offset,"+", | 
|  | 1962 			    ($feature_ranges[$i+1]->[0]-$location), | 
|  | 1963 			    "$rc{$ref}>$rc{$variant}\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1964 			    #"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1965 			    join("\t", prop_info_key($chr,$location,$ref,$variant, $feature_ranges[$i+1]->[0]-$location)),"\tNA\n"); | 
|  | 1966 			} | 
|  | 1967 		    } | 
|  | 1968 		    elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){ | 
|  | 1969                         my $rc = join("",map({$rc{$_}} split(//,reverse(substr($variant,1))))); | 
|  | 1970 			if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){ | 
|  | 1971 			    # intronic insertion near acceptor | 
|  | 1972 			    my $pos = ($feature_offset+1).($feature->[1]-$location); | 
|  | 1973 			    record_snv("$target_parent\tc.", | 
|  | 1974 			    hgvs_plus($pos,-1),"_",$pos, | 
|  | 1975 			    "ins", | 
|  | 1976 			    $rc,"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1977 			    #substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1978 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$feature->[1]-$location-1)),"\tNA\n"); | 
|  | 1979 			} | 
|  | 1980 			else{ | 
|  | 1981 			    # intronic insertion near donor | 
|  | 1982 			    my $pos = $feature_offset."+".($feature_ranges[$i+1]->[0]-$location); | 
|  | 1983 			    record_snv("$target_parent\tc.", | 
|  | 1984 			    hgvs_plus($pos,-1),"_",$pos, | 
|  | 1985 			    "ins", | 
|  | 1986 			    $rc,"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1987 			    #substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 1988 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$feature_ranges[$i+1]->[0]-$location+1)),"\tNA\n"); | 
|  | 1989 			} | 
|  | 1990 		    } | 
|  | 1991 		    elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){ | 
|  | 1992 			# intronic deletion | 
|  | 1993 			# single nucleotide deletion | 
|  | 1994 			my $rc = reverse($ref); | 
|  | 1995 			$rc=~tr/ACGT/TGCA/; | 
|  | 1996                         my $delBases = substr($rc, 0, length($rc)-1); | 
|  | 1997 			if(length($ref) == 2){ | 
|  | 1998 			    # single intronic deletion near acceptor | 
|  | 1999 			    if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){ | 
|  | 2000                                 my $off = $feature->[1]-$location-1; | 
|  | 2001 				record_snv("$target_parent\tc.", | 
|  | 2002 				($feature_offset+1),$off, | 
|  | 2003 				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2004 				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off >= -2 ? "p.?" : "NA"),"\n"); | 
|  | 2005 			    } | 
|  | 2006 			    # single intronic deletion near donor | 
|  | 2007 			    else{ | 
|  | 2008 				my $pos = $feature_offset; | 
|  | 2009                                 my $off = $feature_ranges[$i+1]->[0]-$location+1; | 
|  | 2010 				record_snv("$target_parent\tc.", | 
|  | 2011 				hgvs_plus_exon($pos, $off, $pos), | 
|  | 2012 				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2013 				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off <= 2 ? "p.?" : "NA"),"\n"); | 
|  | 2014 			    } | 
|  | 2015 			} | 
|  | 2016 			# longer deletion | 
|  | 2017 			else{ | 
|  | 2018 			    if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){ | 
|  | 2019 				# long intronic deletion near acceptor | 
|  | 2020                                 my $off = $feature->[1]-$location-1; | 
|  | 2021 				my $pos = ($feature_offset+1).$off; | 
|  | 2022 				record_snv("$target_parent\tc.", | 
|  | 2023 				hgvs_plus($pos,-length($ref)+2),"_",$pos, | 
|  | 2024 				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2025 				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off >= -2 ? "p.?" : "NA"),"\n"); | 
|  | 2026 			    } | 
|  | 2027 			    else{ | 
|  | 2028 				# long intronic deletion near donor | 
|  | 2029                                 my $off = $feature_ranges[$i+1]->[0]-$location+1; | 
|  | 2030 				my $pos = ($feature_offset)."+".$off; | 
|  | 2031 				record_snv("$target_parent\tc.", | 
|  | 2032 				$pos,"_",hgvs_plus($pos,-length($ref)-1), | 
|  | 2033 				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2034 				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off-length($ref)+1 <= 2 ? "p.?" : "NA"),"\n"); | 
|  | 2035 			    } | 
|  | 2036 			    last; | 
|  | 2037 			} | 
|  | 2038 		    } | 
|  | 2039 		    else{ | 
|  | 2040 			my $rc = reverse($ref); | 
|  | 2041 			$rc=~tr/ACGT/TGCA/; | 
|  | 2042 			if($rc eq $variant and length($variant) > 3){ | 
|  | 2043 			    # intronic inversion near acceptor site | 
|  | 2044 			    if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){ | 
|  | 2045 				my $pos = ($feature_offset+1).($feature->[1]-$location); | 
|  | 2046 				record_snv("$target_parent\tc.", | 
|  | 2047 				hgvs_plus($pos,-length($ref)+1),"_",$pos, | 
|  | 2048 				"inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2049 				join("\t", prop_info_key($chr,$location,$ref,$variant,$feature->[1]-$location)),"\tNA\n"); | 
|  | 2050 			    } | 
|  | 2051 			    else{ | 
|  | 2052 				my $pos = ($feature_offset)."+".($feature_ranges[$i+1]->[0]-$location); | 
|  | 2053 				record_snv("$target_parent\tc.", | 
|  | 2054 				$pos,"_",hgvs_plus($pos, length($ref)-1), | 
|  | 2055 				"inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2056 				join("\t", prop_info_key($chr,$location,$ref,$variant,$feature_ranges[$i+1]->[0]-$location)),"\tNA\n"); | 
|  | 2057 			    } | 
|  | 2058 			    last; | 
|  | 2059 			} | 
|  | 2060                         $rc = reverse($variant); | 
|  | 2061                         $rc=~tr/ACGT/TGCA/; | 
|  | 2062 			# Intronic complex substitution | 
|  | 2063 			if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){ | 
|  | 2064 			    # complex intronic substitution near acceptor site | 
|  | 2065 			    my $pos = ($feature_offset+1).($feature->[1]-$location); | 
|  | 2066 			    record_snv("$target_parent\tc.", | 
|  | 2067 			    hgvs_plus($pos, -length($ref)+1),"_",$pos, | 
|  | 2068 			    "delins$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2069 			    #"delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2070 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$feature->[1]-$location)),"\tNA\n"); | 
|  | 2071 			} | 
|  | 2072 			else{ | 
|  | 2073 			    # complex intronic substitution near donor site | 
|  | 2074 			    my $pos = $feature_offset."+".($feature_ranges[$i+1]->[0]-$location); | 
|  | 2075 			    record_snv("$target_parent\tc.", | 
|  | 2076 			    $pos,"_",hgvs_plus($pos, length($ref)-1), | 
|  | 2077 			    "delins$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2078 			    #"delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2079 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$feature_ranges[$i+1]->[0]-$location)),"\tNA\n"); | 
|  | 2080 			} | 
|  | 2081 		    } | 
|  | 2082 		    last; | 
|  | 2083 		} | 
|  | 2084 		else{ | 
|  | 2085 		    #print STDERR "Offset going from ", $feature_offset, " to ", $feature_offset+$feature->[1]-$feature->[0]+1,"\n"; | 
|  | 2086 		    $feature_offset += $feature->[1]-$feature->[0]+1; | 
|  | 2087 		    #print STDERR "Set feature offset to  $feature_offset by adding ",$feature->[1],"-", $feature->[0],"+1\n"; | 
|  | 2088 		} | 
|  | 2089 	    } | 
|  | 2090 	} | 
|  | 2091 	else{ | 
|  | 2092 	    # forward strand | 
|  | 2093 | 
|  | 2094 	    # set up utr offset for correct CDS coordinates | 
|  | 2095 	    for(my $i = 0; $i <= $#feature_ranges; $i++){ | 
|  | 2096 		# All 5' utr exon | 
|  | 2097 		if($feature_ranges[$i]->[1] < $feature_cds_min{$target_parent}){ | 
|  | 2098 		    $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1; | 
|  | 2099 		} | 
|  | 2100 		# exon with the cds start | 
|  | 2101 		elsif($feature_ranges[$i]->[1] >= $feature_cds_min{$target_parent} and | 
|  | 2102 		      $feature_ranges[$i]->[0] <= $feature_cds_min{$target_parent}){ | 
|  | 2103 		    $feature_offset -= $feature_cds_min{$target_parent} - $feature_ranges[$i]->[0]; | 
|  | 2104 		    last; | 
|  | 2105 		} | 
|  | 2106 		else{ | 
|  | 2107 		    die "The CDS for $target_parent starts upstream ($feature_cds_max{$target_parent}) of the first exon", | 
|  | 2108 		    " (", $feature_ranges[$i]->[0], "), which is illogical.  Please revise the GFF file provided.\n"; | 
|  | 2109 		} | 
|  | 2110 	    } | 
|  | 2111 	    for(my $i = 0; $i <= $#feature_ranges; $i++){ | 
|  | 2112 		my $feature = $feature_ranges[$i]; | 
|  | 2113 		# 3' of last coding position | 
|  | 2114 		if($location > $feature_cds_max{$target_parent}){ | 
|  | 2115 		    # find the exon with the stop codon | 
|  | 2116 		    while($feature->[1] < $feature_cds_max{$target_parent}){ | 
|  | 2117 			$feature = $feature_ranges[++$i]; | 
|  | 2118 		    } | 
|  | 2119 		    my $post_offset = $feature->[0]-$feature_cds_max{$target_parent}; | 
|  | 2120 		    while($location > $feature->[1]+$flanking_bases and | 
|  | 2121 			  $i < $#feature_ranges){ | 
|  | 2122 			$post_offset += $feature->[1]-$feature->[0]+1; | 
|  | 2123 			$feature = $feature_ranges[++$i]; # find the 3' utr exon in which the variant is located | 
|  | 2124 		    } | 
|  | 2125 		    my $pos = $location-$feature->[0]+$post_offset; | 
|  | 2126 		    #print STDERR "Positive strand: got $pos for $location, exon #$i of $#feature_ranges, post_offset is $post_offset\n" if $location-$feature->[1] > $flanking_bases; | 
|  | 2127                     my $off; | 
|  | 2128 		    if($location > $feature->[1]){ # after a UTR containing exon? set as .*DD+DD | 
|  | 2129                         $off = $location-$feature->[1]; | 
|  | 2130 			$pos = ($post_offset+$feature->[1]-$feature->[0]+1)."+".$off; | 
|  | 2131 		    } | 
|  | 2132 		    elsif($location < $feature->[0]){ # before a total UTR exon? set as .*DD-DD | 
|  | 2133                         $off = -($feature->[0]-$location); | 
|  | 2134 			$pos = ($post_offset+1).$off; | 
|  | 2135 		    } | 
|  | 2136                     else{ | 
|  | 2137                         if($location-$feature->[0] < $feature->[1]-$location){ | 
|  | 2138                            $off = $location-$feature->[0]+1; # +1 since HGVS skips right from -1 to +1 at exon boundary | 
|  | 2139                         } | 
|  | 2140                         else{ | 
|  | 2141                            $off = $location-$feature->[1]-1; # will be negative | 
|  | 2142                         } | 
|  | 2143                     } | 
|  | 2144 		    if(length($ref) == 1 and length($variant) == 1){ | 
|  | 2145 			# 3' UTR SNP | 
|  | 2146 			record_snv("$target_parent\tc.*$pos", | 
|  | 2147 			"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2148 			join("\t", prop_info_key($chr,$location,$ref,$variant, $off)),"\tNA\n"); | 
|  | 2149 		    } | 
|  | 2150 		    elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){ | 
|  | 2151 			# 3' UTR insertion | 
|  | 2152 			record_snv("$target_parent\tc.*", | 
|  | 2153 			hgvs_plus($pos,1),"_*",hgvs_plus($pos,2), | 
|  | 2154 			"ins",substr($variant,1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2155 			join("\t", prop_info_key($chr,$location,$ref,$variant, $off)),"\tNA\n"); | 
|  | 2156 		    } | 
|  | 2157 		    elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){ | 
|  | 2158                         my $delBases = substr($ref, 1); | 
|  | 2159 			if(length($ref) == 2){ | 
|  | 2160 			    # 3' UTR single base deletion | 
|  | 2161 			    record_snv("$target_parent\tc.*",hgvs_plus($pos,1), | 
|  | 2162 			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2163 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\tNA\n"); | 
|  | 2164 			} | 
|  | 2165 			else{ | 
|  | 2166 			    # longer 3' UTR deletion | 
|  | 2167 			    record_snv("$target_parent\tc.*", | 
|  | 2168 			    hgvs_plus($pos,1),"_*",hgvs_plus($pos,length($ref)-1), | 
|  | 2169 			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2170 			    join("\t", prop_info_key($chr,$location,$ref,$variant, $off)),"\tNA\n"); | 
|  | 2171 			} | 
|  | 2172 		    } | 
|  | 2173 		    else{ | 
|  | 2174 			my $rc = reverse($ref); | 
|  | 2175 			$rc=~tr/ACGT/TGCA/; | 
|  | 2176 			if($rc eq $variant and length($variant) > 3){ | 
|  | 2177 			    # 3' UTR inversion | 
|  | 2178 			    record_snv("$target_parent\tc.*$pos", | 
|  | 2179 			    "_*",hgvs_plus($pos,length($ref)-1), | 
|  | 2180 			    "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2181 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\tNA\n"); | 
|  | 2182 			    last; | 
|  | 2183 			} | 
|  | 2184 			# complex substitution in 3' UTR | 
|  | 2185 			record_snv("$target_parent\tc.*$pos", | 
|  | 2186 			"_*",hgvs_plus($pos,length($ref)-1), | 
|  | 2187 			"delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2188 			join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\tNA\n"); | 
|  | 2189 		    } | 
|  | 2190 		    last; | 
|  | 2191 		} | 
|  | 2192 		# in the exon | 
|  | 2193 		elsif($location >= $feature->[0] and $location <= $feature->[1]){ | 
|  | 2194 		    my $pos = $feature_offset+$location-$feature->[0]+1; | 
|  | 2195 		    my $last_exon_base = $feature_offset+$feature->[1]-$feature->[0]+1; | 
|  | 2196                     my $exon_edge_dist = $location-$feature->[0]+1; # equiv of HGVS +X or -X intron syntax, but for exons | 
|  | 2197                     $exon_edge_dist = $location-$feature->[1]-1 if abs($location-$feature->[1]-1) < $exon_edge_dist; # pick closer of donor and acceptor sites | 
|  | 2198                     #print STDERR "Got ", $location-$feature->[0]+1, "vs. ", $location-$feature->[1]-1, ": chose $exon_edge_dist\n"; | 
|  | 2199 		    if($location < $feature_cds_min{$target_parent}){ #since there is no position 0, the pos is in UTR, subtract one | 
|  | 2200 			$pos = hgvs_plus($pos, -1); | 
|  | 2201 		    } | 
|  | 2202 		    if(length($ref) == 1 and length($variant) == 1){ | 
|  | 2203 			# SNP | 
|  | 2204 			record_snv("$target_parent\tc.$pos", | 
|  | 2205 			"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2206 			join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", | 
|  | 2207                         ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | 
|  | 2208                         #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | 
|  | 2209 		    } | 
|  | 2210 		    elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){ | 
|  | 2211 			# insertion | 
|  | 2212 			record_snv("$target_parent\tc.$pos", | 
|  | 2213 			"_",hgvs_plus_exon($pos,1,$last_exon_base),"ins", | 
|  | 2214 			substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2215 			join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", | 
|  | 2216                         ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | 
|  | 2217                         #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | 
|  | 2218 		    } | 
|  | 2219 		    elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){ | 
|  | 2220                         my $delBases = substr($ref, 1); | 
|  | 2221 			# single nucleotide deletion | 
|  | 2222 			if(length($delBases) == 1){ | 
|  | 2223 			    record_snv("$target_parent\tc.", | 
|  | 2224 			    hgvs_plus_exon($pos,1,$last_exon_base), | 
|  | 2225 			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2226 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", | 
|  | 2227                             ($pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | 
|  | 2228                             #($pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | 
|  | 2229 			} | 
|  | 2230 			# longer deletion | 
|  | 2231 			else{ | 
|  | 2232                             $exon_edge_dist = $feature->[1]-$location-length($ref)-1 if abs($feature->[1]-$location-length($ref)-1) < $exon_edge_dist; | 
|  | 2233 			    record_snv("$target_parent\tc.", | 
|  | 2234 			    hgvs_plus_exon($pos,1,$last_exon_base),"_", | 
|  | 2235 			    hgvs_plus_exon($pos,length($ref)-1,$last_exon_base), | 
|  | 2236 			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2237 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", | 
|  | 2238                             ($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | 
|  | 2239                             #($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | 
|  | 2240 			} | 
|  | 2241 		    } | 
|  | 2242 		    else{ | 
|  | 2243                         $exon_edge_dist = $feature->[1]-$location-length($ref)-1 if abs($feature->[1]-$location-length($ref)-1) < $exon_edge_dist; | 
|  | 2244 			my $rc = reverse($ref); | 
|  | 2245 			$rc=~tr/ACGT/TGCA/; | 
|  | 2246 			if($rc eq $variant and length($variant) > 3){ | 
|  | 2247 			    # inversion | 
|  | 2248 			    record_snv("$target_parent\tc.$pos", | 
|  | 2249 			    "_",hgvs_plus_exon($pos,length($ref)-1, $last_exon_base), | 
|  | 2250 			    "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2251 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", | 
|  | 2252                             ($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | 
|  | 2253                             #($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | 
|  | 2254 			    last; | 
|  | 2255 			} | 
|  | 2256 			# complex substitution | 
|  | 2257 			record_snv("$target_parent\tc.$pos", | 
|  | 2258 			"_",hgvs_plus_exon($pos, length($ref)-1, $last_exon_base), | 
|  | 2259 			"delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2260 			join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", | 
|  | 2261                         ($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | 
|  | 2262                         #($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | 
|  | 2263 		    } | 
|  | 2264 		    last; | 
|  | 2265 		} | 
|  | 2266 		# 5' of feature | 
|  | 2267 		elsif($location < $feature->[0]){ | 
|  | 2268 		    if(length($ref) == 1 and length($variant) == 1){ | 
|  | 2269 			# intronic SNP | 
|  | 2270 			if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){ | 
|  | 2271 			    # Closer to donor site | 
|  | 2272 			    record_snv("$target_parent\tc.",$feature_offset,"+", | 
|  | 2273 			    ($location-$feature_ranges[$i-1]->[1]), | 
|  | 2274 			    "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2275 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n"); | 
|  | 2276 			} | 
|  | 2277 			else{ | 
|  | 2278 			    # Closer to acceptor site | 
|  | 2279 			    record_snv("$target_parent\tc.",$feature_offset+1, | 
|  | 2280 			    ($location-$feature->[0]), | 
|  | 2281 			    "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2282 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n"); | 
|  | 2283 			} | 
|  | 2284 		    } | 
|  | 2285 		    elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){ | 
|  | 2286 			if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){ | 
|  | 2287 			    # intronic insertion near donor | 
|  | 2288 			    my $pos = $feature_offset."+".($location-$feature_ranges[$i-1]->[1]); | 
|  | 2289 			    record_snv("$target_parent\tc.", | 
|  | 2290 			    $pos,"_",hgvs_plus($pos,1), | 
|  | 2291 			    "ins", | 
|  | 2292 			    substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2293 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n"); | 
|  | 2294 			} | 
|  | 2295 			else{ | 
|  | 2296 			    # intronic insertion near acceptor | 
|  | 2297 			    my $pos = ($feature_offset+1).($location-$feature->[0]); | 
|  | 2298 			    record_snv("$target_parent\tc.", | 
|  | 2299 			    $pos,"_",hgvs_plus($pos,1), | 
|  | 2300 			    "ins", | 
|  | 2301 			    substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2302 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n"); | 
|  | 2303 			} | 
|  | 2304 		    } | 
|  | 2305 		    elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){ | 
|  | 2306 			# intronic deletion | 
|  | 2307 			# single nucleotide deletion | 
|  | 2308                         my $delBases = substr($ref, 1); | 
|  | 2309 			if(length($ref) == 2){ | 
|  | 2310 			    # single intronic deletion near donor | 
|  | 2311 			    if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){ | 
|  | 2312                                 my $off = $location-$feature_ranges[$i-1]->[1]+1; | 
|  | 2313 				record_snv("$target_parent\tc.", | 
|  | 2314 				$feature_offset,"+",$off, | 
|  | 2315 				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2316 				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off <= 2 ? "p.?" : "NA"),"\n"); | 
|  | 2317 			    } | 
|  | 2318 			    # single intronic deletion near acceptor | 
|  | 2319 			    else{ | 
|  | 2320 				my $pos = ($feature_offset+1); | 
|  | 2321                                 my $off = $location-$feature->[0]; | 
|  | 2322 				record_snv("$target_parent\tc.", | 
|  | 2323 				hgvs_plus_exon($pos, $off, $pos), | 
|  | 2324 				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2325 				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off >= -2 ? "p.?" : "NA"),"\n"); | 
|  | 2326 			    } | 
|  | 2327 			} | 
|  | 2328 			# longer deletion | 
|  | 2329 			else{ | 
|  | 2330 			    if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){ | 
|  | 2331 				# long intronic deletion near donor | 
|  | 2332                                 my $off = $location-$feature_ranges[$i-1]->[1]+1; | 
|  | 2333 				my $pos = $feature_offset."+".$off; | 
|  | 2334 				record_snv("$target_parent\tc.", | 
|  | 2335 				$pos,"_",hgvs_plus($pos,length($ref)-2), | 
|  | 2336 				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2337 				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off <= 2 ? "p.?" : "NA"),"\n"); | 
|  | 2338 			    } | 
|  | 2339 			    else{ | 
|  | 2340 				# long intronic deletion near acceptor | 
|  | 2341                                 my $off = $location-$feature->[0]+1; | 
|  | 2342 				my $pos = ($feature_offset+1).$off; | 
|  | 2343 				record_snv("$target_parent\tc.", | 
|  | 2344 				$pos,"_",hgvs_plus($pos,length($ref)-2), | 
|  | 2345 				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2346 				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off+length($ref)-2 >= -2 ? "p.?" : "NA"),"\n"); | 
|  | 2347 			    } | 
|  | 2348 			} | 
|  | 2349 		    } | 
|  | 2350 		    else{ | 
|  | 2351 			my $rc = reverse($ref); | 
|  | 2352 			$rc=~tr/ACGT/TGCA/; | 
|  | 2353 			if($rc eq $variant and length($variant) > 3){ | 
|  | 2354 			    # intronic inversion near donor site | 
|  | 2355 			    if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){ | 
|  | 2356 				my $pos = $feature_offset."+".($location-$feature_ranges[$i-1]->[1]); | 
|  | 2357 				record_snv("$target_parent\tc.", | 
|  | 2358 				$pos,"_",hgvs_plus($pos,length($ref)-1), | 
|  | 2359 				"inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2360 				join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n"); | 
|  | 2361 			    } | 
|  | 2362 			    else{ | 
|  | 2363 				my $pos = ($feature_offset+1).($location-$feature->[0]); | 
|  | 2364 				record_snv("$target_parent\tc.", | 
|  | 2365 				$pos,"_",hgvs_plus($pos, length($ref)-1), | 
|  | 2366 				"inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2367 				join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n"); | 
|  | 2368 			    } | 
|  | 2369 			    last; | 
|  | 2370 			} | 
|  | 2371 			# Intronic complex substitution | 
|  | 2372 			# Note: sub maybe have  comma in it to denote two possible variants | 
|  | 2373 			if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){ | 
|  | 2374 			    # complex intronic substitution near donor site | 
|  | 2375 			    my $pos = $feature_offset."+".($location-$feature_ranges[$i-1]->[1]); | 
|  | 2376 			    record_snv("$target_parent\tc.", | 
|  | 2377 			    $pos,"_",hgvs_plus($pos, length($ref)-1), | 
|  | 2378 			    "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2379 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n"); | 
|  | 2380 			} | 
|  | 2381 			else{ | 
|  | 2382 			    # complex intronic substitution near acceptor site | 
|  | 2383 			    my $pos = ($feature_offset+1).($location-$feature->[0]); | 
|  | 2384 			    record_snv("$target_parent\tc.", | 
|  | 2385 			    $pos,"_",hgvs_plus($pos, length($ref)-1), | 
|  | 2386 			    "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | 
|  | 2387 			    join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n"); | 
|  | 2388 			} | 
|  | 2389 		    } | 
|  | 2390 		    last; | 
|  | 2391 		} | 
|  | 2392 		else{ | 
|  | 2393 		    # feature is past this exon | 
|  | 2394 		    $feature_offset += $feature->[1]-$feature->[0]+1; | 
|  | 2395 		} | 
|  | 2396 	    } | 
|  | 2397 	} | 
|  | 2398     }  # for each variant on the line | 
|  | 2399   } # for each  gene overlapping the site the VCF describes | 
|  | 2400 } # for each VCF line | 
|  | 2401 print STDERR "\n" unless $quiet; | 
|  | 2402 close(VCFIN); | 
|  | 2403 | 
|  | 2404 # Before we can start printing the variants, we need to look at the phase information and calculate the real haplotype HGVS changes | 
|  | 2405 #if(keys %chr2phase){ | 
|  | 2406   # Note that we could have samtools read-backed haplotype info, MNPs in the VCF, and pre-existing haplotypes in the input VCF (e.g. imputed or based on Mendelian inheritance where trios exist) | 
|  | 2407   # We need to create new disjoint sets of phased blocks from the (consistent) union of these data. | 
|  | 2408 #  my $chr2phase2variants = combine_phase_data(\%chr2phase); | 
|  | 2409 | 
|  | 2410   # TODO: Calculate protein HGVS syntax for each variant, now that all phase data has been incorporated | 
|  | 2411   #for my $chr (keys %$chr2phase2variants){ | 
|  | 2412   #  for my $phase (keys %{$chr2phase2variants{$chr}){ | 
|  | 2413   #    # apply all phased changes to the reference chromosomal seq | 
|  | 2414   #    my $phased_seq = $seq{$chr}; #reference | 
|  | 2415   #    # sort the variants from 3' to 5' so that edits after indels don't need adjustment in their ref coordinate | 
|  | 2416   #    my @sorted_variants = sort {my($a_pos) = $a =~ /:(\d+):/; my($b_pos) = $b =~ /:(\d+):/; $b_pos <=> $a_pos} @{$chr2phase2variants{$chr}->{$phase}}; | 
|  | 2417   #    for my $variant (@sorted_variants){ | 
|  | 2418   #    } | 
|  | 2419   #  } | 
|  | 2420   #} | 
|  | 2421 #} | 
|  | 2422 | 
|  | 2423 # retrieve the MAF info en masse for each chromosome, as this is much more efficient | 
|  | 2424 my @out_lines; | 
|  | 2425 for my $snv (@snvs){ | 
|  | 2426   chomp $snv; | 
|  | 2427   my @fields = split /\t/, $snv; | 
|  | 2428   # For CNVs, all the fields are already filled out | 
|  | 2429   if(@fields > 13){ | 
|  | 2430     push @out_lines, join("\t", $feature_type{$fields[0]}, ($fields[0] =~ /\S/ ? $feature_length{$fields[0]} : "NA"), @fields); | 
|  | 2431     next; | 
|  | 2432   } | 
|  | 2433   my $variant_key = $fields[9]; | 
|  | 2434   $fields[9] = join("\t", prop_info($dbsnp,$internal_snp,$variant_key)); | 
|  | 2435   my $from = $fields[4]; | 
|  | 2436   my $chr_pos_key = $fields[3].":".$from; | 
|  | 2437   my $to = $fields[4]; # true for SNPs and insertions | 
|  | 2438   my @variant_key = split /:/, $variant_key; | 
|  | 2439   # For deletions and complex variants, calculate the affected reference genome range and set the 'to' | 
|  | 2440   if(length($variant_key[2]) > 1){ | 
|  | 2441     $to += length($variant_key[2])-1; | 
|  | 2442   } | 
|  | 2443   splice(@fields, 5, 0, $to); | 
|  | 2444 | 
|  | 2445   # Otherwise expand the key into the relevant MAF values | 
|  | 2446   $fields[0] =~ s/\/chr.*$//;  # some transcript ids are repeated... we expect "id/chr#" in the GTF file to distinguish these, but should get rid of them at reporting time | 
|  | 2447   # the offset from the nearest exon border if coding | 
|  | 2448   push @fields, ($#variant_key > 3 ? $variant_key[4] : ""); | 
|  | 2449   # add gene name(s) | 
|  | 2450   push @fields, range2genes($fields[3], $from, $to+1); | 
|  | 2451   # add caveats | 
|  | 2452   my $c = $fields[3]; | 
|  | 2453   if(not exists $chr2mappability{$c}){ | 
|  | 2454     if($c =~ s/^chr//){ | 
|  | 2455       # nothing more | 
|  | 2456     } | 
|  | 2457     else{ | 
|  | 2458       $c = "chr$c"; | 
|  | 2459     } | 
|  | 2460   } | 
|  | 2461   my $mappability_caveats = exists $chr2mappability{$c} ? $chr2mappability{$c}->fetch($fields[4], $fields[4]+1) : []; | 
|  | 2462   if(ref $mappability_caveats eq "ARRAY" and @$mappability_caveats){ | 
|  | 2463     my %h; | 
|  | 2464     if(exists $chr2caveats{$chr_pos_key}){ | 
|  | 2465       if($chr2caveats{$chr_pos_key} !~ /non-unique/){ | 
|  | 2466          $chr2caveats{$chr_pos_key} = join("; ", grep {not $h{$_}++} @$mappability_caveats)."; ".$chr2caveats{$chr_pos_key}; | 
|  | 2467       } | 
|  | 2468     } | 
|  | 2469     else{ | 
|  | 2470       $chr2caveats{$chr_pos_key} = join("; ", grep {not $h{$_}++} @$mappability_caveats); | 
|  | 2471     } | 
|  | 2472   } | 
|  | 2473   push @fields, (exists $chr2caveats{$chr_pos_key} ? $chr2caveats{$chr_pos_key} : ""); | 
|  | 2474   # add phase | 
|  | 2475   push @fields, find_phase($variant_key); | 
|  | 2476   push @out_lines, join("\t", $feature_type{$fields[0]}, $feature_length{$fields[0]}, @fields); | 
|  | 2477 } | 
|  | 2478 | 
|  | 2479 # Now tabulate the rare variant numbers | 
|  | 2480 my %gene2rares; | 
|  | 2481 my %gene2aa_rares; | 
|  | 2482 for my $line (@out_lines){ | 
|  | 2483   my @F = split /\t/, $line, -1; | 
|  | 2484   if($F[15] eq "NA" or $F[15] < $rare_variant_prop and (!$internal_snp or $F[17] < $rare_variant_prop)){ | 
|  | 2485     my $gene_list = $internal_snp ? $F[20] : $F[19]; | 
|  | 2486     next unless defined $gene_list; | 
|  | 2487     for my $g (split /; /, $gene_list){ | 
|  | 2488       $gene2rares{$g}++; | 
|  | 2489       # Check the cDNA HGVS syntax for relevance | 
|  | 2490       if($F[3] =~ /c.\d+/ or # coding | 
|  | 2491          $F[3] =~ /c.\d+.*-[12]/ or # splicing acceptor | 
|  | 2492          $F[3] =~ /c.\d+\+[12345]/){ # splicing donor | 
|  | 2493         $gene2aa_rares{$g}++; | 
|  | 2494       } | 
|  | 2495     } | 
|  | 2496   } | 
|  | 2497 } | 
|  | 2498 | 
|  | 2499 # Print the results | 
|  | 2500 for my $line (@out_lines){ | 
|  | 2501   my @F = split /\t/, $line, -1; | 
|  | 2502   my $gene_list = $internal_snp ? $F[20] : $F[19]; | 
|  | 2503   if(not defined $gene_list){ | 
|  | 2504     print OUT join("\t", @F, "", ""), "\n"; next; | 
|  | 2505   } | 
|  | 2506 | 
|  | 2507   my $max_gene_rare = 0; | 
|  | 2508   my $max_gene_aa_rare = 0; | 
|  | 2509   for my $g (split /; /, $gene_list){ | 
|  | 2510     next unless exists $gene2rares{$g}; | 
|  | 2511     if($gene2rares{$g} > $max_gene_rare){ | 
|  | 2512       $max_gene_rare = $gene2rares{$g}; | 
|  | 2513     } | 
|  | 2514     next unless exists $gene2aa_rares{$g}; | 
|  | 2515     if($gene2aa_rares{$g} > $max_gene_aa_rare){ | 
|  | 2516       $max_gene_aa_rare = $gene2aa_rares{$g}; | 
|  | 2517     } | 
|  | 2518   } | 
|  | 2519   print OUT join("\t", @F, $max_gene_rare, $max_gene_aa_rare), "\n"; | 
|  | 2520 } | 
|  | 2521 close(OUT); | 
|  | 2522 | 
|  | 2523 sub range2genes{ | 
|  | 2524   my ($c, $from, $to) = @_; | 
|  | 2525   if(not exists $gene_ids{$c}){ | 
|  | 2526     if($c =~ s/^chr//){ | 
|  | 2527       # nothing more | 
|  | 2528     } | 
|  | 2529     else{ | 
|  | 2530       $c = "chr$c"; | 
|  | 2531     } | 
|  | 2532   } | 
|  | 2533   if(exists $gene_ids{$c}){ | 
|  | 2534     my %have; | 
|  | 2535     return join("; ", grep {not $have{$_}++} @{$gene_ids{$c}->fetch($from, $to+1)}); | 
|  | 2536   } | 
|  | 2537   else{ | 
|  | 2538     return ""; | 
|  | 2539   } | 
|  | 2540 } | 
|  | 2541 sub combine_phase_data{ | 
|  | 2542   my ($phases) = @_; # map from variant to its phase data | 
|  | 2543 | 
|  | 2544   # Create a reverse mapping from phase regions to their variants | 
|  | 2545   my %chr2phase_region2variants; | 
|  | 2546   my @variants = keys %$phases; | 
|  | 2547   for my $variant (@variants){ | 
|  | 2548     my ($chr) = $variant =~ /^\S+?-(\S+):/; | 
|  | 2549     $chr2phase_region2variants{$chr} = {} if not exists $chr2phase_region2variants{$chr}; | 
|  | 2550     for my $phase_region (split /,/, $phases->{$variant}){ | 
|  | 2551       $chr2phase_region2variants{$chr}->{$phase_region} = [] if not exists $chr2phase_region2variants{$chr}->{$phase_region}; | 
|  | 2552       push @{$chr2phase_region2variants{$phase_region}}, $variant; | 
|  | 2553     } | 
|  | 2554   } | 
|  | 2555 | 
|  | 2556   # Now for each phased block known so far, see if any variant in it is also part of another block | 
|  | 2557   # If so, do a union since phasing is both transitive and commutative. | 
|  | 2558   # The quickest way to do this is to check for overlapping intervals, then check for common members amongst those that do overlap | 
|  | 2559   for my $chr (keys %chr2phase_region2variants){ | 
|  | 2560     my @ordered_phase_regions = sort {my($a_pos) = $a =~ /:(\d+)/; my($b_pos) = $b =~ /:(\d+)/; $a_pos <=> $b_pos} keys %{$chr2phase_region2variants{$chr}}; | 
|  | 2561     my $sets = new DisjointSets(scalar(@ordered_phase_regions)); | 
|  | 2562 | 
|  | 2563     for (my $i = 0; $i < $#ordered_phase_regions; $i++){ | 
|  | 2564       my ($start, $stop, $variant) = $ordered_phase_regions[$i]; | 
|  | 2565       for (my $j = $i+1; $j <= $#ordered_phase_regions; $j++){ | 
|  | 2566         my ($start2, $stop2, $variant2) = $ordered_phase_regions[$j]; | 
|  | 2567         if($start2 > $stop){ # won't overlap any regions after this in the sorted list | 
|  | 2568           last; | 
|  | 2569         } | 
|  | 2570         # If we get here, it is implicit that $stop >= $start2 and $start < $stop2, i.e. there is overlap | 
|  | 2571         # Now check if there is a shared variant (otherwise we might erroneously join blocks from different physical chromosomal arms) | 
|  | 2572         my $have_shared_variant = 0; | 
|  | 2573         my $overlapping_phase_region = $ordered_phase_regions[$j]; | 
|  | 2574         for my $variant (@{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$i]}}){ | 
|  | 2575           if($phases->{$variant} =~ /\b$overlapping_phase_region\b/){ | 
|  | 2576             $have_shared_variant = 1; last; | 
|  | 2577           } | 
|  | 2578         } | 
|  | 2579         # sanity check that there aren't conflicting variants in the new block (i.e. two different variants in the same position) | 
|  | 2580         my %pos2base; | 
|  | 2581         my $have_conflicting_variant = 0; | 
|  | 2582         for my $variant (@{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$i]}}, @{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$j]}}){ | 
|  | 2583           my ($pos, $base) = $variant =~ /(\d+):(.+?)$/; | 
|  | 2584           if(exists $pos2base{$pos} and $pos2base{$pos} ne $base){ | 
|  | 2585             # conflict, note with a caveat | 
|  | 2586             if(exists $chr2caveats{"$chr:$pos"}){ | 
|  | 2587               $chr2caveats{"$chr:$pos"} .= "; inconsistent haplotype phasing" unless $chr2caveats{"$chr:$pos"} =~ /inconsistent haplotype phasing/; | 
|  | 2588             } | 
|  | 2589             else{ | 
|  | 2590               $chr2caveats{"$chr:$pos"} = "inconsistent haplotype phasing"; | 
|  | 2591             } | 
|  | 2592             $have_conflicting_variant ||= 1; | 
|  | 2593           } | 
|  | 2594           elsif(not exists $pos2base{$pos}){ | 
|  | 2595             $pos2base{$pos} = $base; | 
|  | 2596           } | 
|  | 2597         } | 
|  | 2598 | 
|  | 2599         $sets->union($i+1, $j+1) if $have_shared_variant and not $have_conflicting_variant; # indexes are one-based for sets rather than 0-based | 
|  | 2600       } | 
|  | 2601     } | 
|  | 2602     my $phase_sets = $sets->sets; #disjoint haplotype sets | 
|  | 2603     my %region_counts; | 
|  | 2604     for my $phase_set (@$phase_sets){ | 
|  | 2605       next if scalar(@$phase_set) == 1; # No change to existing phase region (is disjoint from all others) | 
|  | 2606       # Generate a new haploblock to replace the old ones that are being merged | 
|  | 2607       my $merged_start = 10000000000; | 
|  | 2608       my $merged_end = 0; | 
|  | 2609       for my $ordered_phase_region_index (@$phase_set){ | 
|  | 2610         my ($start, $end) = $ordered_phase_regions[$ordered_phase_region_index-1] =~ /(\d+)-(\d+)$/; | 
|  | 2611         $merged_start = $start if $start < $merged_start; | 
|  | 2612         $merged_end = $end if $end > $merged_end; | 
|  | 2613       } | 
|  | 2614       # At the start of the region is a unique prefix so we can tell the arms apart if two haploblocks have the exact same boundary | 
|  | 2615       my $region_count = $region_counts{"$merged_start-$merged_end"}++; | 
|  | 2616       my $merged_haploblock_name = "Y$region_count-$chr:$merged_start-$merged_end"; | 
|  | 2617       # Assign this new name to overwrite the premerge values for each variant contained within | 
|  | 2618       for my $ordered_phase_region_index (@$phase_set){ | 
|  | 2619         for my $variant (@{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$ordered_phase_region_index-1]}}){ # incl. one-based set correction in 0-based array index | 
|  | 2620           print STDERR "Merging $variant from ", $phases->{$variant}, " into new block $merged_haploblock_name\n"; | 
|  | 2621           $phases->{$variant} = $merged_haploblock_name; | 
|  | 2622         } | 
|  | 2623       } | 
|  | 2624     } | 
|  | 2625     # TODO: if there are overlapping phase blocks still, but with different variants in the same position, we can infer that they are on the opposite strands... | 
|  | 2626   } | 
|  | 2627 } | 
|  | 2628 | 
|  | 2629 # Sees if the positions of the variant are in the range of a phased haplotype, returning which allele it belongs to | 
|  | 2630 sub find_phase{ | 
|  | 2631   my ($chr,$pos,$ref,$variant) = split /:/, $_[0]; | 
|  | 2632   return "" if length($ref) != length($variant); # Can only deal with SNPs (and broken down MNPs) for now | 
|  | 2633   for(my $i = 0; $i < length($ref); $i++){ | 
|  | 2634     my $key = "$chr:".($pos+$i).":".substr($variant, $i, 1); | 
|  | 2635     #print STDERR "Checking phase for $key\n" if $pos == 12907379; | 
|  | 2636     if(exists $chr2phase{$key}){ | 
|  | 2637       #print STDERR "returning phase data $chr2phase{$key}\n"; | 
|  | 2638       return $chr2phase{$key}; | 
|  | 2639     } | 
|  | 2640     elsif(exists $chr2phase{"chr".$key}){ | 
|  | 2641       #print STDERR "returning phase data ", $chr2phase{"chr".$key}, "\n"; | 
|  | 2642       return $chr2phase{"chr".$key}; | 
|  | 2643     } | 
|  | 2644   } | 
|  | 2645   return ""; | 
|  | 2646 } | 
|  | 2647 | 
|  | 2648 sub find_earliest_index{ | 
|  | 2649   # employs a binary search to find the smallest index that must be the starting point of a search of [start,end] elements sorted in an array by start | 
|  | 2650    my ($query, $array) = @_; | 
|  | 2651 | 
|  | 2652    return 0 if $query < $array->[0]->[0]; | 
|  | 2653 | 
|  | 2654    my ($left, $right, $prevCenter) = (0, $#$array, -1); | 
|  | 2655 | 
|  | 2656    while(1){ | 
|  | 2657       my $center = int (($left + $right)/2); | 
|  | 2658 | 
|  | 2659       my $cmp = $query <=> $array->[$center]->[0] || ($center == 0 || $query != $array->[$center-1]->[0] ? 0 : -1); | 
|  | 2660 | 
|  | 2661       return $center if $cmp == 0; | 
|  | 2662       if ($center == $prevCenter) { | 
|  | 2663          return $left; | 
|  | 2664       } | 
|  | 2665       $right = $center if $cmp < 0; | 
|  | 2666       $left  = $center if $cmp > 0; | 
|  | 2667       $prevCenter = $center; | 
|  | 2668    } | 
|  | 2669 } | 
|  | 2670 |