Mercurial > repos > yusuf > poor_gene_coverage

#!/usr/bin/env perl

BEGIN{
  my $prog_dir = `dirname $0`;
  chomp $prog_dir;
  push @INC, $prog_dir; # so DisjointSets.pm can be found no matter the working directory
}

use DisjointSets; # homebrew module
use Bio::DB::Sam; # for FastA reference pulls
use Bio::SeqUtils;
use Bio::Tools::CodonTable;
use Statistics::Zed;
use Getopt::Long;
use Set::IntervalTree;
use strict;
use warnings;
use vars qw($min_prop $zed $codonTable $default_transl_table %transl_except %internal_prop %dbsnp_info %chr2variant_locs %chr2dbsnp_vcf_lines %chr2internal_vcf_lines %chr2caveats %chr2phase @snvs $fasta_index $max_args $quiet);

if(@ARGV == 1 and $ARGV[0] eq "-v"){
  print "Version 1.0\n";
  exit;
}

#$max_args = `getconf ARG_MAX`; # largest number of args you can send to a system command (enviroment included, see limits.h)
#chomp $max_args;
$max_args = 4096; # if not defined $max_args or $max_args < 1; # the minimum since System V
$max_args -= 50;

# find out if a variant appears in the user provided data
sub internal_prop($$$$){
    my ($chr,$pos,$ref,$variant) = @_;

    my $key = "$chr:$pos:$ref:$variant";
    if(exists $internal_prop{$key}){
        return $internal_prop{$key};
    }

    #print STDERR "Checking if internal_prop for $key exists: ";
    if(exists $chr2internal_vcf_lines{$chr}->{$pos}){
      for(@{$chr2internal_vcf_lines{$chr}->{$pos}}){
        my @fields = split /\t/, $_;
        if($pos == $fields[1] and length($fields[3]) == length($ref) and $fields[4] eq $variant){
          #print STDERR "yes\n";
          if(/MAF=(\d\.\d+)/){
            $internal_prop{$key} = $1; # change from percent to proportion
            return $1;
          }
        }
      }
    }
    else{
      #print STDERR "no\n";
    }

    $internal_prop{$key} = "NA";
    return "NA";
}

# find out if a variant appears in the NCBI's dbSNP
sub dbsnp_info($$$$){
    my ($chr,$pos,$ref,$variant) = @_;

    my $key = "$chr:$pos:$ref:$variant";
    if(exists $dbsnp_info{$key}){
        return @{$dbsnp_info{$key}};
    }

    if(exists $chr2dbsnp_vcf_lines{$chr}->{$pos}){
      #print STDERR "Checking existing SNP data for $chr:$pos -> ", join("\n", @{$chr2dbsnp_vcf_lines{$chr}->{$pos}}), "\n";
      for(@{$chr2dbsnp_vcf_lines{$chr}->{$pos}}){
	my @fields = split /\t/, $_;
	for my $var (split /,/, $fields[4]){
            # Allows for different reference seqs between dbSNP and input, assuming patches only
	    if(length($fields[3]) == length($ref) and ($var eq $variant or $ref eq $var and $variant eq $fields[3])){
		my ($freq, $subpop) = ("","");
		$freq = $1 if $fields[7] =~ /(?:\A|;)MMAF=(0\.\d+)(?:;|\Z)/;
		$subpop = $1 if $fields[7] =~ /(?:\A|;)MMAF_SRC=(\S+?)(?:;|\Z)/;
                $dbsnp_info{$key} = [$subpop, $freq || "NA", $fields[2]];
		return @{$dbsnp_info{$key}};
	    }
	}
      }
    }
    $dbsnp_info{$key} = ["novel", "NA", "NA"];
    return @{$dbsnp_info{$key}};
}

sub record_snv{
  my $line = join("", @_);
  push @snvs, $line;

  my @fields = split /\t/, $line;
  my $prop_info_key = $fields[9];
  my ($chr,$pos,$ref,$variant) = split /:/, $prop_info_key;
  $chr2variant_locs{$chr} = {} unless exists $chr2variant_locs{$chr};
  return unless $ref; # ref not defined for CNVs
  # Need to grab whole range for MNPs
  for(my $i = 0; $i < length($ref); $i++){
    $chr2variant_locs{$chr}->{$pos+$i} = 1;
  }
}

sub retrieve_vcf_lines($$$){
  my ($dbsnp_file, $internal_snp_file, $chr) = @_;

  my (%dbsnp_lines, %internal_snp_lines);

  if(not defined $dbsnp_file or not exists $chr2variant_locs{$chr}){
    return ({}, {}, {}, {}); # no data requested for this chromosome
  }

  # build up the request
  my @tabix_regions;
  my @var_locs = keys %{$chr2variant_locs{$chr}};
  # sort by variant start location
  for my $var_loc (sort {$a <=> $b} @var_locs){
    push @tabix_regions, $chr.":".$var_loc."-".$var_loc;
  }
  for(my $i = 0; $i <= $#tabix_regions; $i += $max_args){ # chunkify tabix request if too many for the system to handle
    my $end = $i + $max_args > $#tabix_regions ? $#tabix_regions : $i + $max_args;
    my $regions = "'".join("' '", @tabix_regions[$i..$end])."'";
    # From file is very slow for some reason
    #my $regions_file =  "/tmp/vcf2hgvs_$$.bed";
    #open(REQ_BED, ">$regions_file")
    #  or die "Cannot open $regions_file for writing: $!\n";
    #print REQ_BED join("\n", @tabix_regions), "\n";
    #close(REQ_BED);

    # retrieve the data
    die "Cannot find dbSNP VCF file $dbsnp_file\n" if not -e $dbsnp_file;

    open(VCF, "tabix $dbsnp_file $regions |")
      or die "Cannot run tabix on $dbsnp_file (args ".substr($regions, 0, length($regions)>100? 100 : length($regions))."): $!\n";
    while(<VCF>){
      #if(/^(\S+\t(\d+)(?:\t\S+){6})/ and grep {$_ eq $2} @var_locs){ # take only main columns to save room, if possible
      if(/^(\S+\t(\d+)(?:\t\S+){6})/ and exists $chr2variant_locs{$chr}->{$2}){ # take only main columns to save room, if possible
        $dbsnp_lines{$2} = [] unless exists $dbsnp_lines{$2};
        push @{$dbsnp_lines{$2}}, $1;
      }
    }
    close(VCF);

    if($internal_snp_file){
      die "Cannot find internal VCF file $internal_snp_file\n" if not -e $internal_snp_file;
      open(VCF, "tabix $internal_snp_file $regions |")
        or die "Cannot run tabix on $internal_snp_file: $!\n";
      while(<VCF>){
        #if(/^(\S+\t(\d+)(?:\t\S+){6})/ and grep {$_ eq $2} @var_locs){ # take only main columns to save room, if possible
        if(/^(\S+\t(\d+)(?:\t\S+){5})/ and exists $chr2variant_locs{$chr}->{$2}){ # take only main columns to save room, if possible
          $internal_snp_lines{$2} = [] unless exists $internal_snp_lines{$2};
          push @{$internal_snp_lines{$2}}, $1;
        }
      }
      close(VCF);
    }
  }

  #unlink $regions_file;

  return (\%dbsnp_lines, \%internal_snp_lines);
}

sub prop_info_key{
    my($chr,$pos,$ref,$variant,$exon_edge_dist) = @_;

    $chr =~ s/^chr//;
    if($chr eq "M"){
      $chr = "MT"; # NCBI uses different name for mitochondrial chromosome
      $pos-- if $pos >= 3107;  # also, doesn't keep the old positioning (historical)
    }
    return join(":", $chr,$pos,$ref,$variant, ($exon_edge_dist ? $exon_edge_dist : ""));
}

sub prop_info($$$){
    my($snpfile,$internal_snps_file,$prop_info_key) = @_;

    my($chr,$pos,$ref,$variant) = split /:/, $prop_info_key;

    # is this the first call for this chromosome? If so, retrieve the VCF lines for it en masse
    if(not exists $chr2dbsnp_vcf_lines{$chr}){
      ($chr2dbsnp_vcf_lines{$chr}, $chr2internal_vcf_lines{$chr}) = retrieve_vcf_lines($snpfile,$internal_snps_file,$chr);
    }
    my $internal_maf = 0;
    if($internal_snps_file){
      $internal_maf = internal_prop($chr,$pos,$ref,$variant);
      $internal_maf = 0 if $internal_maf eq "NA";
    }

    my @results = dbsnp_info($chr,$pos,$ref,$variant);

    # Not all entries have a proportion in dbSNP
    return $internal_snps_file ? ($ref, $variant, @results, $internal_maf) : ($ref, $variant, @results);
}

#offset a given HGVS nomenclature position (single position only) by a given number of bases
sub hgvs_plus($$){
   my ($hgvs, $offset) = @_;
    if($hgvs =~ /^(\S+)(-\d+)(.*)/){
	# all negative
	if($2+$offset<0){
	    return $1.($2+$offset).$3;
	}
	# switches to positive, need to mod
	else{
	    return $1+($2+$offset);
	}
    }
    elsif($hgvs =~ /^(\S+)\+(\d+)(.*)/){
	# all positive
	if($2+$offset>0){
	    return $1."+".($2+$offset).$3;
	}
	# switches to negative, need to mod
	else{
	    return $1+($2+$offset);
	}
    }
    elsif($hgvs =~ /^(-?\d+)(.*)/){
	# special case if offset spans -/+ since there is no position 0
	if($1 < 0 and $1+$offset >= 0){
	    $offset++;
	}
	elsif($1 > 0 and $1+$offset <= 0){
	    $offset--;
	}
	return ($1+$offset).$2;
    }
    else{
	die "Cannot convert $hgvs to a new offset ($offset), only single base position nomenclature is currently supported\n";
    }
}

# offset a given position by a given number of bases,
# taking into account that if the new offset crosses the threshold in the last argument,
# HGVS boundary nomenclature has to be introduced
sub hgvs_plus_exon($$$){
    my ($pos, $offset, $boundary) = @_;

    # special case if offset spans -/+ since there is no position 0
    if($pos =~ /^(-?\d+)(.*)/){
      if($1 < 0 and $1+$offset >= 0){
         $offset++;
      }
      elsif($1 > 0 and $1+$offset <= 0){
         $offset--;
      }
    }
    my $new_pos = $pos + $offset;
    if($new_pos > $boundary and $pos <= $boundary){
	# just moved into an intron 3'
	$new_pos = $boundary."+".($new_pos-$boundary);
    }
    elsif($new_pos < $boundary and $pos >= $boundary){
	# just moved into an intron 5'
	$new_pos = $boundary.($new_pos-$boundary);
    }
    return $new_pos;
}

# given a nucleotide position, calculates the AA there (assumes coding region)
sub getCodonFromSeq($$$$){
  my ($chr_ref, $location, $frame_offset, $strand) = @_;

  my $codon;
  if($strand eq "+"){
    $codon = substr($$chr_ref, $location-1-$frame_offset, 3);
  }
  else{
    $codon = substr($$chr_ref, $location-3+$frame_offset, 3);
    $codon = reverse($codon);
    $codon =~ tr/ACGTacgt/TGCAtgca/;
  }
  return $codon;
}

sub getCodonFromSeqIndex($$$$){
  my ($chr, $location, $frame_offset, $strand) = @_;

  my $codon;
  if($strand eq "+"){
    $codon = $fasta_index->fetch($chr.":".($location-$frame_offset)."-".($location-$frame_offset+2));
  }
  else{
    $codon = $fasta_index->fetch($chr.":".($location-2+$frame_offset)."-".($location+$frame_offset));
    $codon = reverse($codon);
    $codon =~ tr/ACGTacgt/TGCAtgca/;
  }
  return $codon;
}

sub getAAFromSeq($$$$$){
  return $_[4]->translate(getCodonFromSeq($_[0], $_[1], $_[2], $_[3]));
}

sub getAAFromSeqIndex($$$$$){
  # convert codon to AA
  if(exists $transl_except{"$_[0]:$_[1]"}){
    return $transl_except{"$_[0]:$_[1]"};
  }
  else{
    return $_[4]->translate(getCodonFromSeqIndex($_[0], $_[1], $_[2], $_[3]));
  }
}

sub hgvs_protein{
  my ($chr, $location, $ref, $variant, $cdna_pos, $strand, $transl_table) = @_;

  if(substr($ref,0,1) eq substr($variant,0,1)){
    substr($ref,0,1) = "";
    substr($variant,0,1) = "";
    $location++;
    if($strand eq "-"){
      $cdna_pos--;
    }
    else{
      $cdna_pos++;
    }
  }

  if($cdna_pos !~ /^\d+/){
    die "Aborting: got illegal cDNA position ($cdna_pos) for protein HGVS conversion of position ",
        "$location, ref $ref, variant $variant. Please correct the program code.\n";
  }
  # Get the correct frame for the protein translation, to know what codons are affected
  my $aapos = int(($cdna_pos-1)/3)+1;

  # does it destroy the start codon?
  if($cdna_pos < 4){ # assumes animal codon usage
    return "p.0?"; # indicates start codon missing, unsure of effect
  }

  my $table = $transl_table ne $default_transl_table ?  # non standard translation table requested
    Bio::Tools::CodonTable->new(-id=>$transl_table) : $codonTable;

  my $frame_offset = ($cdna_pos-1)%3;
  my $origAA = getAAFromSeqIndex($chr, $location, $frame_offset, $strand, $table);
  # take 100000 bp on either side for translation context of variant seq
  my $five_prime_buffer = $location < 10000 ? $location-1 : 10000;
  my $mutSeq = $fasta_index->fetch($chr.":".($location-$five_prime_buffer)."-".($location+10000));

  # substitute all of the immediately adjacent variants in phase with this one to get the correct local effect
  substr($mutSeq, $five_prime_buffer, length($ref)) = $variant;

  # does it cause a frameshift?
  my $length_diff = length($variant)-length($ref);
  if($length_diff%3){ # insertion or deletion not a multiple of three
    my $fs_codon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1, $frame_offset, $strand);
    my $ext = 0;
    my $newAA;
    do{
       $ext++;
       # The "NA"s below make it so that we don't pick up any translation exceptions from the original reference annotation
       if($strand eq "+"){
         $newAA = getAAFromSeq(\$mutSeq, $five_prime_buffer+1+$ext*3, $frame_offset, $strand, $table);
       }
       else{
         $newAA = getAAFromSeq(\$mutSeq, $five_prime_buffer+1-$ext*3, $frame_offset, $strand, $table);
       }
     } while($newAA ne "*");

    return "p.".$origAA.$aapos.$table->translate($fs_codon)."fs*$ext";
  }

  # does it cause a stop codon to be lost?
  if($origAA eq "*"){
     my $stopChangeCodon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1, $frame_offset, $strand);
     # still a stop after the mutation (ignore translation exceptions)
     if($table->is_ter_codon($stopChangeCodon)){
        return "p.*$aapos=";
     }
     # calculate the new stop, assuming there aren't mutations downstream in candidate stop codons
     my $ext = 0;
     my $newCodon;
     do{
       if($strand eq "+"){
         $newCodon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1+(++$ext*3), $frame_offset, $strand);
       }
       else{
         $newCodon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1-(++$ext*3), $frame_offset, $strand);
       }
     } while(not $table->is_ter_codon($newCodon));

     return "p.*".$aapos.$table->translate($stopChangeCodon)."ext*".$ext;
  }

  # if we get this far, it's a "regular" AA level change
  my $origAAs = "";
  for(my $i = 0; $i < length($ref)+$frame_offset; $i+=3){
    my $oldAA = getAAFromSeqIndex($chr, $location+$i, $frame_offset, $strand, $table);
    if($strand eq "+"){
      $origAAs .= $oldAA;
    }
    else{
      $origAAs = $oldAA . $origAAs;
    }
  }
  my $newAAs = "";
  for(my $i = 0; $i < length($variant)+$frame_offset; $i+=3){
    # NA means we don't take translation exceptions from the original
    my $newAA = getAAFromSeq(\$mutSeq, $five_prime_buffer+1+$i, $frame_offset, $strand, $table);
    if($strand eq "+"){
      $newAAs .= $newAA;
    }
    else{
      $newAAs = $newAA . $newAAs;
    }
  }

  # silent
  if($origAAs eq $newAAs){
      return "p.".$origAAs.$aapos."=";
  }

  # minimize the difference if there are leading or trailing AAs the same
  my $delLength = length($ref);
  while(substr($newAAs, 0, 1) eq substr($origAAs, 0, 1)){
    $newAAs = substr($newAAs, 1);
    $origAAs = substr($origAAs, 1);
    $location+=3;
    $delLength-=3;
    $aapos++;
  }
  while(substr($newAAs, -1) eq substr($origAAs, -1)){
    $newAAs = substr($newAAs, 0, length($newAAs)-1);
    $origAAs = substr($origAAs, 0, length($origAAs)-1);
  }

  # insertion
  if(length($origAAs) == 0){
    my $insAAs = getAAFromSeqIndex($chr,$location-3,$frame_offset,$strand,$table).($aapos-1)."_".
                 getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table);
    return "p.".$insAAs.$aapos."ins".$newAAs;
  }
  # deletion
  elsif(length($newAAs) == 0){
    my $delAAs;
    if(length($origAAs) == 1){
       $delAAs = getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table).$aapos; # single AA deletion
    }
    else{ # deleting a stretch
       if($strand eq "+"){
         my $endPoint = $location+$delLength-1;
         $delAAs = getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table).$aapos."_".
                   getAAFromSeqIndex($chr,$endPoint,$frame_offset,$strand,$table).($aapos+int(($delLength-1)/3));
       }
       else{
         my $endPoint = $location-$delLength+1;
         $delAAs = getAAFromSeqIndex($chr,$endPoint,$frame_offset,$strand,$table).($aapos-int(($delLength-1)/3))."_".
                   getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table).$aapos;
       }
    }
    return "p.".$delAAs."del";
  }
  else{
    # substitution
    if(length($origAAs) == 1 and length($newAAs) == 1){
      return "p.".$origAAs.$aapos.$newAAs;
    }
    # indel
    elsif(length($origAAs) != 1){
       # convert ref stretch into range syntax
       if($strand eq "+"){
         $origAAs = substr($origAAs, 0, 1).$aapos."_".substr($origAAs, -1).($aapos+length($origAAs)-1);
       }
       else{
         $origAAs = substr($origAAs, 0, 1).($aapos-length($origAAs)+1)."_".substr($origAAs, -1).$aapos;
       }
    }
    return "p.".$origAAs."delins".$newAAs;
  }
  return ("NA", "");
}

sub z2p{
  if(not defined $zed){
    $zed = new Statistics::Zed;
  }
  my $p = $zed->z2p(value => $_[0]);
  return $p < 0.0000000001 ? 0 : $p;
}
sub gq2p{
    return $_[0] > 200 ? 0 : 10**($_[0]/-10);
}

my ($multi_phased, $min_depth, $flanking_bases, $dbsnp, $internal_snp, $genename_bed_file, $dir_1000G, $dir_esp6500, $min_pvalue, $mappability_file, $reference_file, $samtools_phasing_file, $exons_file, $input_file, $output_file, $cnv_file, $dgv_file, $which_chr, $enrichment_regions_file, $rare_variant_prop);
$multi_phased = 0;
$min_depth = 2;
$flanking_bases = 30;
$min_pvalue = 0.01;
$min_prop = 0.14;
$rare_variant_prop = 0.05;
$input_file = "-"; # STDIN by default
$output_file = "-"; # STDOUT by default
$default_transl_table = "1"; # assumes NCBI 'Standard' table, unless it is an argument to the script...
&GetOptions("d=i"       => \$min_depth,
            "f=i"       => \$flanking_bases,
            "s=s"       => \$dbsnp,
            "t=s"       => \$dir_1000G,
            "n=s"       => \$dir_esp6500,
            "u=s"       => \$internal_snp,
            "q"         => \$quiet,
            "p=f"       => \$min_pvalue,
            "h=f"       => \$min_prop,
            "m=s"       => \$mappability_file,
            "r=s"       => \$reference_file,
            "z=s"       => \$samtools_phasing_file,
            "e=s"       => \$exons_file,
            "i=s"       => \$input_file,
            "c=s"       => \$cnv_file,
            "g=s"       => \$dgv_file,
            "b=s"       => \$genename_bed_file,
            "w=s"       => \$which_chr,
            "o=s"       => \$output_file,
            "a=i"       => \$default_transl_table,
            "v=f"       => \$rare_variant_prop,
            "x=s"       => \$enrichment_regions_file); # if enrichment regions are specified, variants without a transcript model but in these ranges will be reported

if(($input_file ne "/dev/null" and not defined $reference_file) or
   not defined $exons_file or
   (defined $cnv_file and not defined $dgv_file)){
  die "Usage: $0 [-v(ersion)] [-q(uiet)] [-w(hich) contig_to_report (default is all)] [-d(epth of variant reads req'd) #] [-v(ariant max freq to count as rare)] [-f(lanking exon bases to report) #] [-p(robability of random genotype, maximum to report) 0.#]\n",
      "  [-h(et proportion of variant reads, minimum to report) 0.#] [-c(opy number) variants_file.bed -g(enomic structural) variants_control_db.txt.gz] [-z file_containing_samtools_phase_output.txt]\n",
      "  [-t(housand) genomes_integrated_vcfs_gz_dir] [-n ESP6500_dir] [-u(ser) specified_population.vcf.gz] [-m(appability) crg_file.bed]\n",
      "  [-x enrichment_regions_file.bed] [-a(mino) acid translation table number from NCBI]\n",
      "  [-i(nput) genotypes.vcf <-r(eference) sequence_file.fasta>] [-o(utput) hgvs_file.tsv] [-s(np) database_from_ncbi.vcf.gz]\n",
      "  <-b(ed) file of named gene regions.bed> <-e(xons) file.gtf>\n\n",
      "Input gz files must be indexed with Tabix.\nDefault input is STDIN, default output is STDOUT.  Note: if -c is specified, polyploidies are are assume to be proximal. Other defaults: -d 2, -v 0.05, -f 30, -p 0.01, -h 0.14 -a 1\nReference sequence is not strictly necessary if only CNV are being annotated.\n";
}

print STDERR "Considering $flanking_bases flanking bases for variants as well\n" unless $quiet;

$codonTable = new Bio::Tools::CodonTable(id => $default_transl_table);

my %enrichment_regions;
# Note, we assume the regions are non-overlapping
if(defined $enrichment_regions_file){
  print STDERR "Loading enrichment regions...\n" unless $quiet;
  open(BED, $enrichment_regions_file)
    or die "Cannot open $enrichment_regions_file for reading: $!\n";
  while(<BED>){
    chomp;
    my @F = split /\t/, $_;
    $enrichment_regions{$F[0]} = [] if not exists $enrichment_regions{$F[0]};
    push @{$enrichment_regions{$F[0]}}, [$F[1], $F[2]];
  }
  close(BED);
}
for my $chr (keys %enrichment_regions){ # sort by start
  $enrichment_regions{$chr} = [sort {$a->[0] <=> $b->[0]} @{$enrichment_regions{$chr}}];
}

if(defined $reference_file){
  print STDERR "Scanning reference FastA info\n" unless $quiet;
  if(not -e $reference_file){
    die "Reference FastA file ($reference_file) does not exist.\n";
  }
  if(not -e $reference_file.".fai" and not -w dirname($reference_file)){
    die "Reference FastA file ($reference_file) is not indexed, and the directory is not writable.\n";
  }
  $fasta_index = Bio::DB::Sam::Fai->load($reference_file);
}

my %chr2mappability;
if(defined $mappability_file){
  print STDERR "Reading in mappability data\n" unless $quiet;
  my ($nmer) = $mappability_file =~ /(\d+).*?$/;
  die "Cannot determine nmer from nmer file name $mappability_file, aborting\n" unless $nmer;
  open(MAP, $mappability_file)
    or die "Cannot open mappability data file $mappability_file for reading: $!\n";
  <MAP>; # header
  while(<MAP>){
    next if /^#/;
    chomp;
    my @F = split /\t/, $_;
    my $x = int(1/$F[3]+0.5);
    $chr2mappability{$F[0]} = Set::IntervalTree->new() if not exists $chr2mappability{$F[0]};
    $chr2mappability{$F[0]}->insert("non-unique mapping region (x$x)", $F[1], $F[2]+$nmer-1);
  }
  close(MAP);
}

# Is phasing data provided?
if(defined $samtools_phasing_file){
  print STDERR "Reading in phasing data\n" unless $quiet;
  open(PHASE, $samtools_phasing_file)
    or die "Cannot open phasing data file $samtools_phasing_file for reading: $!\n";
  my $phase_range;
  while(<PHASE>){
    if(/^PS/){
      chomp;
      my @F = split /\t/, $_;
      $phase_range = "$F[2]-$F[3]";
    }
    if(/^M[12]/){
      chomp;
      my @F = split /\t/, $_;
      #ignore strange cases where haplotype reference has no cases (weird samtools call)
      next if $F[9] == 0 or $F[7] == 0;
      my $chr = $F[1];
      next if defined $which_chr and not $chr eq $which_chr;
      my $pos = $F[3];
      #print STDERR "Recording phase for $chr:$pos:$F[4] , $chr:$pos:$F[5] as A-$chr:$phase_range and B-$chr:$phase_range\n" if $pos == 12907379;
      if(($F[10]+$F[8])/($F[9]+$F[7]) >= $min_prop){ # error meets reporting threshold
        $chr2caveats{"$chr:$pos"} .= "; " if exists $chr2caveats{"$chr:$pos"};
        $chr2caveats{"$chr:$pos"} .= "inconsistent haplotype phasing";
      }
      else{ # appears to be a genuine phasing
        $chr2phase{"$chr:$pos:$F[4]"} = "A-$chr:$phase_range"; # grouping for haplotype
        $chr2phase{"$chr:$pos:$F[5]"} = "B-$chr:$phase_range"; # grouping for haplotype
      }
    }
  }
  close(PHASE);
}

# Check the VCF file to see if contains phase data
open(VCFIN, $input_file)
    or die "Cannot open $input_file for reading: $!\n";
my $phase_chr = "";
my @phase_dataA;
my @phase_dataB;
while(<VCFIN>){
    if(/^\s*(?:#|$)/){ # blank or hash comment
       next;
    }
    my @F = split /\t/, $_;
    next if exists $chr2caveats{"$F[0]:$F[1]"} and $chr2caveats{"$F[0]:$F[1]"} =~ /inconsistent haplotype phasing/;
    # | indicates phased
    if($F[8] =~ m(^(\d+)\|(\d+):)){
      next if $1 eq $2;  # not useful to us (actually would mess up phase combining later on), but is provided sometimes
      # start of a phasing block
      if($phase_chr eq ""){
        $phase_chr = $F[0];
      }
      my @vars = split /,/, $F[4];
      if($1 > @vars){
        die "Invalid VCF file (line #$.): First haplotype listed as $1, but only ", scalar(@vars), " variants were provided (", join(",", @vars), "\n";
      }
      if($2 > @vars){
        die "Invalid VCF file (line #$.): Second haplotype listed as $1, but only ", scalar(@vars), " variants were provided (", join(",", @vars), "\n";
      }
      unshift @vars, $F[3];
      push @phase_dataA, [$F[1], $vars[$1]];
      push @phase_dataB, [$F[1], $vars[$2]];
    }
    # non phased het call, ends any phasing block there might be
    elsif($F[8] =~ m(^0/1)){
      # Did we just finish a phased block? If so, output it.
      if(@phase_dataA > 1){
        my $phase_def = "G-$phase_chr:".$phase_dataA[0]->[0]."-".$phase_dataA[$#phase_dataA]->[0];
        for my $d (@phase_dataA){
          my ($p, $v) = @$d;
          if(exists $chr2phase{"$phase_chr:$p:$v"}){
            $chr2phase{"$phase_chr:$p:$v"} .= ",$phase_def";
            $multi_phased ||= 1;
          }
          else{
            $chr2phase{"$phase_chr:$p:$v"} = $phase_def;
          }
        }
        $phase_def = "H-$phase_chr:".$phase_dataB[0]->[0]."-".$phase_dataB[$#phase_dataB]->[0];
        for my $d (@phase_dataB){
          my ($p, $v) = @$d;
          if(exists $chr2phase{"$phase_chr:$p:$v"}){
            $chr2phase{"$phase_chr:$p:$v"} = ",$phase_def";
            $multi_phased ||= 1;
          }
          else{
            $chr2phase{"$phase_chr:$p:$v"} = $phase_def;
          }
        }
      }
      if($phase_chr ne ""){
        $phase_chr = "";
        @phase_dataA = ();
        @phase_dataB = ();
      }
    }
}

print STDERR "Reading in feature GTF data..." unless $quiet;
my %feature_range; # chr => transcript_id => [[genomic_exon_start,genomic_exon_end,cdna_start_pos],...]
my %feature_intervaltree; # chr => transcript_id => [[genomic_exon_start,genomic_exon_end,cdna_start_pos],...]
my %feature_strand; # transcript_id => +|-
my $feature_count = 0;
my %feature_min;
my %feature_max;
my %feature_cds_min;
my %feature_cds_max;
my %feature_contig;
my %feature_length;
my %feature_type;
my %feature_transl_table; # note alternate translation table usage
my %chr_read;
open(GTF, $exons_file)
    or die "Cannot open $exons_file for reading: $!\n";
while(<GTF>){
    next if /^\s*#/;
    my @fields = split /\t/, $_;
    next if defined $which_chr and $fields[0] ne $which_chr and "chr$fields[0]" ne $which_chr and $fields[0] ne "chr$which_chr";

    if($fields[2] eq "exon" or $fields[2] eq "CDS"){
	next unless $fields[$#fields] =~ /transcript_id \"(.*?)\"/o;
	my $parent = $1;
	if(not $quiet and not exists $chr_read{$fields[0]}){
	    print STDERR " $fields[0]";
	    $chr_read{$fields[0]} = 1;
	}
	if(not exists $feature_strand{$parent}){
	    $feature_strand{$parent} = $fields[6];
	    $feature_contig{$parent} = $fields[0];
            if($fields[$#fields] =~ /transcript_type \"(.*?)\"/){
              $feature_type{$parent} = $1;
            }
            else{
              $feature_type{$parent} = "NA";
            }
	}
	if($fields[2] eq "CDS"){
	    #print STDERR "CDS value for $parent is $fields[2]..$fields[3]\n";
	    if(not exists $feature_cds_min{$parent} or $fields[3] < $feature_cds_min{$parent}){
		$feature_cds_min{$parent} = $fields[3];
	    }
	    if(not exists $feature_cds_max{$parent} or $fields[4] > $feature_cds_max{$parent}){
		$feature_cds_max{$parent} = $fields[4];
	    }
            if($fields[$#fields] =~ /transl_table \"(\d+)\"/){
                $feature_transl_table{$parent} = $1; #assume one translation table per CDS, which should be reasonable
            }
            while($fields[$#fields] =~ /transl_except \"pos:(\S+?),aa:(\S+?)\"/g){
                my $pos = $1;
                my $new_aa = $2; # needs to change from three letter code to 1
                if($new_aa =~ /^ter/i){ # can be funny so have special case (allows TERM, etc.)
                  $new_aa = "*";
                }
                elsif(length($new_aa) == 3){
                  $new_aa = Bio::SeqUtils->new()->seq3in($new_aa);
                }
                if($pos =~ /^(\d+)\.\.(\d+)/){
                  for my $p ($1..$2){
                    $transl_except{"$fields[0]:$p"} = $new_aa;
                  }
                }
                else{
                  $transl_except{"$fields[0]:$pos"} = $new_aa;
                }
            }
	    next;
	}
	if(not exists $feature_min{$parent} or $fields[3] < $feature_min{$parent}){
	    $feature_min{$parent} = $fields[3];
	}
	if(not exists $feature_max{$parent} or $fields[4] > $feature_max{$parent}){
	    $feature_max{$parent} = $fields[4];
	}

	$feature_count++;
	if(not exists $feature_range{$fields[0]}){
	    $feature_range{$fields[0]} = {}; # Chr => {parentID => [start,stop]}
            $feature_intervaltree{$fields[0]} = Set::IntervalTree->new();
	}
	if(not exists $feature_range{$fields[0]}->{$parent}){
	    $feature_range{$fields[0]}->{$parent} = [];
	}
        push @{$feature_range{$fields[0]}->{$parent}}, [$fields[3],$fields[4]];
        $feature_intervaltree{$fields[0]}->insert($parent, $fields[3], $fields[4]+1); # ranges need to have positive length for module to work properly
        $feature_length{$parent} += $fields[4]-$fields[3]+1;
    }
}
close(GTF);
print STDERR "\nFound $feature_count exons on ", scalar(keys %feature_range), " contigs in the GTF file\n" unless $quiet;

for my $contig (keys %feature_range){
    for my $parent (keys %{$feature_range{$contig}}){
	# sort by subrange start
	my @feature_ranges = sort {$a->[0] <=> $b->[0]} @{$feature_range{$contig}->{$parent}};
	$feature_range{$contig}->{$parent} = \@feature_ranges;
	$feature_range{"chr".$contig}->{$parent} = \@feature_ranges if not $contig =~ /^chr/;
	$feature_range{$1}->{$parent} = \@feature_ranges if $contig =~ /^chr(\S+)/;
    }
}

# Calculate the cDNA position of the leftmost (reference strand) base for each exon
for my $contig (keys %feature_range){
    for my $parent (keys %{$feature_range{$contig}}){
        my @feature_ranges = @{$feature_range{$contig}->{$parent}};
        if($feature_strand{$parent} eq "-"){
            # set up utr offset for correct CDS coordinates
            my $feature_offset = 0;
            for(my $i = $#feature_ranges; $i >= 0; $i--){
                last if not $feature_cds_max{$parent};
                # exon is completely 5' of the start
                if($feature_ranges[$i]->[0] > $feature_cds_max{$parent}){
                    $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1;
                }
                # exon with the cds start
                elsif($feature_ranges[$i]->[1] >= $feature_cds_max{$parent} and
                      $feature_ranges[$i]->[0] <= $feature_cds_max{$parent}){
                    $feature_offset += $feature_cds_max{$parent} - $feature_ranges[$i]->[1];
                    last;
                }
                else{
                    die "The CDS for $parent (on negative strand) ends downstream ",
                    "($feature_cds_max{$parent}) of the an exon",
                    " (", $feature_ranges[$i]->[0], "), which is illogical.  Please revise the GFF file provided.\n";
                }
            }
            for(my $i = $#feature_ranges; $i >= 0; $i--){
              $feature_offset += $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1;
              $feature_ranges[$i]->[2] = $feature_offset-1;
            }
       }
       else{ # positive strand
            # set up utr offset for correct CDS coordinates
            my $feature_offset = 0;
            for(my $i = 0; $i <= $#feature_ranges; $i++){
                last if not $feature_cds_min{$parent};
                # All 5' utr exon
                if($feature_ranges[$i]->[1] < $feature_cds_min{$parent}){
                    $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1;
                }
                # exon with the cds start
                elsif($feature_ranges[$i]->[1] >= $feature_cds_min{$parent} and
                      $feature_ranges[$i]->[0] <= $feature_cds_min{$parent}){
                    $feature_offset -= $feature_cds_min{$parent} - $feature_ranges[$i]->[0];
                    last;
                }
                else{
                    die "The CDS for $parent starts upstream ($feature_cds_min{$parent}) of the first exon",
                    " (", $feature_ranges[$i]->[0], "), which is illogical.  Please revise the GFF file provided.\n";
                }
            }
            # assign cDNA coords for each exon to the third array element
            for(my $i = 0; $i <= $#feature_ranges; $i++){
              $feature_ranges[$i]->[2] = $feature_offset;
              $feature_offset += $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1;
            }
       }
    }
}

print STDERR "Reading in gene name definitions...\n" unless $quiet;
die "Data file $genename_bed_file does not exist, aborting.\n" if not -e $genename_bed_file;
my %gene_ids;
open(TAB, $genename_bed_file)
  or die "Cannot open gene name BED file $genename_bed_file for reading: $!\n";
while(<TAB>){
  chomp;
  # format should be "chr start stop gene_name ..."
  my @fields = split /\t/, $_;
  next if $#fields < 3;
  my $c = $fields[0];
  if(not exists $gene_ids{$c}){
    $gene_ids{$c} = Set::IntervalTree->new();
  }
  $gene_ids{$c}->insert($fields[3], $fields[1], $fields[2]);
}

# Print output header
open(OUT, ">$output_file")
  or die "Cannot open $output_file for writing: $!\n";

print OUT join("\t", "Feature type", "Transcript length", "Selected transcript", "Transcript HGVS", "Strand", "Chr", "DNA From", "DNA To", "Zygosity", "P-value", "Variant Reads", "Total Reads",
	       "Ref base", "Obs base", "Pop. freq. source", "Pop. freq.", "Variant DB ID"), "\t",
               ($internal_snp ? "Internal pop. freq.\t" : ""),
               join("\t", "Protein HGVS", "Closest exon junction (AA coding variants)", "Gene Name", "Caveats", "Phase", "Num rare variants in gene (MAF <= $rare_variant_prop)", "Num rare coding and splice site variants in gene (MAF <= $rare_variant_prop)"),"\n";

# If there is CNV data, load it.
# BED columns should be chr start stop caveats ploidy . ignored ignored r,g,b
# The dot means the strand doesn't matter.
# where the first five fields are required, others optional
# where r,g,b is overloaded with father,mother ploidies and "b" is integer representing affected status logical AND (father bit mask 1, mother bit mask 2)
if(defined $cnv_file){
  print STDERR "Reading in CNV data...\n" unless $quiet;
  open(CNV, $cnv_file)
    or die "Cannot open $cnv_file for reading: $!\n";
  while(<CNV>){
    chomp;
    my @F = split /\t/,  $_, -1;
    if(@F < 5){
      print STDERR "Skipping unparseable line ($cnv_file #$.): $_\n";
      next;
    }
    my $ploidy = $F[4];
    my $cnv_chr = $F[0];
    next if defined $which_chr and $cnv_chr ne $which_chr and "chr$cnv_chr" ne $which_chr and $cnv_chr ne "chr$which_chr";
    my $cnv_start = $F[1];
    my $cnv_end = $F[2];
    my $p_value = "NA";
    if($F[3] =~ s/p-value=(\S+?)(?:;|$)//){
      $p_value = $1;
      next if $min_pvalue < $p_value;
    }

    # Report a variant line for each gene that is found in this CNV
    my $target_parents = $feature_intervaltree{$cnv_chr}->fetch($cnv_start, $cnv_end+1);

    my $caveats = "";
    if(@F == 9){
      my @parents_ploidy = split /,/, $F[8];
      if($parents_ploidy[2] == 0){ # neither parent affected
        if($ploidy < $parents_ploidy[0] and $ploidy < $parents_ploidy[1]){
          if($ploidy > 2){
            $caveats = "Polyploidy is less severe than in either unaffected parents";
          }
          # else: no caveats, this offspring has fewer copies than normally observed, or in unaffected parents
          elsif($ploidy < 2){
            if($parents_ploidy[0] == 2 and $parents_ploidy[1] == 2){
              $caveats = "De novo copy loss, unaffected parents are diploid";
            }
            else{
              $caveats = "Copy loss is greater than in either unaffected parent";
            }
          }
        }
        elsif($ploidy >= $parents_ploidy[0] and $ploidy <= $parents_ploidy[1] or
           $ploidy >= $parents_ploidy[1] and $ploidy <= $parents_ploidy[0]){
          $caveats = "Aneuploidy likely inherited from an unaffected parent";
        }
        elsif($ploidy > $parents_ploidy[0] and $ploidy > $parents_ploidy[1]){
          if($parents_ploidy[0] > 2){
            if($parents_ploidy[1] > 2){
              $caveats = "Lower polyploidy already exists in both unaffected parents";
            }
            else{
              $caveats = "Lower polyploidy already exists in unaffected father";
            }
          }
          else{
            if($parents_ploidy[1] > 2){
              $caveats = "Lower polyploidy already exists in unaffected mother";
            }
            # else no caveats, because both parents are "normal", yet we have polyploidy in the offspring
            else{
              $caveats = "De novo polyploidy, unaffected parents are diploid";
            }
          }
        }
        # else
        else{
          die "Oops! Error in program logic...how did we get here (unaffected parents)? $_";
        }
      }
      elsif($parents_ploidy[2] == 1){ # father affected
          if($ploidy == $parents_ploidy[1]){ # just like unaffected Mom
            if($ploidy > 2){
              if($ploidy == $parents_ploidy[0]){
                $caveats = "Same polyploidy present in both affected and unaffected parents";
              }
              else{
                $caveats = "Polyploidy inherited from unaffected mother";
              }
            }
            elsif($ploidy < 2){
              if($ploidy == $parents_ploidy[0]){
                $caveats = "Same copy loss in both affected and unaffected parents";
              }
              else{
                $caveats = "Copy loss is shared with unaffected mother";
              }
            }
            else{
              if($ploidy == $parents_ploidy[0]){
                # Why was this even reported? parents and child have diploid status...
                next;
              }
              $caveats = "Diploidy is shared with unaffected mother";
            }
          }
          elsif($ploidy > 2){ # polyploidy
            if($parents_ploidy[0] == 2){
              if($parents_ploidy[1] > 2){
                $caveats = "Unaffected mother has polyploidy (".$parents_ploidy[1]."x), but affected father is diploid";
              }
              elsif($parents_ploidy[1] == 2){
                $caveats = "Both unaffected mother and affected father are diploid";
              }
              else{
                $caveats = "Affected father is diploid, unaffected mother has copy loss (".$parents_ploidy[1]."x)";
              }
            }
            elsif($parents_ploidy[0] < 2){
              $caveats = "Polyploidy found, but affected father had copy loss (".$parents_ploidy[0]."x)";
            }
            elsif($ploidy < $parents_ploidy[1]){
              $caveats = "Polyploidy is less severe than in unaffected mother (".$parents_ploidy[1]."x), or affected father (".$parents_ploidy[0]."x)";
            }
            # past here the ploidy is great than in the unaffected mother
            elsif($parents_ploidy[1] < 2){
              $caveats = "Polyploidy is also severe in affected father (".$parents_ploidy[0]."x), but unaffected mother actually had copy loss (". $parents_ploidy[1]. "x)";
            }
            elsif($parents_ploidy[1] == 2){
              $caveats = "Polyploidy is also severe in affected father (".$parents_ploidy[0]."x), and mother is diploid";
            }
            elsif($ploidy < $parents_ploidy[0]){
              $caveats = "Polyploidy is less severe than in affected father (".$parents_ploidy[0]."x), but more severe than unaffected mother (". $parents_ploidy[1]. "x)";
            }
            elsif($ploidy > $parents_ploidy[0]){
              $caveats = "Polyploidy is more severe than in affected father (".$parents_ploidy[0]."x)";
            }
            else{
              $caveats = "Polyploidy is as severe as in affected father";
            }
          }
          elsif($ploidy == 2){
            # Don't report diploid status, any funny recombination should show up in large indel analysis
            next;
          }
          else{ # copies < 2
            if($ploidy == $parents_ploidy[0]){
              if($ploidy > $parents_ploidy[1]){
                $caveats = "Copy loss is the same as affected father, but less than unaffected mother (". $parents_ploidy[1]. "x)";
              }
              else{
                $caveats = "Copy loss is as severe as in affected father";
              }
            }
            elsif($ploidy > $parents_ploidy[0]){
              if($ploidy > $parents_ploidy[1]){
                if($parents_ploidy[1] == 0 and $parents_ploidy[0] == 0){
                  $caveats = "Poor mapping, or Mendelian inheritence violation is severe: no copies of region in either parent, but present in offspring";
                }
                elsif($ploidy == 2){
                  next; # child got best of both parents, ignore from CNV standpoint (may still have SNPs of course, or translocation, etc.)
                }
                else{
                  $caveats = "Copy loss is less severe than in unaffected mother (".$parents_ploidy[1]."x), or affected father (".$parents_ploidy[0]."x)";
                }
              }
              # else: child has less copies than unaffected mom, but more than affected Dad
              else{
                if($parents_ploidy[1] > 2){
                  $caveats = "Copy loss was more severe in affected father (".$parents_ploidy[0]."x), but unaffected mother had polyploidy (".$parents_ploidy[1]."x)";
                }
                elsif($parents_ploidy[1] == 2){
                  $caveats = "Copy loss was more severe in affected father (".$parents_ploidy[0]."x), but unaffected mother was diploid";
                }
                else{ # unaffected has loss
                  $caveats = "Copy loss is more severe than unaffect mother (".$parents_ploidy[1]."x), but less severe than affected father (".$parents_ploidy[0]."x)";
                }
              }
            }
            # past here, ploidy is less than affected father
            elsif($parents_ploidy[1] > 2){
              $caveats = "Copy loss is more severe than affected father (".$parents_ploidy[0]."x), and unaffected mother had polyploidy (".$parents_ploidy[1]."x)";
            }
            elsif($parents_ploidy[1] == 2){
              $caveats = "Copy loss is more severe than in affected father (".$parents_ploidy[0]."x)";
            }
            else{
              $caveats = "Copy loss is more severe than in both unaffect mother (".$parents_ploidy[1]."x), and affected father (".$parents_ploidy[0]."x)";
            }
          }
      }
      elsif($parents_ploidy[2] == 2){ # mother affected
          if($ploidy == $parents_ploidy[0]){ # just like unaffected Dad
            if($ploidy > 2){
              if($ploidy == $parents_ploidy[1]){
                $caveats = "Same polyploidy present in both affected and unaffected parents";
              }
              else{
                $caveats = "Polyploidy inherited from unaffected father";
              }
            }
            elsif($ploidy < 2){
              if($ploidy == $parents_ploidy[1]){
                $caveats = "Same copy loss in both affected and unaffected parents";
              }
              else{
                $caveats = "Copy loss is shared with unaffected father";
              }
            }
            else{
              if($ploidy == $parents_ploidy[1]){
                # Why was this even reported? parents and child have diploid status...
                next;
              }
              $caveats = "Diploidy is shared with unaffected father";
            }
          }
          elsif($ploidy > 2){ # polyploidy
            if($parents_ploidy[1] == 2){
              if($parents_ploidy[0] > 2){
                $caveats = "Unaffected father has polyploidy (".$parents_ploidy[0]."x), but affected mother is diploid";
              }
              elsif($parents_ploidy[0] == 2){
                $caveats = "Both unaffected father and affected mother are diploid";
              }
              else{
                $caveats = "Affected mother is diploid, unaffected father has copy loss (".$parents_ploidy[1]."x)";
              }
            }
            elsif($parents_ploidy[1] < 2){
              $caveats = "Polyploidy found, but affected mother had copy loss (".$parents_ploidy[1]."x)";
            }
            elsif($ploidy < $parents_ploidy[0]){
              $caveats = "Polyploidy is less severe than in unaffected father (".$parents_ploidy[0]."x), or affected mother (".$parents_ploidy[1]."x)";
            }
            # past here the ploidy is great than in the unaffected father
            elsif($parents_ploidy[0] < 2){
              $caveats = "Polyploidy is also severe in affected mother (".$parents_ploidy[1]."x), but unaffected father actually had copy loss (". $parents_ploidy[0]. "x)";
            }
            elsif($parents_ploidy[0] == 2){
              $caveats = "Polyploidy is also severe in affected mother (".$parents_ploidy[1]."x), and unaffected father is diploid";
            }
            elsif($ploidy < $parents_ploidy[1]){
              $caveats = "Polyploidy is less severe than in affected mother (".$parents_ploidy[1]."x), but more severe than unaffected father (". $parents_ploidy[0]. "x)";
            }
            elsif($ploidy > $parents_ploidy[1]){
              $caveats = "Polyploidy is more severe than in affected mother (".$parents_ploidy[1]."x)";
            }
            else{
              $caveats = "Polyploidy is as severe as in affected mother";
            }
          }
          elsif($ploidy == 2){
            # Don't report diploid status, any funny recombination should show up in large indel analysis
            next;
          }
          else{ # copies < 2
            if($ploidy == $parents_ploidy[1]){
              if($ploidy > $parents_ploidy[0]){
                $caveats = "Copy loss is the same as affected mother, but less than unaffected father (". $parents_ploidy[0]. "x)";
              }
              else{
                $caveats = "Copy loss is as severe as in affected mother";
              }
            }
            elsif($ploidy > $parents_ploidy[1]){
              if($ploidy > $parents_ploidy[0]){
                if($parents_ploidy[1] == 0 and $parents_ploidy[0] == 0){
                  $caveats = "Poor mapping, or Mendelian inheritence violation is severe: no copies of region in either parent, but present in offspring";
                }
                elsif($ploidy == 2){
                  next; # child got best of both parents, ignore from CNV standpoint (may still have SNPs of course, or translocation, etc.)
                }
                else{
                  $caveats = "Copy loss is less severe than in unaffected father (".$parents_ploidy[0]."x), or affected mother (".$parents_ploidy[1]."x)";
                }
              }
              # else: child has less copies than unaffected Dad, but more than affected Mom
              else{
                if($parents_ploidy[0] > 2){
                  $caveats = "Copy loss was more severe in affected mother (".$parents_ploidy[1]."x), but unaffected father had polyploidy (".$parents_ploidy[0]."x)";
                }
                elsif($parents_ploidy[0] == 2){
                  $caveats = "Copy loss was more severe in affected mother (".$parents_ploidy[1]."x), but unaffected father was diploid";
                }
                else{ # unaffected has loss
                  $caveats = "Copy loss is more severe than unaffect father (".$parents_ploidy[0]."x), but less severe than affected mother (".$parents_ploidy[1]."x)";
                }
              }
            }
            # past here, ploidy is less than affected mother
            elsif($parents_ploidy[0] > 2){
              $caveats = "Copy loss is more severe than affected mother (".$parents_ploidy[1]."x), and unaffected father had polyploidy (".$parents_ploidy[0]."x)";
            }
            elsif($parents_ploidy[0] == 2){
              $caveats = "Copy loss is more severe than in affected mother (".$parents_ploidy[1]."x)";
            }
            else{
              $caveats = "Copy loss is more severe than in both unaffect father (".$parents_ploidy[0]."x), and affected mother (".$parents_ploidy[1]."x)";
            }
          }
        }

    }
    if($F[3] and $F[3] ne "-"){ # prexisting caveat from CNV caller
      if(defined $caveats){
        $caveats .= "; $F[3]" unless $caveats =~ /\b$F[3]\b/;
      }
      else{
       $caveats  = $F[3];
      }
    }

    # Sort by start for consistency
    my @target_parents = sort {$feature_range{$cnv_chr}->{$a}->[0]->[0] <=> $feature_range{$cnv_chr}->{$b}->[0]->[0]} @$target_parents;

    for my $target_parent (@target_parents){
      my $target_caveats = $caveats;
      my $strand = $feature_strand{$target_parent};
      # report the gain/loss of each gene separately, for simplicity in downstream analysis
      my $cnv_exon_start = 10000000000; # genomic coords
      my $cnv_exon_end = 0;
      my $cnv_cdna_start = 0; # cDNA coords
      my $cnv_cdna_end = 0;
      my $off5 = 0; # border outside the exon?
      my $off3 = 0;
      my @feature_ranges = @{$feature_range{$cnv_chr}->{$target_parent}};
      # find the first and last exons in the gene that are inside the CNV
      for my $subregion (@feature_ranges){
        # exon overlaps CNV?
        if($subregion->[0] <= $cnv_end and $subregion->[1] >= $cnv_start){
          if($cnv_exon_start > $subregion->[0]){
            if($cnv_start < $subregion->[0]){
              $cnv_exon_start = $subregion->[0]; $off5 = 1;
              $cnv_cdna_start = $subregion->[2];
            }
            else{
              $cnv_exon_start = $cnv_start; $off5 = 0;
              $cnv_cdna_start = $subregion->[2]+($strand eq "-" ? $subregion->[0]-$cnv_start: $cnv_start-$subregion->[0]);
            }
          }
          if($cnv_exon_end < $subregion->[1]){
            if($cnv_end > $subregion->[1]){
              $cnv_exon_end = $subregion->[1]; $off3 = 1;
              $cnv_cdna_end = $subregion->[2]+($strand eq "-" ? $subregion->[0]-$subregion->[1] : $subregion->[1]-$subregion->[0]);
            }
            else{
              $cnv_exon_end = $cnv_end; $off3 = 0;
              $cnv_cdna_end = $subregion->[2]+($strand eq "-" ? $subregion->[0]-$cnv_end : $cnv_end-$subregion->[0]);
            }
          }
        }
      }

    my $ends_internally = 0;
    if($cnv_exon_end == 0){ # ends inside the exon
      $cnv_exon_end = $cnv_end;
      $ends_internally = 1;
    }
    # See if it's in the structural variant database
    my @gain_coverage; $#gain_coverage = $cnv_exon_end-$cnv_exon_start; # preallocate blanks
    my @loss_coverage; $#loss_coverage = $cnv_exon_end-$cnv_exon_start; # preallocate blanks
    my $dgv_loss_id; # report the DGV entry that covers most of the observed structural variant
    my $dgv_loss_length = 0; # report the DGV entry that covers most of the observed structural variant
    my $dgv_gain_id; # report the DGV entry that covers most of the observed structural variant
    my $dgv_gain_length = 0; # report the DGV entry that covers most of the observed structural variant
    my $gains;
    my $losses;
    my $dgv_chr = $cnv_chr;
    $dgv_chr =~ s/^chr//; # no prefix in DGV
    #open(DGV, "tabix $dgv_file $dgv_chr:$cnv_exon_start-$cnv_exon_end |") # check out CNV in this gene model region
    #  or die "Cannot run tabix: $!\n";
    open(DGV, "/dev/null");
    while(<DGV>){
      my @C = split /\t/, $_;
      next if $C[4] ne "CNV"; # todo: handle indels?
      my $dgv_start = $C[2];
      my $dgv_end = $C[3];
      my $dgv_direction = $C[5];
      my $gain_cov_count = 0;
      my $loss_cov_count = 0;
      if($dgv_direction eq "Gain"){
        for(my $i = ($dgv_start < $cnv_exon_start ? $cnv_exon_start : $dgv_start); $i <= $dgv_end and $i <= $cnv_exon_end; $i++){
          $gain_coverage[$i-$cnv_exon_start] = 1 unless defined $gain_coverage[$i-$cnv_exon_start];
          $gain_cov_count++;
        }
      }
      elsif($dgv_direction eq "Loss"){
        for(my $i = ($dgv_start < $cnv_exon_start ? $cnv_exon_start : $dgv_start); $i <= $dgv_end and $i <= $cnv_exon_end; $i++){
          $loss_coverage[$i-$cnv_exon_start] = 1 unless defined $loss_coverage[$i-$cnv_exon_start];
          $loss_cov_count++;
        }
      }
      if($dgv_direction eq "Gain" and $gain_cov_count > $dgv_gain_length){
        $dgv_gain_id = $C[0];
        $dgv_gain_length = $gain_cov_count;
      }
      if($dgv_direction eq "Loss" and $loss_cov_count > $dgv_loss_length){
        $dgv_loss_id = $C[0];
        $dgv_loss_length = $loss_cov_count;
      }
    }
    close(DGV);

    my $gain_coverage = 0;
    for my $count (@gain_coverage){
      $gain_coverage++ if defined $count;
    }
    $gain_coverage = sprintf "%.3f", $gain_coverage/($cnv_exon_end-$cnv_exon_start+1); # make it a proportion
    my $loss_coverage = 0;
    for my $count (@loss_coverage){
      $loss_coverage++ if defined $count;
    }
    $loss_coverage = sprintf "%.3f", $loss_coverage/($cnv_exon_end-$cnv_exon_start+1); # make it a proportion

    my $src = "DGV";
    my $dgv_id = "NA";
    my $dgv_caveat;
    my $dgv_coverage;
    if($ploidy > 2){
      if(not defined $dgv_gain_id){
        if(defined $dgv_loss_id){
          $dgv_id = sprintf "%s/%.3f", $dgv_loss_id, $dgv_loss_length/($cnv_exon_end-$cnv_exon_start+1);
          $dgv_caveat = "; No gains are known in healthy controls, the DGV2 ID reported is for a loss in the same area";
          $dgv_coverage = $loss_coverage;
        }
        else{
          $dgv_id = "novel";
          $dgv_coverage = "NA";
          $src = "NA";
        }
      }
      else{
        $dgv_id = sprintf "%s/%.3f", $dgv_gain_id, $dgv_gain_length/($cnv_exon_end-$cnv_exon_start+1);
        $dgv_coverage = $gain_coverage;
      }
    }
    elsif($ploidy < 2){
      if(not defined $dgv_loss_id){
        if(defined $dgv_gain_id){
          $dgv_id = sprintf "%s/%.3f", $dgv_gain_id, $dgv_gain_length/($cnv_exon_end-$cnv_exon_start+1);
          $dgv_caveat = "; No losses are known in healthy controls, the DGV2 ID reported is for a gain in the same area";
          $dgv_coverage = $gain_coverage;
        }
        else{
          $dgv_id = "novel";
          $dgv_coverage = "NA";
          $src = "NA";
        }
      }
      else{
        $dgv_id = sprintf "%s/%.3f", $dgv_loss_id, $dgv_loss_length/($cnv_exon_end-$cnv_exon_start+1);
        $dgv_coverage = $loss_coverage;
      }
    }

    my $non_coding = 0;
    if(not exists $feature_cds_max{$target_parent} or not defined $feature_cds_max{$target_parent} or $feature_cds_max{$target_parent} eq ""){
        $non_coding = 1;
    }
    $target_caveats .= $dgv_caveat if defined $dgv_caveat and $dgv_id ne "novel" and $target_caveats !~ /\Q$dgv_caveat\E/;
    #print "Recorded $cnv_chr:$cnv_start caveat $caveats\n";
      # if it doesn't overlap an exon, we need to find out which two exons it's between
    if($ends_internally){
        my $intron_found = 0;
        for(my $i = 0; $i < $#feature_ranges; $i++){
          if($feature_ranges[$i]->[1] < $cnv_start and $feature_ranges[$i+1]->[0] > $cnv_end){
            if($ploidy > 2){ # gain
              if($strand eq "-"){
                record_snv("$target_parent\t",
                   ($non_coding ? "g.$cnv_start\_$cnv_end" :
                    "c.".($feature_ranges[$i+1]->[2])."+".($feature_ranges[$i+1]->[0]-$cnv_end)."_".($feature_ranges[$i+1]->[2]+1)."-".($cnv_start-$feature_ranges[$i]->[1])),
                # pos	Zygosity	P-value	Variant Reads	Total Reads	Ref Bases	Var Bases	Population Frequency Source	Pop. freq. or DGV2 gain/loss coverage	dbSNP or DGV2 ID
                   "[".($ploidy-1)."]\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\tNA\t$p_value\tNA\tNA\t",
                   "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n");
              }
              else{
                record_snv("$target_parent\t",
                   ($non_coding ? "g.$cnv_start\_$cnv_end" :
                   "c.".($feature_ranges[$i+1]->[2]-1)."+".($cnv_start-$feature_ranges[$i]->[1])."_".$feature_ranges[$i+1]->[2]."-".($feature_ranges[$i+1]->[0]-$cnv_end)),
                   "[".($ploidy-1)."]\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\tNA\t$p_value\tNA\tNA\t",
                   "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n");
              }
            }
            else{ # loss
              if($strand eq "-"){
                record_snv("$target_parent\t",
                   ($non_coding ? "g.$cnv_start\_$cnv_end" :
                   "c.".($feature_ranges[$i+1]->[2])."+".($feature_ranges[$i+1]->[0]-$cnv_end)."_".($feature_ranges[$i+1]->[2]+1)."-".($cnv_start-$feature_ranges[$i]->[1])),
                   "del\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\t", ($ploidy == 1 ? "heterozygote" : "homozygote"), "\t$p_value\tNA\tNA\t",
                   "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n");
              }
              else{
                record_snv("$target_parent\t",
                   ($non_coding ? "g.$cnv_start\_$cnv_end" :
                   "c.".($feature_ranges[$i+1]->[2]-1)."+".($cnv_start-$feature_ranges[$i]->[1])."_".$feature_ranges[$i+1]->[2]."-".($feature_ranges[$i+1]->[0]-$cnv_end)),
                   "del\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\t", ($ploidy == 1 ? "heterozygote" : "homozygote"), "\t$p_value\tNA\tNA\t",
                   "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n");
              }
            }
            $intron_found = 1; last;
          }
        }
        warn "Logic error: CNV overlaps a gene ($target_parent), but is neither intronic nor exonic. Offending CNV was $_\n" unless $intron_found;
        next;
      }
      if($strand eq "-"){
        my $tmp = $cnv_cdna_start;
        $cnv_cdna_start = $cnv_cdna_end;
        $cnv_cdna_end = $tmp;
      }
      # Make the location label pretty descriptive
      my $cnv_phase = "";
      if($cnv_exon_start > $cnv_start or $cnv_exon_end < $cnv_end){
        $cnv_phase = "CNV-$cnv_chr:$cnv_start-$cnv_end"; # Use phase to note that it's part of a bigger CNV than just the range of this feature
      }
      # if we get here, we're in a gained/deleted exon category
      # CNVs are fuzzy-edged (as opposed to large indels), so produce HGVS syntax that reflect this
      if($ploidy > 2){ # gain
        # find the exons encompassed by the CNV. NOTE that we assume that polyploidies are proximal
        record_snv("$target_parent\t",
                   ($non_coding ? "g.".($cnv_exon_start > $cnv_start ? "$cnv_exon_start-?" : $cnv_start)."_".($cnv_exon_end < $cnv_end ? "$cnv_exon_end+?" : $cnv_end) :
                   "c.$cnv_cdna_start".($off5?"-?":"")."_$cnv_cdna_end".($off3?"+?":"")),
                   "[".($ploidy-1)."]\t$strand\t$cnv_chr\t$cnv_exon_start\t$cnv_exon_end\tNA\t$p_value\tNA\tNA\t",
                   "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t$cnv_phase\n");
      }
      else{ # loss
        #translate genome coordinates into cDNA coordinates
        record_snv("$target_parent\t",
                   ($non_coding ? "g.".($cnv_exon_start > $cnv_start ? "$cnv_exon_start-?" : $cnv_start)."_".($cnv_exon_end < $cnv_end ? "$cnv_exon_end+?" : $cnv_end) :
                   "c.$cnv_cdna_start".($off5?"-?":"")."_$cnv_cdna_end".($off3?"+?":"")),
                   "del\t$strand\t$cnv_chr\t$cnv_exon_start\t$cnv_exon_end\t", ($ploidy == 1 ? "heterozygote" : "homozygote"), "\t$p_value\tNA\tNA\t",
                   "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t$cnv_phase\n");
      }
    }
  }
  close(CNV);

}


#sort genes by start, then longest if tied
my %rc = qw(A T T A G C C G N N S W W S K M M K Y R R Y X X);
print STDERR "Processing variant calls..." unless $quiet;
%chr_read = ();
open(VCFIN, $input_file)
    or die "Cannot open $input_file for reading: $!\n";
while(<VCFIN>){
    if(/^\s*(?:#|$)/){ # blank or hash comment lines
       next;
    }
    chomp;
    my @fields = split /\t/, $_;

    next unless exists $feature_range{$fields[0]};
    if(not $quiet and not exists $chr_read{$fields[0]}){
	print STDERR " $fields[0]";
	$chr_read{$fields[0]} = 1;
	#print STDERR "(not in reference file!)" unless exists $feature_range{$fields[0]};
    }

    next if $fields[4] eq "<NON_REF>"; # GVCF background stuff
    next if $fields[9] eq "./." or $fields[9] eq "."; # no call
    my $chr = $fields[0];
    next if defined $which_chr and $chr ne $which_chr and $chr ne "chr$which_chr" and "chr$chr" ne $which_chr;
    print STDERR "passes chr and field # test" if grep /dataset_7684.dat/, @ARGV;
    $chr = "chr$chr" if $chr !~ /^chr/;
    $fields[8] =~ s/\s+$//;
    my @values = split /:/, $fields[9];
    #print STDERR join(" / ", @values), "\n" if $. == 84;
    my @keys = split /:/, $fields[8];
    my $zygosity = "n/a";
    my $quality = "n/a";
    my $read_depth = "n/a";
    my $variant_depths = "n/a";
    for(my $i = 0; $i <= $#keys and $i <= $#values; $i++){ # values max index check because some genotypers are nasty and don't provide as many fields as they say they will
	if($keys[$i] eq "GT"){
            if($values[$i] =~ /\./ or $values[$i] =~ /0\/0/){ # one genotype not described
              $zygosity = "none";
              last;
            }
	    else{ # genotypes described
              $zygosity = $values[$i] =~ /[02]/ ? "heterozygote" : "homozygote";
            }
	}
        elsif($keys[$i] eq "DNM_CONFIG" and $zygosity eq "n/a"){ # hack
            $zygosity = $values[$i] =~ /^(.)\1/ ? "homozygote" : "heterozygote";
        }
    	elsif($keys[$i] eq "GQ" and $values[$i] ne "."){
            #print "Checking GQ (index $i) $values[$i] gq2p\n" if $. == 84;
	    $quality = gq2p($values[$i]);
	}
    	elsif($keys[$i] eq "RD"){ # from some tools like denovo variant finders
	    $read_depth = $values[$i];
	}
    	elsif($keys[$i] eq "DP"){
	    $read_depth = $values[$i];
	}
        # the frequency of the variant can go by many names, here we have freebayes (A* are new and old versions) and atlas2_indel
    	elsif($keys[$i] eq "AA" or $keys[$i] eq "VR" or $keys[$i] eq "AO"){
	    $variant_depths = $values[$i];
	}
        # here we have GATK variant freq of form ref#,var#
    	elsif($keys[$i] eq "AD"){
	    $variant_depths = $values[$i];
            $variant_depths =~ s/^\d+,//;
	}
        else{
            #print STDERR "Ignoring field $keys[$i]\n";
        }
    }
    next if $zygosity eq "none"; # GVCF non-ref for example or when multiple samples are reported simultaneously
    $quality = z2p($1) if $fields[7] =~ /Z=(\d+\.\d+)/;
    if($quality eq "n/a" and $fields[5] ne "."){
      $quality = gq2p($fields[5]);
    }
    if($fields[5] eq "0" and $fields[6] eq "PASS"){ # not qual and a PASS in the filter column
      $quality = 1;
    }
    elsif($quality ne "n/a" and $quality > $min_pvalue){ # p-value cutoff
      #print "Checking call quality $fields[5]  gq2p\n" if $. == 84;
      next unless gq2p($fields[5]) <= $min_pvalue; # in some cases, programs like FreeBayes give low genotype quality such as when a call is borderline homo/het
                                                   # in these cases it would be silly to reject the call if their are many supporting reads.
    }

    # Some tools like GATK don't report number of variant reads...infer from other data if possible
    if($variant_depths eq "n/a"){
      my @key_value_pairs = split /;/, $fields[7];
      for my $key_value_pair (@key_value_pairs){
        if($key_value_pair !~ /^(.*?)=(.*)$/){
          next;
          #next if $key_value_pair eq "INDEL"; # samtools peculiarity
          #die "Key-value pair field (column #8) does not have the format key=value for entry $key_value_pair (line #$. of ), please fix the VCF file\n";
        }
        if($1 eq "AB"){ # GATK: for het calls, AB is ref/(ref+var), only one variant reported per line
          $variant_depths = "";
          for my $ab (split /,/, $2){
            $variant_depths .= int((1-$ab)*$read_depth).",";
          }
          chop $variant_depths;
        }
        elsif($1 eq "MLEAC"){
        }
        elsif($1 eq "DP4"){ # samtools: high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases
          my @rds = split /,/, $2;
          $variant_depths = $rds[2]+$rds[3];
          $read_depth = $rds[0]+$rds[1]+$variant_depths;
          if($fields[4] =~ /,/){ # samtools doesn't break down compound het calls into individual frequencies
            my $num_alt_genotypes = $fields[4] =~ tr/,/,/;
            $num_alt_genotypes++;
            my $even_prop = sprintf "%.2f", $variant_depths/$read_depth/$num_alt_genotypes;
            $variant_depths = join(",", ($even_prop) x $num_alt_genotypes);
            if(not exists $chr2caveats{"$chr:$fields[1]"}){
              $chr2caveats{"$chr:$fields[1]"} = "compound het var freq n/a in orig VCF file, auto set to $even_prop";
            }
            else{
              $chr2caveats{"$chr:$fields[1]"} .= "; compound het var freq n/a in orig VCF file, auto set to $even_prop";
            }
          }
        }
      }
    }
    if($variant_depths eq "n/a"){ # usually homo var calls, can only assume all reads are variant
      if($zygosity eq "homozygote"){
        $variant_depths = $read_depth;
        if($read_depth ne "n/a"){
          if(not exists $chr2caveats{"$chr:$fields[1]"}){
            $chr2caveats{"$chr:$fields[1]"} = "homo var freq n/a in orig VCF file, auto set to 1.0";
          }
          else{
            $chr2caveats{"$chr:$fields[1]"} = "; homo var freq n/a in orig VCF file, auto set to 1.0";
          }
        }
      }
      else{
        if($read_depth ne "n/a"){
          $variant_depths = int($read_depth/2);
          if(not exists $chr2caveats{"$chr:$fields[1]"}){
            $chr2caveats{"$chr:$fields[1]"} = "het var freq n/a in orig VCF file, auto set to 0.5";
          }
          else{
            $chr2caveats{"$chr:$fields[1]"} = "; het var freq n/a in orig VCF file, auto set to 0.5";
          }
        }
        else{
          $variant_depths = $read_depth;
        }
      }
    }

    my $target_parents = $feature_intervaltree{$chr}->fetch($fields[1]-$flanking_bases, $fields[1]+length($fields[3])+$flanking_bases);
    # Last ditch, if not in a gene model, is it at least in an enrichment region?
    if(not @$target_parents){
       next if not exists $enrichment_regions{$chr};
       my $regions_ref = $enrichment_regions{$chr};
       my $location = $fields[1];
       my $strand = "+"; # for lack of a better choice
       for(my $i = find_earliest_index($location-$flanking_bases, $regions_ref);
           $i < $#$regions_ref and $location <= $regions_ref->[$i]->[1]+$flanking_bases;
           $i++){
          next unless $regions_ref->[$i]->[0]-$flanking_bases <= $location and $regions_ref->[$i]->[1]+$flanking_bases >= $location;

          my $feature_name = "enrichment_target_$chr:".$regions_ref->[$i]->[0]."-".$regions_ref->[$i]->[1];
          $feature_type{$feature_name} = "misc_enrichment_kit_target";
          $feature_length{$feature_name} = $regions_ref->[$i]->[1]-$regions_ref->[$i]->[0]+1;
          my @variants = split /,/, $fields[4];
          my @variant_depths = split /,/, $variant_depths;
          my $ref = uc($fields[3]);
          for(my $j = 0; $j <= $#variants; $j++){
            my $variant = $variants[$j];
            next if $variant eq "<NON_REF>" or $variant_depths[$j] eq "0"; # GVCF stuff
            my $variant_depth = $variant_depths[$j];
            if($min_prop){
               next unless $variant_depth >= $min_depth and $read_depth ne "n/a" and $variant_depth/$read_depth >= $min_prop;
            }
            if(length($ref) == 1 and length($variant) == 1){ # SNP
              record_snv("$feature_name\tg.$location",
                         "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
                         join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
            }
            elsif(length($ref) == 1 and length($variant) > 1){ # insertion
              record_snv("$feature_name\tg.$location\_",($location+1),
                         "ins",substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
                         join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
            }
            elsif(length($variant) == 1 and length($ref) > 1){ # deletion
              record_snv("$feature_name\tg.$location\_",($location+length($ref)-1),
                         "del",substr($ref, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
                         join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
            }
            else{ # indel
              record_snv("$feature_name\tg.$location\_",($location+length($ref)-1),
                         "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
                         join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
            }
          } # end for variants
          next; # process next record, we've done all we can with a non-coding-gene SNP
       }
    }

    for my $target_parent (@$target_parents){

        print STDERR "Checking parent $target_parent for on $chr:$fields[1] $fields[3] -> $fields[4]\n" if grep /dataset_7684.dat/, @ARGV;
    my @feature_ranges = @{$feature_range{$chr}->{$target_parent}};
    # Calculate the position of the change within the feature range position
    my $strand = $feature_strand{$target_parent};
    my $trans_table = exists $feature_transl_table{$target_parent} ? $feature_transl_table{$target_parent} : $default_transl_table;
    $fields[4]=~tr/"//d; # sometime strangely surroundsed by quotes
    my @variants = split /,/, $fields[4];
    my @variant_depths = split /,/, $variant_depths;
    my @refs = (uc($fields[3])) x scalar(@variants);
    my @locations = ($fields[1]) x scalar(@variants);

    for(my $j = 0; $j <= $#variants; $j++){
	my $ref = $refs[$j];
	my $location = $locations[$j];
	my $feature_offset = 0;
	my $feature_num = 0;
	my $variant = uc($variants[$j]);
        next if $variant eq "<NON_REF>" or $variant_depths[$j] eq "0"; # GVCF stuff
	my $variant_depth = $variant_depths[$j] || "n/a";
        #print STDERR "Evaluating target parent $target_parent for $chr:$fields[1]-".($fields[1]+length($fields[3]))." -> ",join("/", @$target_parents) , "\n" if $fields[1] == 982994;

        # Break down MNPs into individual SNPs that are phased (TODO: skip if it's an inversion? would require amalgamating SNPs for tools that call them individually, phased :-P)
        if(length($variant) > 1 and length($variant) == length($ref)){
           my @subvariants;
           my @subrefs;
           my @sublocations;
           my $phase_range = $location."-".($location+length($ref)-1);
           for(my $k = 0; $k < length($variant); $k++){
             my $r = substr($ref, $k, 1);
             my $v = substr($variant, $k, 1);
             if($r ne $v){
               push @subvariants, $v;
               push @subrefs, $r;
               push @sublocations, $location+$k;
             }
             elsif(@variants == 1){
               next; # homo ref call
             }
             if($zygosity eq "heterozygote"){
               # need to ignore case where a homozygous call (variant or ref) is included in a double non-ref het MNP
               if(@variants > 1){
                 my $homo = 1;
                 for(my $l = 0; $l <= $#variants; $l++){ # using loop in case we handle oligoploid genomes in the future
                   if(length($variants[$l]) <= $k or substr($variants[$l], $k, 1) ne $v){
                     $homo = 0;
                     last;
                   }
                 }
                 next if $homo;
               }
               my $phase_key = "$chr:".($location+$k).":$v";
               my $phase_label = "M$j-$chr:$phase_range";
               if(exists $chr2phase{$phase_key}){
                 if($chr2phase{$phase_key} !~ /$phase_label/){
                   $chr2phase{$phase_key} .= ",$phase_label";
                 }
               }
               else{
                 $chr2phase{$phase_key} = $phase_label;
               }
             }
           }
           # recycle this MNP variant loop to start processing the individual SNPs
           splice(@refs, $j, 1, @subrefs);
           splice(@variants, $j, 1, @subvariants);
           splice(@locations, $j, 1, @sublocations);
           splice(@variant_depths, $j, 1, ($variant_depth) x scalar(@subvariants));
           $j--;
           next;
        }

        if($min_prop != 0 and $variant_depth eq "n/a" or $variant_depth eq "."){
          print STDERR "Could not parse variant depth from $_\n" unless $quiet;
          next;
        }
        next unless $min_prop == 0 or $min_prop and $variant_depth >= $min_depth and $read_depth ne "n/a" and $variant_depth/$read_depth >= $min_prop;
        if($zygosity eq "NA"){
          # make the call ourselves
          $zygosity = $variant_depths/$read_depth > 1-$min_prop ? "homozygote" : "heterozygote";
        }
	# e.g. chr22   47857671   . CAAAG   AAGAT,AAAAG    29.04  .
	if(length($variant) > 1 and length($variant) == length($ref)){
	    for(my $k = 0; $k < length($variant); $k++){
		my $fixed_variant = $variant;
		substr($fixed_variant, $k, 1) = substr($ref, $k, 1);
		if($fixed_variant eq $ref){ # single base difference at base k between the two
		    $ref = substr($ref, $k, 1);
		    $variant = substr($variant, $k, 1);
		    $location += $k;
		    last;
		}
	    }
	}

        # samtools reports indels with long common tails, causing non-canonical HGVS reporting and a problem when looking up the variant in dbSNP etc.
        # remove common tails to variant calls in order to try to rectify this
        else{
            while(length($ref) > 1 and length($variant) > 1 and substr($ref, length($ref)-1) eq substr($variant, length($variant)-1)){
                chop $ref; chop $variant;
            }
        }

        # See if a caveat should be added because the indel was in a polyhomomer region
        if(length($ref) > length($variant) and index($ref, $variant) == 0 and $fasta_index->fetch("$chr:".($location+1)."-".($location+length($ref)+1)) =~ /^([ACGT])\1+$/i){
          if(not exists $chr2caveats{"$chr:$location"}){
            $chr2caveats{"$chr:$location"} = "poly".uc($1)." region deletion";
          }
          elsif($chr2caveats{"$chr:$location"} !~ /poly/){
            $chr2caveats{"$chr:$location"} .= "; poly".uc($1)." region deletion";
          }
        }
        elsif(length($ref) < length($variant) and index($variant, $ref) == 0 and substr($variant, 1) =~ /^([ACGT])\1+$/i){
          if(not exists $chr2caveats{"$chr:$location"}){
            $chr2caveats{"$chr:$location"} .= "poly".uc($1)." region insertion";
          }
          elsif($chr2caveats{"$chr:$location"} !~ /poly/){
            $chr2caveats{"$chr:$location"} .= "; poly".uc($1)." region insertion";
          }
        }

        # Not a protein-coding gene?  Report genomic cooordinates for HGVS
        if(not exists $feature_cds_max{$target_parent} or not defined $feature_cds_max{$target_parent} or $feature_cds_max{$target_parent} eq ""){
            if(length($ref) == 1 and length($variant) == 1){ # SNP
              record_snv("$target_parent\tg.$location",
                         "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
                         join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
            }
            elsif(length($ref) == 1 and length($variant) > 1){ # insertion
              record_snv("$target_parent\tg.$location\_",($location+1),
                         "ins",substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
                         join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
            }
            elsif(length($variant) == 1 and length($ref) > 1){ # deletion
              record_snv("$target_parent\tg.$location\_",($location+length($ref)-1),
                         "del",substr($ref, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
                         join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
            }
            else{ # indel
              record_snv("$target_parent\tg.$location\_",($location+length($ref)-1),
                         "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
                         join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
            }
            next; # process next record, we've done all we can with a non-coding-gene SNP
        }

	if($strand eq "-"){
	    # set up utr offset for correct CDS coordinates
	    for(my $i = $#feature_ranges; $i >= 0; $i--){
		# exon is completely 5' of the start
		if($feature_ranges[$i]->[0] > $feature_cds_max{$target_parent}){
		    #print STDERR "Whole 5' UTR exon $i: ",$feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1,"\n";
		    $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1;
		}
		# exon with the cds start
		elsif($feature_ranges[$i]->[1] >= $feature_cds_max{$target_parent} and
		      $feature_ranges[$i]->[0] <= $feature_cds_max{$target_parent}){
		    #print STDERR "Start codon in exon $i: ", $feature_cds_max{$target_parent} - $feature_ranges[$i]->[1],"\n";
		    $feature_offset += $feature_cds_max{$target_parent} - $feature_ranges[$i]->[1];
		    last;
		}
		else{
		    die "The CDS for $target_parent (on negative strand) ends downstream ",
		    "($feature_cds_max{$target_parent}) of the an exon",
		    " (", $feature_ranges[$i]->[0], "), which is illogical.  Please revise the GFF file provided.\n";
		}
	    }
	    for(my $i = $#feature_ranges; $i >= 0; $i--){
		my $feature = $feature_ranges[$i];
		# in the 3' UTR region of the gene
		if($location < $feature_cds_min{$target_parent}){
		    my $feature_exon = 0;
		    $feature = $feature_ranges[$feature_exon];
		    while($location > $feature->[1]+$flanking_bases and
			  $feature_exon < $#feature_ranges){
			$feature = $feature_ranges[++$feature_exon]; # find the 3' utr exon in which the variant is located
		    }
		    # utr exons passed entirely
		    my $stop_exon = $feature_exon;
		    while($feature_ranges[$stop_exon]->[1] < $feature_cds_min{$target_parent}){
			$stop_exon++;
		    }
		    my $post_offset = $feature_cds_min{$target_parent}-$feature_ranges[$stop_exon]->[0]; # offset from the stop codon in the final coding exon
		    while($stop_exon > $feature_exon){
			$post_offset += $feature_ranges[$stop_exon]->[1]-$feature_ranges[$stop_exon]->[0]+1;
			$stop_exon--;
		    }

		    my $pos = $feature->[1]-$location+1+$post_offset;
                    my $junction_dist;
		    if($location < $feature->[0]){ # after a UTR containing exon? set as .*DD+DD
                        $junction_dist = ($feature->[0]-$location);
			$pos = ($post_offset+$feature->[1]-$feature->[0]+1)."+".$junction_dist;
		    }
		    elsif($location > $feature->[1]){ # before a total UTR exon? set as .*DD-DD
                        $junction_dist = -($location-$feature->[1]);
			$pos = ($post_offset+1).$junction_dist;
		    }
                    else{ # in the UTR exon
                        if($location - $feature->[0] < $feature->[1] - $location){ # location is closer to exon donor site
                            $junction_dist = -($location - $feature->[0]+1); # +1 for HGVS syntax compatibility (there is no position 0, direct skip from -1 to +1)
                        }
                        else{
                            $junction_dist = $feature->[1] - $location +1;
                        }
                    }
		    if(length($ref) == 1 and length($variant) == 1){
                        my $rc = join("",map({$rc{$_}} split(//,reverse($variant))));
			# 3' UTR SNP
			record_snv("$target_parent\tc.*$pos",
			"$rc{$ref}>$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			#"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			join("\t",prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n");
		    }
		    elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){
                        my $rc = join("",map({$rc{$_}} split(//,reverse(substr($variant,1)))));
			# 3' UTR insertion
			record_snv("$target_parent\tc.*",
			hgvs_plus($pos,-1),"_*",$pos,
			"ins$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			#"ins",substr($variant,1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n");
		    }
		    elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){
                        my $rc = join("",map({$rc{$_}} split(//,reverse($ref))));
                        my $delBases = substr($rc,0,length($rc)-1);
			if(length($ref) == 2){
			    # 3' UTR single base deletion
			    record_snv("$target_parent\tc.*",hgvs_plus($pos,-1),
			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n");
			}
			else{
			    # longer 3' UTR deletion
			    record_snv("$target_parent\tc.*",
			    hgvs_plus($pos,-length($ref)+1),"_*",hgvs_plus($pos, -1),
			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n");
			}
		    }
		    else{
			my $rc = join("",map({$rc{$_}} split(//,reverse($variant))));
			if($rc eq $ref and length($variant) > 3){
			    # 3' UTR inversion
			    record_snv("$target_parent\tc.*",
			    hgvs_plus($pos,-length($ref)+1),"_*",$pos,
			    "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n");
			    last;
			}

			# complex substitution in 3' UTR
			# Will break if stop codon is modified
			record_snv("$target_parent\tc.*",
			hgvs_plus($pos,-length($ref)+1),"_*", $pos,
			"delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n");
		    }
		    last;
		}
		# in the feature
		elsif($location >= $feature->[0] and $location <= $feature->[1]){
		    my $pos = $feature_offset+$feature->[1]-$location+1;
		    if($location > $feature_cds_max{$target_parent}){ #since there is no position 0, the pos is in UTR, subtract one
			$pos = hgvs_plus($pos, -1);
		    }
		    my $first_exon_base = $feature_offset+1;
                    my $exon_edge_dist = $feature->[1]-$location+1; # equiv of HGVS +X or -X intron syntax, but for exons
                    $exon_edge_dist = $feature->[0]-$location-1 if abs($feature->[0]-$location-1) < $exon_edge_dist; # pick closer of donor and acceptor sites
                    #print STDERR "Got ", $feature->[1]-$location+1, "vs. ", $feature->[0]-$location-1, ": chose $exon_edge_dist\n";
		    if(length($ref) == 1 and length($variant) == 1){
			# SNP
			record_snv("$target_parent\tc.",
			$pos,
			"$rc{$ref}>$rc{$variant}\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
                        ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
                        #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
		    }
		    elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){
			my $rc = join("",map({$rc{$_}} split(//,reverse($variant))));
                        my $insBases = substr($rc,1);
			# insertion
			record_snv("$target_parent\tc.",
			hgvs_plus_exon($pos, -1, $first_exon_base),"_",$pos,"ins$insBases",
			"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
                        ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
                        #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
		    }
		    elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){
			my $rc = join("",map({$rc{$_}} split(//,reverse($ref))));
                        my $delBases = substr($rc,0,length($rc)-1);
			# single nucleotide deletion
			if(length($ref) == 2){
			    record_snv("$target_parent\tc.",
			    hgvs_plus_exon($pos, -1, $first_exon_base),
			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
                            ($pos-1 < 1 ? "NA" : $pos-1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
                            #($pos-1 < 1 ? "NA" : $pos-1 < $first_exon_base ? "p.?" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
			}
			# longer deletion
			else{
                            $exon_edge_dist = $feature->[1]-$location-length($ref)+1 if abs($feature->[1]-$location-length($ref)+1) < $exon_edge_dist;
			    record_snv("$target_parent\tc.",
			    hgvs_plus_exon($pos, -length($ref)+1, $first_exon_base),"_",
			    hgvs_plus_exon($pos, -1, $first_exon_base),
			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
                            ($pos-1 < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
                            #($pos-1 < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
			}
		    }
		    else{
			# complex substitution
                        $exon_edge_dist = $feature->[1]-$location-length($ref)+1 if abs($feature->[1]-$location-length($ref)+1) < $exon_edge_dist;
			my $rc = join("",map({$rc{$_}} split(//,reverse($variant))));
                        if($rc eq $variant and length($variant) > 3){
                            # inversion
                            record_snv("$target_parent\tc.",
                                       hgvs_plus_exon($pos,-length($ref)+1,$first_exon_base),"_",
                                       $pos,
                                       "inv",
                                       "\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
                                       join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
                                       ($pos < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");

                            last;
                        }
			record_snv("$target_parent\tc.",
			hgvs_plus_exon($pos,-length($ref)+1,$first_exon_base),"_",
			$pos,
			"delins$rc",
			"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
                        ($pos < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
                        #($pos < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
		    }
		    last;
		}
		# 5' of feature (on negative strand)
		elsif($location > $feature->[1]){
		    if(length($ref) == 1 and length($variant) == 1){
			# intronic SNP
			if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){
			    # Closer to acceptor site
			    record_snv("$target_parent\tc.",$feature_offset+1,
			    ($feature->[1]-$location),
			    "$rc{$ref}>$rc{$variant}\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    #"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    join("\t", prop_info_key($chr,$location,$ref,$variant, $feature->[1]-$location)),"\tNA\n");
			}
			else{
			    # Closer to donor site
			    record_snv("$target_parent\tc.",$feature_offset,"+",
			    ($feature_ranges[$i+1]->[0]-$location),
			    "$rc{$ref}>$rc{$variant}\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    #"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    join("\t", prop_info_key($chr,$location,$ref,$variant, $feature_ranges[$i+1]->[0]-$location)),"\tNA\n");
			}
		    }
		    elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){
                        my $rc = join("",map({$rc{$_}} split(//,reverse(substr($variant,1)))));
			if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){
			    # intronic insertion near acceptor
			    my $pos = ($feature_offset+1).($feature->[1]-$location);
			    record_snv("$target_parent\tc.",
			    hgvs_plus($pos,-1),"_",$pos,
			    "ins",
			    $rc,"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    #substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    join("\t", prop_info_key($chr,$location,$ref,$variant,$feature->[1]-$location-1)),"\tNA\n");
			}
			else{
			    # intronic insertion near donor
			    my $pos = $feature_offset."+".($feature_ranges[$i+1]->[0]-$location);
			    record_snv("$target_parent\tc.",
			    hgvs_plus($pos,-1),"_",$pos,
			    "ins",
			    $rc,"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    #substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    join("\t", prop_info_key($chr,$location,$ref,$variant,$feature_ranges[$i+1]->[0]-$location+1)),"\tNA\n");
			}
		    }
		    elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){
			# intronic deletion
			# single nucleotide deletion
			my $rc = reverse($ref);
			$rc=~tr/ACGT/TGCA/;
                        my $delBases = substr($rc, 0, length($rc)-1);
			if(length($ref) == 2){
			    # single intronic deletion near acceptor
			    if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){
                                my $off = $feature->[1]-$location-1;
				record_snv("$target_parent\tc.",
				($feature_offset+1),$off,
				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off >= -2 ? "p.?" : "NA"),"\n");
			    }
			    # single intronic deletion near donor
			    else{
				my $pos = $feature_offset;
                                my $off = $feature_ranges[$i+1]->[0]-$location+1;
				record_snv("$target_parent\tc.",
				hgvs_plus_exon($pos, $off, $pos),
				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off <= 2 ? "p.?" : "NA"),"\n");
			    }
			}
			# longer deletion
			else{
			    if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){
				# long intronic deletion near acceptor
                                my $off = $feature->[1]-$location-1;
				my $pos = ($feature_offset+1).$off;
				record_snv("$target_parent\tc.",
				hgvs_plus($pos,-length($ref)+2),"_",$pos,
				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off >= -2 ? "p.?" : "NA"),"\n");
			    }
			    else{
				# long intronic deletion near donor
                                my $off = $feature_ranges[$i+1]->[0]-$location+1;
				my $pos = ($feature_offset)."+".$off;
				record_snv("$target_parent\tc.",
				$pos,"_",hgvs_plus($pos,-length($ref)-1),
				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off-length($ref)+1 <= 2 ? "p.?" : "NA"),"\n");
			    }
			    last;
			}
		    }
		    else{
			my $rc = reverse($ref);
			$rc=~tr/ACGT/TGCA/;
			if($rc eq $variant and length($variant) > 3){
			    # intronic inversion near acceptor site
			    if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){
				my $pos = ($feature_offset+1).($feature->[1]-$location);
				record_snv("$target_parent\tc.",
				hgvs_plus($pos,-length($ref)+1),"_",$pos,
				"inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
				join("\t", prop_info_key($chr,$location,$ref,$variant,$feature->[1]-$location)),"\tNA\n");
			    }
			    else{
				my $pos = ($feature_offset)."+".($feature_ranges[$i+1]->[0]-$location);
				record_snv("$target_parent\tc.",
				$pos,"_",hgvs_plus($pos, length($ref)-1),
				"inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
				join("\t", prop_info_key($chr,$location,$ref,$variant,$feature_ranges[$i+1]->[0]-$location)),"\tNA\n");
			    }
			    last;
			}
                        $rc = reverse($variant);
                        $rc=~tr/ACGT/TGCA/;
			# Intronic complex substitution
			if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){
			    # complex intronic substitution near acceptor site
			    my $pos = ($feature_offset+1).($feature->[1]-$location);
			    record_snv("$target_parent\tc.",
			    hgvs_plus($pos, -length($ref)+1),"_",$pos,
			    "delins$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    #"delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    join("\t", prop_info_key($chr,$location,$ref,$variant,$feature->[1]-$location)),"\tNA\n");
			}
			else{
			    # complex intronic substitution near donor site
			    my $pos = $feature_offset."+".($feature_ranges[$i+1]->[0]-$location);
			    record_snv("$target_parent\tc.",
			    $pos,"_",hgvs_plus($pos, length($ref)-1),
			    "delins$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    #"delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    join("\t", prop_info_key($chr,$location,$ref,$variant,$feature_ranges[$i+1]->[0]-$location)),"\tNA\n");
			}
		    }
		    last;
		}
		else{
		    #print STDERR "Offset going from ", $feature_offset, " to ", $feature_offset+$feature->[1]-$feature->[0]+1,"\n";
		    $feature_offset += $feature->[1]-$feature->[0]+1;
		    #print STDERR "Set feature offset to  $feature_offset by adding ",$feature->[1],"-", $feature->[0],"+1\n";
		}
	    }
	}
	else{
	    # forward strand

	    # set up utr offset for correct CDS coordinates
	    for(my $i = 0; $i <= $#feature_ranges; $i++){
		# All 5' utr exon
		if($feature_ranges[$i]->[1] < $feature_cds_min{$target_parent}){
		    $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1;
		}
		# exon with the cds start
		elsif($feature_ranges[$i]->[1] >= $feature_cds_min{$target_parent} and
		      $feature_ranges[$i]->[0] <= $feature_cds_min{$target_parent}){
		    $feature_offset -= $feature_cds_min{$target_parent} - $feature_ranges[$i]->[0];
		    last;
		}
		else{
		    die "The CDS for $target_parent starts upstream ($feature_cds_max{$target_parent}) of the first exon",
		    " (", $feature_ranges[$i]->[0], "), which is illogical.  Please revise the GFF file provided.\n";
		}
	    }
	    for(my $i = 0; $i <= $#feature_ranges; $i++){
		my $feature = $feature_ranges[$i];
		# 3' of last coding position
		if($location > $feature_cds_max{$target_parent}){
		    # find the exon with the stop codon
		    while($feature->[1] < $feature_cds_max{$target_parent}){
			$feature = $feature_ranges[++$i];
		    }
		    my $post_offset = $feature->[0]-$feature_cds_max{$target_parent};
		    while($location > $feature->[1]+$flanking_bases and
			  $i < $#feature_ranges){
			$post_offset += $feature->[1]-$feature->[0]+1;
			$feature = $feature_ranges[++$i]; # find the 3' utr exon in which the variant is located
		    }
		    my $pos = $location-$feature->[0]+$post_offset;
		    #print STDERR "Positive strand: got $pos for $location, exon #$i of $#feature_ranges, post_offset is $post_offset\n" if $location-$feature->[1] > $flanking_bases;
                    my $off;
		    if($location > $feature->[1]){ # after a UTR containing exon? set as .*DD+DD
                        $off = $location-$feature->[1];
			$pos = ($post_offset+$feature->[1]-$feature->[0]+1)."+".$off;
		    }
		    elsif($location < $feature->[0]){ # before a total UTR exon? set as .*DD-DD
                        $off = -($feature->[0]-$location);
			$pos = ($post_offset+1).$off;
		    }
                    else{
                        if($location-$feature->[0] < $feature->[1]-$location){
                           $off = $location-$feature->[0]+1; # +1 since HGVS skips right from -1 to +1 at exon boundary
                        }
                        else{
                           $off = $location-$feature->[1]-1; # will be negative
                        }
                    }
		    if(length($ref) == 1 and length($variant) == 1){
			# 3' UTR SNP
			record_snv("$target_parent\tc.*$pos",
			"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			join("\t", prop_info_key($chr,$location,$ref,$variant, $off)),"\tNA\n");
		    }
		    elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){
			# 3' UTR insertion
			record_snv("$target_parent\tc.*",
			hgvs_plus($pos,1),"_*",hgvs_plus($pos,2),
			"ins",substr($variant,1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			join("\t", prop_info_key($chr,$location,$ref,$variant, $off)),"\tNA\n");
		    }
		    elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){
                        my $delBases = substr($ref, 1);
			if(length($ref) == 2){
			    # 3' UTR single base deletion
			    record_snv("$target_parent\tc.*",hgvs_plus($pos,1),
			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\tNA\n");
			}
			else{
			    # longer 3' UTR deletion
			    record_snv("$target_parent\tc.*",
			    hgvs_plus($pos,1),"_*",hgvs_plus($pos,length($ref)-1),
			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    join("\t", prop_info_key($chr,$location,$ref,$variant, $off)),"\tNA\n");
			}
		    }
		    else{
			my $rc = reverse($ref);
			$rc=~tr/ACGT/TGCA/;
			if($rc eq $variant and length($variant) > 3){
			    # 3' UTR inversion
			    record_snv("$target_parent\tc.*$pos",
			    "_*",hgvs_plus($pos,length($ref)-1),
			    "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\tNA\n");
			    last;
			}
			# complex substitution in 3' UTR
			record_snv("$target_parent\tc.*$pos",
			"_*",hgvs_plus($pos,length($ref)-1),
			"delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\tNA\n");
		    }
		    last;
		}
		# in the exon
		elsif($location >= $feature->[0] and $location <= $feature->[1]){
		    my $pos = $feature_offset+$location-$feature->[0]+1;
		    my $last_exon_base = $feature_offset+$feature->[1]-$feature->[0]+1;
                    my $exon_edge_dist = $location-$feature->[0]+1; # equiv of HGVS +X or -X intron syntax, but for exons
                    $exon_edge_dist = $location-$feature->[1]-1 if abs($location-$feature->[1]-1) < $exon_edge_dist; # pick closer of donor and acceptor sites
                    #print STDERR "Got ", $location-$feature->[0]+1, "vs. ", $location-$feature->[1]-1, ": chose $exon_edge_dist\n";
		    if($location < $feature_cds_min{$target_parent}){ #since there is no position 0, the pos is in UTR, subtract one
			$pos = hgvs_plus($pos, -1);
		    }
		    if(length($ref) == 1 and length($variant) == 1){
			# SNP
			record_snv("$target_parent\tc.$pos",
			"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
                        ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
                        #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
		    }
		    elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){
			# insertion
			record_snv("$target_parent\tc.$pos",
			"_",hgvs_plus_exon($pos,1,$last_exon_base),"ins",
			substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
                        ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
                        #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
		    }
		    elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){
                        my $delBases = substr($ref, 1);
			# single nucleotide deletion
			if(length($delBases) == 1){
			    record_snv("$target_parent\tc.",
			    hgvs_plus_exon($pos,1,$last_exon_base),
			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
                            ($pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
                            #($pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
			}
			# longer deletion
			else{
                            $exon_edge_dist = $feature->[1]-$location-length($ref)-1 if abs($feature->[1]-$location-length($ref)-1) < $exon_edge_dist;
			    record_snv("$target_parent\tc.",
			    hgvs_plus_exon($pos,1,$last_exon_base),"_",
			    hgvs_plus_exon($pos,length($ref)-1,$last_exon_base),
			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
                            ($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
                            #($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
			}
		    }
		    else{
                        $exon_edge_dist = $feature->[1]-$location-length($ref)-1 if abs($feature->[1]-$location-length($ref)-1) < $exon_edge_dist;
			my $rc = reverse($ref);
			$rc=~tr/ACGT/TGCA/;
			if($rc eq $variant and length($variant) > 3){
			    # inversion
			    record_snv("$target_parent\tc.$pos",
			    "_",hgvs_plus_exon($pos,length($ref)-1, $last_exon_base),
			    "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
                            ($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
                            #($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
			    last;
			}
			# complex substitution
			record_snv("$target_parent\tc.$pos",
			"_",hgvs_plus_exon($pos, length($ref)-1, $last_exon_base),
			"delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
                        ($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
                        #($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
		    }
		    last;
		}
		# 5' of feature
		elsif($location < $feature->[0]){
		    if(length($ref) == 1 and length($variant) == 1){
			# intronic SNP
			if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){
			    # Closer to donor site
			    record_snv("$target_parent\tc.",$feature_offset,"+",
			    ($location-$feature_ranges[$i-1]->[1]),
			    "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n");
			}
			else{
			    # Closer to acceptor site
			    record_snv("$target_parent\tc.",$feature_offset+1,
			    ($location-$feature->[0]),
			    "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n");
			}
		    }
		    elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){
			if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){
			    # intronic insertion near donor
			    my $pos = $feature_offset."+".($location-$feature_ranges[$i-1]->[1]);
			    record_snv("$target_parent\tc.",
			    $pos,"_",hgvs_plus($pos,1),
			    "ins",
			    substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n");
			}
			else{
			    # intronic insertion near acceptor
			    my $pos = ($feature_offset+1).($location-$feature->[0]);
			    record_snv("$target_parent\tc.",
			    $pos,"_",hgvs_plus($pos,1),
			    "ins",
			    substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n");
			}
		    }
		    elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){
			# intronic deletion
			# single nucleotide deletion
                        my $delBases = substr($ref, 1);
			if(length($ref) == 2){
			    # single intronic deletion near donor
			    if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){
                                my $off = $location-$feature_ranges[$i-1]->[1]+1;
				record_snv("$target_parent\tc.",
				$feature_offset,"+",$off,
				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off <= 2 ? "p.?" : "NA"),"\n");
			    }
			    # single intronic deletion near acceptor
			    else{
				my $pos = ($feature_offset+1);
                                my $off = $location-$feature->[0];
				record_snv("$target_parent\tc.",
				hgvs_plus_exon($pos, $off, $pos),
				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off >= -2 ? "p.?" : "NA"),"\n");
			    }
			}
			# longer deletion
			else{
			    if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){
				# long intronic deletion near donor
                                my $off = $location-$feature_ranges[$i-1]->[1]+1;
				my $pos = $feature_offset."+".$off;
				record_snv("$target_parent\tc.",
				$pos,"_",hgvs_plus($pos,length($ref)-2),
				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off <= 2 ? "p.?" : "NA"),"\n");
			    }
			    else{
				# long intronic deletion near acceptor
                                my $off = $location-$feature->[0]+1;
				my $pos = ($feature_offset+1).$off;
				record_snv("$target_parent\tc.",
				$pos,"_",hgvs_plus($pos,length($ref)-2),
				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off+length($ref)-2 >= -2 ? "p.?" : "NA"),"\n");
			    }
			}
		    }
		    else{
			my $rc = reverse($ref);
			$rc=~tr/ACGT/TGCA/;
			if($rc eq $variant and length($variant) > 3){
			    # intronic inversion near donor site
			    if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){
				my $pos = $feature_offset."+".($location-$feature_ranges[$i-1]->[1]);
				record_snv("$target_parent\tc.",
				$pos,"_",hgvs_plus($pos,length($ref)-1),
				"inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
				join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n");
			    }
			    else{
				my $pos = ($feature_offset+1).($location-$feature->[0]);
				record_snv("$target_parent\tc.",
				$pos,"_",hgvs_plus($pos, length($ref)-1),
				"inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
				join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n");
			    }
			    last;
			}
			# Intronic complex substitution
			# Note: sub maybe have  comma in it to denote two possible variants
			if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){
			    # complex intronic substitution near donor site
			    my $pos = $feature_offset."+".($location-$feature_ranges[$i-1]->[1]);
			    record_snv("$target_parent\tc.",
			    $pos,"_",hgvs_plus($pos, length($ref)-1),
			    "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n");
			}
			else{
			    # complex intronic substitution near acceptor site
			    my $pos = ($feature_offset+1).($location-$feature->[0]);
			    record_snv("$target_parent\tc.",
			    $pos,"_",hgvs_plus($pos, length($ref)-1),
			    "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
			    join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n");
			}
		    }
		    last;
		}
		else{
		    # feature is past this exon
		    $feature_offset += $feature->[1]-$feature->[0]+1;
		}
	    }
	}
    }  # for each variant on the line
  } # for each  gene overlapping the site the VCF describes
} # for each VCF line
print STDERR "\n" unless $quiet;
close(VCFIN);

# Before we can start printing the variants, we need to look at the phase information and calculate the real haplotype HGVS changes
#if(keys %chr2phase){
  # Note that we could have samtools read-backed haplotype info, MNPs in the VCF, and pre-existing haplotypes in the input VCF (e.g. imputed or based on Mendelian inheritance where trios exist)
  # We need to create new disjoint sets of phased blocks from the (consistent) union of these data.
#  my $chr2phase2variants = combine_phase_data(\%chr2phase);

  # TODO: Calculate protein HGVS syntax for each variant, now that all phase data has been incorporated
  #for my $chr (keys %$chr2phase2variants){
  #  for my $phase (keys %{$chr2phase2variants{$chr}){
  #    # apply all phased changes to the reference chromosomal seq
  #    my $phased_seq = $seq{$chr}; #reference
  #    # sort the variants from 3' to 5' so that edits after indels don't need adjustment in their ref coordinate
  #    my @sorted_variants = sort {my($a_pos) = $a =~ /:(\d+):/; my($b_pos) = $b =~ /:(\d+):/; $b_pos <=> $a_pos} @{$chr2phase2variants{$chr}->{$phase}};
  #    for my $variant (@sorted_variants){
  #    }
  #  }
  #}
#}

# retrieve the MAF info en masse for each chromosome, as this is much more efficient
my @out_lines;
for my $snv (@snvs){
  chomp $snv;
  my @fields = split /\t/, $snv;
  # For CNVs, all the fields are already filled out
  if(@fields > 13){
    push @out_lines, join("\t", $feature_type{$fields[0]}, ($fields[0] =~ /\S/ ? $feature_length{$fields[0]} : "NA"), @fields);
    next;
  }
  my $variant_key = $fields[9];
  $fields[9] = join("\t", prop_info($dbsnp,$internal_snp,$variant_key));
  my $from = $fields[4];
  my $chr_pos_key = $fields[3].":".$from;
  my $to = $fields[4]; # true for SNPs and insertions
  my @variant_key = split /:/, $variant_key;
  # For deletions and complex variants, calculate the affected reference genome range and set the 'to'
  if(length($variant_key[2]) > 1){
    $to += length($variant_key[2])-1;
  }
  splice(@fields, 5, 0, $to);

  # Otherwise expand the key into the relevant MAF values
  $fields[0] =~ s/\/chr.*$//;  # some transcript ids are repeated... we expect "id/chr#" in the GTF file to distinguish these, but should get rid of them at reporting time
  # the offset from the nearest exon border if coding
  push @fields, ($#variant_key > 3 ? $variant_key[4] : "");
  # add gene name(s)
  push @fields, range2genes($fields[3], $from, $to+1);
  # add caveats
  my $c = $fields[3];
  if(not exists $chr2mappability{$c}){
    if($c =~ s/^chr//){
      # nothing more
    }
    else{
      $c = "chr$c";
    }
  }
  my $mappability_caveats = exists $chr2mappability{$c} ? $chr2mappability{$c}->fetch($fields[4], $fields[4]+1) : [];
  if(ref $mappability_caveats eq "ARRAY" and @$mappability_caveats){
    my %h;
    if(exists $chr2caveats{$chr_pos_key}){
      if($chr2caveats{$chr_pos_key} !~ /non-unique/){
         $chr2caveats{$chr_pos_key} = join("; ", grep {not $h{$_}++} @$mappability_caveats)."; ".$chr2caveats{$chr_pos_key};
      }
    }
    else{
      $chr2caveats{$chr_pos_key} = join("; ", grep {not $h{$_}++} @$mappability_caveats);
    }
  }
  push @fields, (exists $chr2caveats{$chr_pos_key} ? $chr2caveats{$chr_pos_key} : "");
  # add phase
  push @fields, find_phase($variant_key);
  push @out_lines, join("\t", $feature_type{$fields[0]}, $feature_length{$fields[0]}, @fields);
}

# Now tabulate the rare variant numbers
my %gene2rares;
my %gene2aa_rares;
for my $line (@out_lines){
  my @F = split /\t/, $line, -1;
  if($F[15] eq "NA" or $F[15] < $rare_variant_prop and (!$internal_snp or $F[17] < $rare_variant_prop)){
    my $gene_list = $internal_snp ? $F[20] : $F[19];
    next unless defined $gene_list;
    for my $g (split /; /, $gene_list){
      $gene2rares{$g}++;
      # Check the cDNA HGVS syntax for relevance
      if($F[3] =~ /c.\d+/ or # coding
         $F[3] =~ /c.\d+.*-[12]/ or # splicing acceptor
         $F[3] =~ /c.\d+\+[12345]/){ # splicing donor
        $gene2aa_rares{$g}++;
      }
    }
  }
}

# Print the results
for my $line (@out_lines){
  my @F = split /\t/, $line, -1;
  my $gene_list = $internal_snp ? $F[20] : $F[19];
  if(not defined $gene_list){
    print OUT join("\t", @F, "", ""), "\n"; next;
  }

  my $max_gene_rare = 0;
  my $max_gene_aa_rare = 0;
  for my $g (split /; /, $gene_list){
    next unless exists $gene2rares{$g};
    if($gene2rares{$g} > $max_gene_rare){
      $max_gene_rare = $gene2rares{$g};
    }
    next unless exists $gene2aa_rares{$g};
    if($gene2aa_rares{$g} > $max_gene_aa_rare){
      $max_gene_aa_rare = $gene2aa_rares{$g};
    }
  }
  print OUT join("\t", @F, $max_gene_rare, $max_gene_aa_rare), "\n";
}
close(OUT);

sub range2genes{
  my ($c, $from, $to) = @_;
  if(not exists $gene_ids{$c}){
    if($c =~ s/^chr//){
      # nothing more
    }
    else{
      $c = "chr$c";
    }
  }
  if(exists $gene_ids{$c}){
    my %have;
    return join("; ", grep {not $have{$_}++} @{$gene_ids{$c}->fetch($from, $to+1)});
  }
  else{
    return "";
  }
}
sub combine_phase_data{
  my ($phases) = @_; # map from variant to its phase data

  # Create a reverse mapping from phase regions to their variants
  my %chr2phase_region2variants;
  my @variants = keys %$phases;
  for my $variant (@variants){
    my ($chr) = $variant =~ /^\S+?-(\S+):/;
    $chr2phase_region2variants{$chr} = {} if not exists $chr2phase_region2variants{$chr};
    for my $phase_region (split /,/, $phases->{$variant}){
      $chr2phase_region2variants{$chr}->{$phase_region} = [] if not exists $chr2phase_region2variants{$chr}->{$phase_region};
      push @{$chr2phase_region2variants{$phase_region}}, $variant;
    }
  }

  # Now for each phased block known so far, see if any variant in it is also part of another block
  # If so, do a union since phasing is both transitive and commutative.
  # The quickest way to do this is to check for overlapping intervals, then check for common members amongst those that do overlap
  for my $chr (keys %chr2phase_region2variants){
    my @ordered_phase_regions = sort {my($a_pos) = $a =~ /:(\d+)/; my($b_pos) = $b =~ /:(\d+)/; $a_pos <=> $b_pos} keys %{$chr2phase_region2variants{$chr}};
    my $sets = new DisjointSets(scalar(@ordered_phase_regions));

    for (my $i = 0; $i < $#ordered_phase_regions; $i++){
      my ($start, $stop, $variant) = $ordered_phase_regions[$i];
      for (my $j = $i+1; $j <= $#ordered_phase_regions; $j++){
        my ($start2, $stop2, $variant2) = $ordered_phase_regions[$j];
        if($start2 > $stop){ # won't overlap any regions after this in the sorted list
          last;
        }
        # If we get here, it is implicit that $stop >= $start2 and $start < $stop2, i.e. there is overlap
        # Now check if there is a shared variant (otherwise we might erroneously join blocks from different physical chromosomal arms)
        my $have_shared_variant = 0;
        my $overlapping_phase_region = $ordered_phase_regions[$j];
        for my $variant (@{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$i]}}){
          if($phases->{$variant} =~ /\b$overlapping_phase_region\b/){
            $have_shared_variant = 1; last;
          }
        }
        # sanity check that there aren't conflicting variants in the new block (i.e. two different variants in the same position)
        my %pos2base;
        my $have_conflicting_variant = 0;
        for my $variant (@{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$i]}}, @{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$j]}}){
          my ($pos, $base) = $variant =~ /(\d+):(.+?)$/;
          if(exists $pos2base{$pos} and $pos2base{$pos} ne $base){
            # conflict, note with a caveat
            if(exists $chr2caveats{"$chr:$pos"}){
              $chr2caveats{"$chr:$pos"} .= "; inconsistent haplotype phasing" unless $chr2caveats{"$chr:$pos"} =~ /inconsistent haplotype phasing/;
            }
            else{
              $chr2caveats{"$chr:$pos"} = "inconsistent haplotype phasing";
            }
            $have_conflicting_variant ||= 1;
          }
          elsif(not exists $pos2base{$pos}){
            $pos2base{$pos} = $base;
          }
        }

        $sets->union($i+1, $j+1) if $have_shared_variant and not $have_conflicting_variant; # indexes are one-based for sets rather than 0-based
      }
    }
    my $phase_sets = $sets->sets; #disjoint haplotype sets
    my %region_counts;
    for my $phase_set (@$phase_sets){
      next if scalar(@$phase_set) == 1; # No change to existing phase region (is disjoint from all others)
      # Generate a new haploblock to replace the old ones that are being merged
      my $merged_start = 10000000000;
      my $merged_end = 0;
      for my $ordered_phase_region_index (@$phase_set){
        my ($start, $end) = $ordered_phase_regions[$ordered_phase_region_index-1] =~ /(\d+)-(\d+)$/;
        $merged_start = $start if $start < $merged_start;
        $merged_end = $end if $end > $merged_end;
      }
      # At the start of the region is a unique prefix so we can tell the arms apart if two haploblocks have the exact same boundary
      my $region_count = $region_counts{"$merged_start-$merged_end"}++;
      my $merged_haploblock_name = "Y$region_count-$chr:$merged_start-$merged_end";
      # Assign this new name to overwrite the premerge values for each variant contained within
      for my $ordered_phase_region_index (@$phase_set){
        for my $variant (@{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$ordered_phase_region_index-1]}}){ # incl. one-based set correction in 0-based array index
          print STDERR "Merging $variant from ", $phases->{$variant}, " into new block $merged_haploblock_name\n";
          $phases->{$variant} = $merged_haploblock_name;
        }
      }
    }
    # TODO: if there are overlapping phase blocks still, but with different variants in the same position, we can infer that they are on the opposite strands...
  }
}

# Sees if the positions of the variant are in the range of a phased haplotype, returning which allele it belongs to
sub find_phase{
  my ($chr,$pos,$ref,$variant) = split /:/, $_[0];
  return "" if length($ref) != length($variant); # Can only deal with SNPs (and broken down MNPs) for now
  for(my $i = 0; $i < length($ref); $i++){
    my $key = "$chr:".($pos+$i).":".substr($variant, $i, 1);
    #print STDERR "Checking phase for $key\n" if $pos == 12907379;
    if(exists $chr2phase{$key}){
      #print STDERR "returning phase data $chr2phase{$key}\n";
      return $chr2phase{$key};
    }
    elsif(exists $chr2phase{"chr".$key}){
      #print STDERR "returning phase data ", $chr2phase{"chr".$key}, "\n";
      return $chr2phase{"chr".$key};
    }
  }
  return "";
}

sub find_earliest_index{
  # employs a binary search to find the smallest index that must be the starting point of a search of [start,end] elements sorted in an array by start
   my ($query, $array) = @_;

   return 0 if $query < $array->[0]->[0];

   my ($left, $right, $prevCenter) = (0, $#$array, -1);

   while(1){
      my $center = int (($left + $right)/2);

      my $cmp = $query <=> $array->[$center]->[0] || ($center == 0 || $query != $array->[$center-1]->[0] ? 0 : -1);

      return $center if $cmp == 0;
      if ($center == $prevCenter) {
         return $left;
      }
      $right = $center if $cmp < 0;
      $left  = $center if $cmp > 0;
      $prevCenter = $center;
   }
}
author	Yusuf Ali <ali@yusuf.email>
date	Wed, 25 Mar 2015 15:49:28 -0600
parents
children