Mercurial > repos > yusuf > poor_gene_coverage

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/PoorGeneCoverage.xml	Wed Mar 25 15:49:28 2015 -0600
@@ -0,0 +1,30 @@
+<?xml version="1.0"?>
+
+<tool id="poor_gene_coverage" name="Report Poor Coverage">
+  <description>regions for a set of genes</description>
+  <version_string>echo 1.0.0</version_string>
+  <command interpreter="perl">annotate_low_coverage $__tool_data_path__  ${captureKitRegions} $poorCaptureRegions $outputTable
+     #if $targetGenes.dataset and str($targetGenes) > '':
+       $targetGenes
+     #else:
+       /dev/null
+     #end if
+     NM_
+  </command>
+  <inputs>
+    <param name="sampleName" type="text" label="Biological sample name" help="Will be used to name the output files"/>
+    <param name="captureKitRegions" type="select" display="radio" dynamic_options="kit_fileOptions()" label="The enrichment (capture) kit used for the sequencing experiment"/>
+    <param format="bed" name="poorCaptureRegions" type="data" label="BED file of low coverage regions"/>
+    <param format="text" name="targetGenes" type="data" optional="true" label="List of genes to report, one per line"/>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="outputTable"/>
+  </outputs>
+
+  <!-- the following code populates the  selection from the public capture kit BED datasets available in the local Galaxy installation -->
+  <code file="capture_kits.py"/>
+  <tests/>
+
+  <help>This tools summarized low coverage regions on a per gene basis, including information on clinically relevant variants (according to ClinVar) that might have been missed</help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/add_names_to_bed	Wed Mar 25 15:49:28 2015 -0600
@@ -0,0 +1,70 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use Set::IntervalTree;
+
+# This script adds names to the first BED file regions, based on the names for regions provided
+# by the second BED file.  The output BED file is the same as the first BED file, but with names added where possible.
+@ARGV == 4 or die "Usage: $0 <unnamed input regions.bed> <named input regions.bed> <renamed output.bed> <output log.txt>\n";
+
+my %names;
+open(NAMED_BED, $ARGV[1])
+  or die "Cannot open $ARGV[1] for reading: $!\n";
+while(<NAMED_BED>){
+  my @F = split /\t/, $_;
+  next unless @F > 3;
+  $names{$F[0]} = Set::IntervalTree->new() if not exists $names{$F[0]};
+  $names{$F[0]}->insert($F[3], $F[1], $F[2]);
+}
+close(NAMED_BED);
+die "The BED file $ARGV[1] did not contain any named regions, aborting renaming procedure\n" if not keys %names;
+
+open(INPUT_BED, $ARGV[0])
+  or die "Cannot open $ARGV[0] for reading: $!\n";
+open(OUTPUT_BED, ">$ARGV[2]")
+  or die "Cannot open $ARGV[2] for writing: $!\n";
+my $num_datalines = 0;
+my $num_changes = 0;
+while(<INPUT_BED>){
+  chomp;
+  my @F = split /\t/, $_;
+  if(@F < 3){
+    # not a data line
+    print OUTPUT_BED $_, "\n";
+    next;
+  }
+  $num_datalines++;
+  my $chr = $F[0];
+  if(not exists $names{$chr}){ # must be a contig that was in the named BED file
+    if(exists $names{"chr$chr"}){
+      $chr = "chr$chr";
+    }
+    elsif($chr =~ /^chr(\S+)/ and exists $names{$1}){
+      $chr = $1;
+    }
+    else{
+      next;
+    }
+  }
+  my $renames = $names{$chr}->fetch($F[1], $F[2]+1);
+  if(not @$renames){
+    print OUTPUT_BED $_,"\n"; # no mod
+  }
+  else{
+    $num_changes++;
+    my %h;
+    my @renames = grep {not $h{$_}++} @$renames; # just get unique set of names
+    print OUTPUT_BED join("\t", @F[0..2], join("/", @renames), @F[4..$#F]), "\n"; # with new name
+  }
+}
+close(OUTPUT_BED);
+close(INPUT_BED);
+
+if(open(LOG, ">>$ARGV[3]")){
+  print LOG "Changed $num_changes/$num_datalines lines\n";
+}
+else{
+  print "Could not append to log file $ARGV[3]: writing to stdout instead\nChanged $num_changes/$num_datalines lines\n";
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/annotate_low_coverage	Wed Mar 25 15:49:28 2015 -0600
@@ -0,0 +1,194 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use File::Basename;
+#@ARGV >= 4 and @ARGV <= 6 or die "Usage: $0 <capture kit.bed> <poor regions input.bed> <annotated output.hgvs.txt> [target gene list.txt] [gene name pattern for detail reporting]\n";
+
+my $dirname = dirname(__FILE__);
+my $tool_dir = shift @ARGV;
+
+#read config file
+if(not -e "$tool_dir/report_poor_converage" ){
+  system("cp $dirname/tool-data/report_poor_coverage $tool_dir/report_poor_coverage");
+}
+my %configuration_file;
+open FILE, '<', "$tool_dir/report_poor_coverage";
+while(<FILE>){
+	(my $key, my $value) = split(/\s+/,$_);
+	$configuration_file{$key} = $value;
+}
+close FILE;
+my $ref_fasta = $configuration_file{"ref_fasta"};
+my $ref_flat = $configuration_file{"ref_flat"};
+my $ref_gencode = $configuration_file{"ref_gencode"};
+my $clinVar = $configuration_file{"clinVar"};
+my $dgv2 = $configuration_file{"dgv2"};
+
+my $capture_kit_bed = $configuration_file{"capturekits_directory"} . (shift @ARGV) . ".bed" ;
+my $low_input_bed = shift @ARGV;
+my $low_output_hgvs = shift @ARGV;
+my $target_list = @ARGV ? shift @ARGV : undef;
+my $gene_name_regex = @ARGV ? shift @ARGV : undef;
+my $tmp_hgvs = "$$.tmp.hgvs.txt";
+
+my $tmp_bed;
+if(defined $target_list and $target_list ne "/dev/null"){
+  $tmp_bed = "$$.targeted.tmp.bed";
+  #print STDERR "filter_by_list false $low_input_bed $target_list $tmp_bed 1 3\n";
+  system "$dirname/add_names_to_bed $low_input_bed $ref_flat - /dev/null | $dirname/filter_by_list false - $target_list $tmp_bed 1 3"; # 3 indicates 4th column of the bed file, which should have the gene name
+  $low_input_bed = $tmp_bed;
+  open(TARGETS, "$dirname/add_names_to_bed $capture_kit_bed $ref_flat - /dev/null | $dirname/filter_by_list false - $target_list - 1 3 |")
+    or die "Cannot run filter_by_list, aborting: $!\n";
+}
+else{
+  open(TARGETS, "$dirname/add_names_to_bed $capture_kit_bed $ref_flat - /dev/null |")
+    or die "Cannot run add_names_to_bed on $capture_kit_bed: $!\n";
+}
+
+#print STDERR "/export/achri_galaxy/galaxy-dist/tools/achri/vcf2hgvs_table -q -p 0.05 -c $low_input_bed -g /export/achri_galaxy/dbs/DGV2/hg19.2012-03-29.txt.gz -o - -r /export/achri_galaxy/dbs/hg19.fa -e /export/achri_galaxy/dbs/hg19_refGene_gencode_ultrasensitive.gtf -i /dev/null -b /export/achri_galaxy/dbs/hg19_refFlat_2014-07-10.bed | /export/achri_galaxy/galaxy-dist/tools/achri/filter_by_index_gamma /export/achri_galaxy/dbs/ClinVar/ClinVarFullRelease_2014-03.xml. ClinVar - $tmp_hgvs \"\"";
+system "$dirname/vcf2hgvs_table -q -p 0.05 -c $low_input_bed -g $dgv2 -o - -r $ref_fasta -e $ref_gencode -i /dev/null -b $ref_flat | $dirname/filter_by_index_gamma $clinVar. ClinVar - $tmp_hgvs \"\"";
+my $tmp_output = "$$.tmp.annotated.hgvs.txt";
+#system "/export/achri_galaxy/galaxy-dist/tools/achri/hgvs_table_annotate /export/achri_galaxy/dbs/sift/hg19 /export/achri_galaxy/dbs/polyphen2/hg19.txt.gz /export/achri_galaxy/dbs/gerp/hg19 /export/achri_galaxy/dbs/TissueDistributionDBs/human.v2009-07-30.tab /export/achri_galaxy/dbs/pathways/KEGG.human.2012-09-25.txt /export/achri_galaxy/dbs/interpro_supermatch_hg19.bed $tmp_hgvs $tmp_output /export/achri_galaxy/dbs/hg19.fa";
+open(OUT, ">$low_output_hgvs")
+  or die "Cannot open $low_output_hgvs for writing: $!\n";
+open(COLLAPSE, "$dirname/hgvs_collapse_transcripts $tmp_hgvs - 1 |")
+  or die "cannot run hgvs_collapse_transcripts: $!\n";
+my $header = <COLLAPSE>;
+chomp $header;
+my @headers = split /\t/, $header;
+my ($chr_column, $from_column, $to_column, $ftype_column, $clinvar_column, $loc_column, $maf_column, $gene_column, $gene_desc_column, $cdna_hgvs_column, $transcript_column, $zygosity_column, $phase_column);
+my %req_columns = (
+        "Chr" => \$chr_column,
+        "DNA From" => \$from_column,
+        "DNA To" => \$to_column,
+        "Feature type" => \$ftype_column,
+#        "Variant context" => \$loc_column,
+        "Pop. freq." => \$maf_column,
+        "Gene Name" => \$gene_column,
+#        "Gene Function" => \$gene_desc_column,
+        "ClinVar Text Matches" => \$clinvar_column,
+        "Transcript HGVS" => \$cdna_hgvs_column,
+        "Selected transcript" => \$transcript_column,
+        "Zygosity" => \$zygosity_column,
+        "Phase" => \$phase_column);
+&load_header_columns(\%req_columns, \@headers);
+my %col2name = reverse %req_columns;
+
+my %target_sizes;
+<TARGETS>; # header
+while(<TARGETS>){
+  chomp;
+  my @F = split /\t/, $_;
+  next unless $#F >= 3; # nameless records
+  for my $gene_name (split /;\s*/, $F[3]){
+    #print STDERR "Processing $_\n";
+    $target_sizes{$gene_name} += $F[2]-$F[1]+1; # assumes non-overlapping targets in BED file
+  }
+}
+close(TARGETS);
+my $target_total = 0;
+for my $gene_name (keys %target_sizes){
+  #print STDERR "Adding size of $gene_name ($target_sizes{$gene_name})\n";
+  $target_total += $target_sizes{$gene_name};
+}
+
+my %missing_homo;
+my %missing_coding_homo;
+my %missing_het;
+my %missing_coding_het;
+my @detail_lines;
+while(<COLLAPSE>){
+  chomp;
+  my @F = split /\t/, $_, -1;
+
+  for my $gene_name (split /;\s*/, uc($F[$gene_column])){
+    next if not exists $target_sizes{$gene_name}; # not a gene of interest
+    #print STDERR "Processing $_\n";
+    next if defined $gene_name_regex and $F[$transcript_column] !~ /$gene_name_regex/o;
+    #print STDERR "Passes filters\n";
+    if(not exists $missing_homo{$gene_name}){
+      $missing_homo{$gene_name} = {};
+      $missing_coding_homo{$gene_name} = {};
+      $missing_het{$gene_name} = {};
+      $missing_coding_het{$gene_name} = {};
+    }
+
+    if($F[$zygosity_column] =~ /homo/){
+      for my $pos ($F[$from_column]..$F[$to_column]){
+        $missing_homo{$gene_name}->{$pos}++;
+        $missing_coding_homo{$gene_name}->{$pos}++ if $F[$ftype_column] =~ /protein_coding/;# and $F[$loc_column] =~ /exon/;
+      }
+    }
+    else{
+      for my $pos ($F[$from_column]..$F[$to_column]){
+        $missing_het{$gene_name}->{$pos}++;
+        $missing_coding_het{$gene_name}->{$pos}++ if $F[$ftype_column] =~ /protein_coding/;# and $F[$loc_column] =~ /exon/;
+        #print STDERR "Missing $gene_name het $pos ($F[$from_column]..$F[$to_column]) $F[$ftype_column] $F[$loc_column]\n";
+      }
+    }
+    next unless $F[$ftype_column] =~ /protein_coding/;# and $F[$loc_column] =~ /exon/;
+    #push @detail_lines, [$F[$chr_column], $F[$from_column], $F[$to_column], $F[$maf_column], $gene_name, $F[$cdna_hgvs_column], $F[$transcript_column], ($F[$zygosity_column] =~ /homo/ ? "<4x" : "<20x"), $F[$gene_desc_column], $F[$domain_column], $F[$gene_column]];
+    $F[$clinvar_column] = "" if not defined $F[$clinvar_column];
+    $F[$clinvar_column] =~ s/&gt;/>/;
+    push @detail_lines, [$F[$chr_column], $F[$from_column], $F[$to_column], $F[$maf_column], $gene_name, $F[$cdna_hgvs_column], $F[$transcript_column], ($F[$zygosity_column] =~ /homo/ ? "<4x" : "<20x"), $F[$gene_column], $F[$clinvar_column]];
+  }
+}
+my $missing_homo = 0;
+my $missing_coding_homo = 0;
+my $missing_het = 0;
+my $missing_coding_het = 0;
+for my $gene_name (keys %target_sizes){
+  $missing_homo{$gene_name} = defined $missing_homo{$gene_name} ? keys %{$missing_homo{$gene_name}} : 0;
+  $missing_homo += $missing_homo{$gene_name};
+  $missing_coding_homo{$gene_name} = defined $missing_coding_homo{$gene_name} ? keys %{$missing_coding_homo{$gene_name}} : 0;
+  $missing_coding_homo += $missing_coding_homo{$gene_name};
+  $missing_het{$gene_name} = defined $missing_het{$gene_name} ? keys %{$missing_het{$gene_name}} : 0;
+  $missing_het += $missing_het{$gene_name};
+  $missing_coding_het{$gene_name} = defined $missing_coding_het{$gene_name} ? keys %{$missing_coding_het{$gene_name}} : 0;
+  $missing_coding_het += $missing_coding_het{$gene_name};
+}
+if(defined $gene_name_regex){
+  print OUT "Note: only considering coding sequence target bases in gene models with IDs matching \"$gene_name_regex\"\n";
+}
+print OUT "\ttotal size\tcoding <4x coverage\t%\tcoding<20x coverage\t%\ttotal <4x coverage\t%\ttotal <20x coverage\n";
+printf OUT "Total target gene bases designed for capture\t%d\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\n", $target_total, $missing_coding_homo, $missing_coding_homo/$target_total*100, $missing_coding_het, $missing_coding_het/$target_total*100,
+            $missing_homo, $missing_homo/$target_total*100, $missing_het, $missing_het/$target_total*100;
+print OUT "Per gene missing coding sequence coverage breakdown:\n";
+for my $gene_name (sort keys %missing_homo){
+  printf OUT "%s\t%d\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\t%d\t%.1f\n", $gene_name, $target_sizes{$gene_name}, $missing_coding_homo{$gene_name},
+            $missing_coding_homo{$gene_name}/$target_sizes{$gene_name}*100, $missing_coding_het{$gene_name}, $missing_coding_het{$gene_name}/$target_sizes{$gene_name}*100,
+            $missing_homo{$gene_name}, $missing_homo{$gene_name}/$target_sizes{$gene_name}*100,
+            $missing_het{$gene_name}, $missing_het{$gene_name}/$target_sizes{$gene_name}*100;
+}
+# Print revised header that's only a few of the fields
+@detail_lines = sort {$a->[4] cmp $b->[4] or $a->[1] <=> $b->[1]} @detail_lines;
+print OUT "\n\nCoding region low coverage details (alphabetical by gene name):\n";
+print OUT join("\t", $col2name{\$chr_column}, $col2name{\$from_column}, $col2name{\$to_column}, $col2name{\$maf_column}, $col2name{\$gene_column}, $col2name{\$cdna_hgvs_column},
+                     $col2name{\$transcript_column}, "Low coverage type", "Genes in this region (gene overlaps possible)", $col2name{\$clinvar_column}), "\n";
+print OUT join("\n", map {join("\t", @{$_})} @detail_lines), "\n";
+
+unlink($tmp_hgvs);
+unlink($tmp_output);
+unlink($tmp_bed) if defined $tmp_bed;
+
+sub load_header_columns{
+  my ($reqs_hash_ref, $headers_array_ref) = @_;
+  my %unfulfilled;
+  for my $field_name (keys %$reqs_hash_ref){
+    $unfulfilled{$field_name} = 1;
+  }
+  for(my $i = 0; $i <= $#{$headers_array_ref}; $i++){
+    for my $req_header_name (keys %$reqs_hash_ref){
+      if($req_header_name eq $headers_array_ref->[$i]){
+        ${$reqs_hash_ref->{$req_header_name}} = $i;
+        delete $unfulfilled{$req_header_name};
+        last;
+      }
+    }
+  }
+  if(keys %unfulfilled){
+    die "Aborting. Could not find headers in the input file for the following required fields: ", join(", ", sort keys %unfulfilled), "\n";
+  }
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/capture_kits.py	Wed Mar 25 15:49:28 2015 -0600
@@ -0,0 +1,29 @@
+import os
+import re
+import sys
+import operator
+import csv
+from galaxy import config
+
+# get tool-data path
+configur =  config.Configuration()
+kitDir = configur.resolve_path("tool-data")
+
+# determine if config file exists
+if not os.path.exists( kitDir +  "/report_poor_coverage" ):
+    kitDir = "/export/achri_galaxy/dbs/CaptureKits/";
+else:
+    with open(kitDir + "/report_poor_coverage", "r") as tsv:
+	     for line in csv.reader(tsv, delimiter="\t"):
+		      if line[0] == 'capturekits_directory':
+				    kitDir = line[1]
+
+def kit_fileOptions():
+    list = os.listdir(kitDir);
+    list.sort()
+    pattern = re.compile('(.*)$')
+    fileOptions = [(s) for s in list if os.path.exists(kitDir + s)]
+    ds = [pattern.match(s) for s in fileOptions]
+    datasets = [(m.group(1), m.group(1), False) for m in ds if m]
+    return datasets
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_bam_by_list	Wed Mar 25 15:49:28 2015 -0600
@@ -0,0 +1,55 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use IO::Handle;
+
+# Report lines of a file that have as one of the column values a value from the pattern file
+@ARGV == 6 or die "Usage: $0 <input.bam> <named genomic regions.bed> <file of patterns> <filtered output.bam> <matched regions.bed> <output messages.txt>\n";
+
+die "Input BAM file $ARGV[0] does not exist\n" if not -e $ARGV[0];
+die "Input BAM file $ARGV[0] is not readable\n" if not -r $ARGV[0];
+
+my @alts;
+if(-e $ARGV[2]){
+  open(PATTERNS, $ARGV[2])
+    or die "Cannot open $ARGV[1] for reading: $!\n";
+  while(<PATTERNS>){
+    chomp;
+    push @alts, $_;
+  }
+  close(PATTERNS);
+}
+else{ # else assume the arg is a list of names directly
+  @alts = split /\s+/, $ARGV[2];
+}
+
+my @regions;
+my %seen;
+my $regex = "^(?:".join("|", @alts).")\$";
+open(TAB, $ARGV[1])
+  or die "Cannot open $ARGV[1] for reading: $!\n";
+while(<TAB>){
+  chomp;
+  my @F = split /\t/, $_;
+  next unless @F > 3;
+  if($F[3] =~ /$regex/io){
+    next if $seen{"$F[0]:$F[1]-$F[2]"}++; # sometimes regions are repeated, don't send these repeats to samtools
+    push @regions, $_;
+  }
+}
+close(TAB);
+
+die "No matches to desired names in the provided named genomic regions BED file, aborting filtered BAM file creation" if not @regions;
+
+open(BED, ">$ARGV[4]")
+  or die "Cannot open $ARGV[4] for writing: $!\n";
+print BED join("\n", @regions), "\n";
+close(BED);
+
+open(ERRFILE, ">$ARGV[5]") or die "Cannot open $ARGV[5] for writing: $!\n";
+STDOUT->fdopen(\*ERRFILE, "w") or die "Cannot redirect stdout to $ARGV[5]: $!\n";
+STDERR->fdopen(\*ERRFILE, "w") or die "Cannot redirect stderr to $ARGV[5]: $!\n";
+system("samtools view -h -L $ARGV[4] $ARGV[0] | samtools view -S -b - > $ARGV[3]") >> 8
+  and die "Samtools failed: exit status ", ($?>>8), "\n";
+system("samtools index $ARGV[3]");
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_by_index_gamma	Wed Mar 25 15:49:28 2015 -0600
@@ -0,0 +1,492 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use DB_File;
+use Parse::BooleanLogic;
+use Math::CDF qw(pgamma qgamma); # relevance score -> gamma p-value
+use PDL qw(pdl);
+use PDL::Stats::Distr qw(mme_gamma); # gamma dist parameter estimates
+use vars qw($parser %cached_sentences %sentence_index);
+
+my $quiet = 0;
+if(@ARGV and $ARGV[0] =~ /^-q/){
+  $quiet = 1;
+  shift @ARGV;
+}
+
+@ARGV == 5 or die "Usage: $0 [-q(uiet)] <index filename base> <db name> <hgvs_annotated.txt> <output.txt> <query>\nWhere query has the format \"this or that\", \"this and that\", etc.\n";
+
+my $signal_p = 0.95; # signal is top 5% of scores
+my $index_filename_base = shift @ARGV;
+my $db_name = shift @ARGV;
+my $hgvs_file = shift @ARGV;
+my $out_file = shift @ARGV;
+my $orig_query = shift @ARGV;
+
+$parser = new Parse::BooleanLogic(operators => ['and', 'or']);
+my $query_tree = $parser->as_array($orig_query, error_cb => sub {die "Could not parse query: @_\n"});
+# For simplicity, turn the tree into a base set of or statements (which means expanding "A and (B or C)" into "A and B or A and C") a.k.a. "sum of products/minterms"
+my @query_terms = flatten_query($query_tree);
+
+my $df_index_filename = $index_filename_base."df_index";
+my %df_index;
+my $df_index_handle = tie %df_index, "DB_File", $df_index_filename, O_RDONLY, 0400, $DB_BTREE
+			or die "Cannot open $df_index_filename: $!\n";
+my $gene_record_count = $df_index{"__DOC_COUNT__"};
+
+my $sentence_index_filename = $index_filename_base."sentence_index";
+my $sentence_index_handle = tie %sentence_index, "DB_File", $sentence_index_filename, O_RDONLY, 0400, $DB_HASH
+			or die "Cannot open $sentence_index_filename: $!\n";
+
+# Get the list of gene symbols we'll need
+open(HGVS, $hgvs_file)
+  or die "Cannot open $hgvs_file for reading: $!\n";
+my $header = <HGVS>;
+chomp $header;
+my @header_columns = split /\t/, $header;
+my ($gene_name_column, $chr_column, $from_column, $to_column);
+for(my $i = 0; $i <= $#header_columns; $i++){
+  if($header_columns[$i] eq "Gene Name"){
+    $gene_name_column = $i;
+  }
+  elsif($header_columns[$i] eq "Chr"){
+    $chr_column = $i;
+  }
+  elsif($header_columns[$i] eq "DNA From"){
+    $from_column = $i;
+  }
+  elsif($header_columns[$i] eq "DNA To"){
+    $to_column = $i;
+  }
+}
+my $blank_query = not @query_terms;
+# Special case of empty query means print all info for variant ranges listed in the input HGVS file (assuming the DB was indexed to include chr:pos keys)
+if($blank_query){
+  #print STDERR "Running blank query\n";
+  if(not defined $chr_column){
+    die "Could not find 'Chr' column in the input header, aborting\n";
+  }
+  if(not defined $from_column){
+    die "Could not find 'DNA From' column in the input header, aborting\n";
+  }
+  if(not defined $to_column){
+    die "Could not find 'DNA To' column in the input header, aborting\n";
+  }
+  # Build the list of locations that will need to be searched in the index
+
+  open(OUT, ">$out_file")
+    or die "Cannot open $out_file for writing: $!\n";
+  print OUT $header, "\t$db_name Text Matches\n";
+
+  while(<HGVS>){
+    chomp;
+    my @F = split /\t/, $_, -1;
+    my @pos_data;
+    for my $pos ($F[$from_column]..$F[$to_column]){ # for each position in the range
+      my $pos_match_data = fetch_sentence("$F[$chr_column]:$pos", -1); # fetch all data for this position
+      push @pos_data, "*$F[$chr_column]:$pos* ".$pos_match_data if defined $pos_match_data;
+    }
+    print OUT join("\t", @F, join(" // ", @pos_data)),"\n";
+  }
+  close(OUT);
+  exit;
+}
+elsif(not defined $gene_name_column){
+  die "Could not find 'Gene Name' column in the input header, aborting\n";
+}
+#print STDERR "Query terms: " , scalar(@query_terms), "\n";
+my %gene_to_query_match_ranges;
+# Determine the set of genes that might match the query, based on the word index
+for my $query_term (@query_terms){
+  #print STDERR "Query term $query_term\n";
+  my %doc_hits; # how many needed words match the document?
+  my $contiguous = 1; #by default multiword queries must be contiguous
+  # Unless it's an AND query
+  if($query_term =~ s/ and / /g){
+    $contiguous = 0;
+  }
+
+  my @words = split /\s+/, $query_term; # can be multi-word term like "mental retardation"
+  for(my $i = 0; $i <= $#words; $i++){
+    my $word = mc($words[$i]); # can be a stem word, like hypoton
+    #print STDERR "Checking word $word...";
+    if($i == 0){
+      my $first_word_docs = get_doc_offsets($df_index_handle, $word); # get all words' docs off this stem
+      #print STDERR scalar(keys %$first_word_docs), " documents found\n";
+      for my $doc (keys %$first_word_docs){
+        $doc_hits{$doc} = $first_word_docs->{$doc}; # populate initial hit list that'll be whittled down in subsequent outer loops of multiword phrase members
+      }
+      next;
+    }
+    my @candidate_docs = keys %doc_hits;
+    last if not @candidate_docs; # short circuit searches guaranteed to fail
+
+    # each additional word must directly follow an existing match
+    my $word_doc_offsets_ref = get_doc_offsets($df_index_handle, $word); # get all words' docs off this stem
+    #print STDERR scalar(keys %$word_doc_offsets_ref), " documents found\n";
+    for my $doc (@candidate_docs){
+      my $num_matches = 0;
+      if(not exists $word_doc_offsets_ref->{$doc}){ # required word missing, eliminate doc from consideration
+        delete $doc_hits{$doc};
+        next;
+      }
+      # see if any of the instances of the additional words directly follow the last word we successfully matched
+      my $so_far_matches_ref = $doc_hits{$doc};
+      my $next_word_matches_ref = $word_doc_offsets_ref->{$doc};
+      for (my $j=0; $j <= $#{$so_far_matches_ref}; $j++){
+        my $existing_match_extended = 0;
+        next unless defined $so_far_matches_ref->[$j]->[2]; # every once in a while there is no article id parsed
+        for (my $k=0; $k <= $#{$next_word_matches_ref}; $k++){
+          # Same article?
+          next unless defined $next_word_matches_ref->[$k]->[2] and $next_word_matches_ref->[$k]->[2] eq $so_far_matches_ref->[$j]->[2];
+          if(not $contiguous){
+            $so_far_matches_ref->[$j]->[4] .= " AND ".$next_word_matches_ref->[$k]->[4]; # update the matched term to include the extension too
+            if(ref $so_far_matches_ref->[$j]->[3] ne "ARRAY"){ # match does not yet span multiple sentences
+              last if $next_word_matches_ref->[$k]->[3] == $so_far_matches_ref->[$j]->[3]; # same sentence
+              $so_far_matches_ref->[$j]->[3] = [$so_far_matches_ref->[$j]->[3], $next_word_matches_ref->[$k]->[3]]; # change from scalar to array (of sentence numbers)
+            }
+            elsif(not grep {$_ eq $next_word_matches_ref->[$k]->[3]} @{$so_far_matches_ref->[$j]->[3]}){
+              push @{$so_far_matches_ref->[$j]->[3]}, $next_word_matches_ref->[$k]->[3]; # add top spanning sentences list of not already there
+            }
+          }
+          # else contiguous word occurences required.
+          # Same sentence?
+          next unless $next_word_matches_ref->[$k]->[3] == $so_far_matches_ref->[$j]->[3];
+
+          my $space_between_match_words = $next_word_matches_ref->[$k]->[0] - $so_far_matches_ref->[$j]->[1];
+          if($space_between_match_words <= 2){
+            $existing_match_extended = 1;
+            $so_far_matches_ref->[$j]->[1] = $next_word_matches_ref->[$k]->[1]; # move the match cursor to include the new extending word
+            $so_far_matches_ref->[$j]->[4] .= " ".$next_word_matches_ref->[$k]->[4]; # update the matched term to include the extension too
+            last;
+          }
+          elsif($space_between_match_words > 2){ # more than two typographical symbols between words, consider non-continuous
+            last;  # since the offsets are in order, any further k would only yield a larger spacing, so shortcircuit
+          }
+        }
+        if(not $existing_match_extended){
+          splice(@$so_far_matches_ref, $j, 1);
+          $j--;
+        }
+        else{
+          $num_matches++;
+        }
+      }
+      if(not $num_matches){
+        delete $doc_hits{$doc};
+      }
+    }
+  }
+  # the only keys that get to this point should be those that match all terms
+  for my $doc (keys %doc_hits){
+    $gene_to_query_match_ranges{$doc} = [] if not exists $gene_to_query_match_ranges{$doc};
+    push @{$gene_to_query_match_ranges{$doc}}, [$query_term, @{$doc_hits{$doc}}];
+  }
+}
+
+my @matched_genes = keys %gene_to_query_match_ranges;
+#print STDERR "Found ", scalar(@matched_genes), "/$gene_record_count records in cached iHOP matching the query\n" unless $quiet;
+my %query_gene_counts;
+my %ntf;
+for my $gene (keys %gene_to_query_match_ranges){
+  my $max_doc_word_count = $df_index{"__DOC_MAX_WC_$gene"};
+  for my $count_record (@{$gene_to_query_match_ranges{$gene}}){
+    my ($query_term, @query_term_match_ranges_in_this_gene) = @$count_record;
+    # next if $query_term eq $gene; # slightly controversial? exclude references to genes from the score if the gene is the record being talked about (obviously it will be highly scored)
+    # allows us to find first degree interactors (i.e. points for "A interacts with B", in the record describing A) without creating crazy high score for doc describing gene B if B was in the original query without any phenotype query terms
+    $query_gene_counts{$query_term}++;
+
+    $ntf{$gene} = {} unless exists $ntf{$gene};
+    # atypical use of log in order to weigh heavy use of a common term less than occasional use of a rare term
+    $ntf{$gene}->{$query_term} = log($#query_term_match_ranges_in_this_gene+2)/log($max_doc_word_count+1);
+  }
+  #print STDERR "Doc max word count is $max_doc_word_count for $gene, ntf keys = ", keys %{$ntf{$gene}}, "\n";
+}
+
+my %idf;
+for my $query_term (@query_terms){ # convert %idf values from documents-with-the-query-term-count to actual IDF
+  next unless exists $query_gene_counts{$query_term}; # query not in the document collection
+  $idf{$query_term} = log($gene_record_count/$query_gene_counts{$query_term});
+  #print STDERR "$query_term IDF is $idf{$query_term}\n";
+}
+
+# Create a relevance score using a normalized term frequency - inverse document frequency summation
+my %relevance_score;
+my %matched_query_terms;
+for my $gene_symbol (keys %gene_to_query_match_ranges){
+  my $relevance_score = 0;
+  # Hmm, take average, sum or max of TF-IDFs?
+  my $max_query_score = 0;
+  my @matched_query_terms;
+  my $query_score = 0;
+  for (my $i = 0; $i <= $#query_terms; $i++){
+    my $query_term = $query_terms[$i];
+    next unless exists $idf{$query_term};
+    next unless exists $ntf{$gene_symbol}->{$query_term};
+    $query_score += $ntf{$gene_symbol}->{$query_term}*$idf{$query_term};
+    push @matched_query_terms, $query_term;
+    $query_score *= 1-$i/scalar(@query_terms)/2 if scalar(@query_terms) > 2;# adjust the query score so the first terms are weighted more heavily if a bunch of terms are being searched
+    $max_query_score = $query_score if $query_score > $max_query_score;
+    $relevance_score += $query_score;
+  }
+  # this square root trick will not affect the score of a single term query, but will penalize a high total score that is comprised of a bunch of low value individual term scores)
+  $relevance_score{$gene_symbol} = sqrt($relevance_score*$max_query_score);
+  #print STDERR "Relevance score for $gene_symbol is $relevance_score{$gene_symbol}\n";
+  $matched_query_terms{$gene_symbol} = \@matched_query_terms;
+}
+
+# Characterize relevance score as a gamma statistical distribution and convert to probability
+my $max_relevance_score = 0;
+for my $relevance_score (values %relevance_score){
+  $max_relevance_score = $relevance_score if $relevance_score > $max_relevance_score;
+}
+# Remove top end scores as signal, characterize the rest as noise.
+# Iterative estimation of gamma parameters and removing data within range where CDF>99%
+my $noise_data = pdl(values %relevance_score);
+my ($shape, $scale) = $noise_data->mme_gamma();
+#print STDERR "Initial gamma distribution estimates: $shape, $scale (max observation $max_relevance_score)\n";
+my $signal_cutoff = qgamma($signal_p, $shape, 1/$scale);
+my @noise_data;
+for my $gene_symbol (keys %relevance_score){
+  my $score = $relevance_score{$gene_symbol};
+  push @noise_data, $score if $score < $signal_cutoff;
+}
+$noise_data = pdl(@noise_data);
+($shape, $scale) = $noise_data->mme_gamma();
+#print STDERR "Revised gamma distribution estimates (noise estimate at $signal_cutoff (CDF $signal_p)): $shape, $scale\n";
+# Convert scores to probabilities
+for my $gene_symbol (keys %relevance_score){
+  $relevance_score{$gene_symbol} = 1-pgamma($relevance_score{$gene_symbol}, $shape, 1/$scale);
+}
+
+#TODO: create summary stats for each query term so the user gets an idea of each's contribution?
+
+my %pubmed_matches;
+for my $gene_symbol (keys %gene_to_query_match_ranges){
+  my $query_match_ranges_ref = $gene_to_query_match_ranges{$gene_symbol};
+  my %matching_sentences;
+  for my $count_record (@$query_match_ranges_ref){
+    my ($query_term, @query_term_match_ranges_in_this_gene) = @$count_record;
+    for my $occ_info (@query_term_match_ranges_in_this_gene){
+      my $id = $occ_info->[2];
+      my $sentence_number = $occ_info->[3];
+      my $query_match_word = $occ_info->[4];
+      # Fetch the preparsed sentence from the sentence index based on id and sentence number
+      # Will automatically *HIGHLIGHT* the query terms fetched in the sentence over the course of this script
+      if(ref $sentence_number eq "ARRAY"){ # match spans multiple sentences
+        for my $s (@$sentence_number){
+          for my $word (split / AND /, $query_match_word){
+            #print STDERR "Highlighting $word in $id #$s for query term $query_term (multisentence match)\n";
+            $matching_sentences{fetch_sentence_key($id, $s, $word)}++;
+          }
+        }
+      }
+      else{ # single sentence match
+        #print STDERR "Highlighting $query_match_word in $id #$sentence_number for query term $query_term\n";
+        $matching_sentences{fetch_sentence_key($id, $sentence_number, $query_match_word)}++;
+      }
+    }
+  }
+  $gene_symbol =~ s/_/\//; # didn't have a forward slash in a gene name for disk caching purposes
+  if(keys %matching_sentences){
+    $pubmed_matches{$gene_symbol} = [] unless exists $pubmed_matches{$gene_symbol};
+    for my $new_match_ref (keys %matching_sentences){
+      push @{$pubmed_matches{$gene_symbol}}, $new_match_ref unless grep {$_ eq $new_match_ref} @{$pubmed_matches{$gene_symbol}}; # only put in new sentences, no need to dup
+    }
+  }
+}
+
+$orig_query =~ s/\s+/ /; # normalized whitespace
+$orig_query =~ s/ and / and /i; # lc()
+my @orig_query_terms = split /\s+or\s+/, $orig_query;
+
+open(OUT, ">$out_file")
+  or die "Cannot open $out_file for writing: $!\n";
+my $new_header = $header;
+$new_header .= "\t$db_name p-value (log normalized TF-IDF score, gamma dist)\t$db_name Matching Terms ($orig_query)\t$db_name Text Matches";
+print OUT $new_header, "\n";
+
+# Check if any of the variants in the annotated HGVS table are in genes from the OMIM match list
+while(<HGVS>){
+  chomp;
+  my @F = split /\t/, $_, -1;
+  # order the ids from highest number of sentence matches to lowest, from highest ranked term to least
+  my (%id2match_count, %id2sentences);
+  my @matched_genes;
+  my $relevance_score_final = 1;
+  my @matched_query_terms;
+  for my $gene_name (split /\s*;\s*/, $F[$gene_name_column]){
+    next unless exists $pubmed_matches{$gene_name};
+    push @matched_genes, $gene_name;
+    for my $sentence_ref (@{$pubmed_matches{$gene_name}}){ # 0 == always fetch the title which is stored in sentence index 0
+      my $pubmed_record = fetch_sentence($sentence_ref);
+      $id2match_count{$pubmed_record->[0]}++; # key = id
+      if(not exists $id2sentences{$pubmed_record->[0]}){
+        $id2sentences{$pubmed_record->[0]} = {};
+        my $title_record = fetch_sentence(fetch_sentence_key($pubmed_record->[0], 0, ""));
+        next unless $title_record->[0];
+        print STDERR "No $index_filename_base sentence number for ", $title_record->[0], "\n" if not defined $title_record->[1];
+        print STDERR "No $index_filename_base sentence text for ", $title_record->[0], " sentence #", $title_record->[1], "\n" if not defined $title_record->[2];
+        $id2sentences{$title_record->[0]}->{$title_record->[2]} = $title_record->[1];
+      }
+      # Only print sentences that match a query term other than the gene name for the record key, if that gene name is part of the query
+      my $non_self_query_ref = 0;
+      while($pubmed_record->[2] =~ /\*(.+?)\*/g){
+        if($1 ne $gene_name){
+          $non_self_query_ref = 1;
+          last;
+        }
+      }
+      #print STDERR "rejected $gene_name self-only sentence ",$pubmed_record->[2],"\n" unless $non_self_query_ref;
+      next unless $non_self_query_ref;
+      $id2sentences{$pubmed_record->[0]}->{$pubmed_record->[2]} = $pubmed_record->[1]; # value = sentence order within pubmed text
+    }
+    $relevance_score_final *= $relevance_score{$gene_name};
+    push @matched_query_terms, @{$matched_query_terms{$gene_name}};
+  }
+
+  # If we get here, there were matches
+  my @ordered_ids = sort {$id2match_count{$b} <=> $id2match_count{$a}} keys %id2match_count;
+
+  # print sentences in each id in order, with ellipsis if not contiguous
+  my %h;
+  print OUT join("\t", @F, ($relevance_score_final != 1 ? $relevance_score_final : ""), (@matched_query_terms ? join("; ", sort grep {not $h{$_}++} @matched_query_terms) : "")), "\t";
+  my $first_record = 1;
+  for my $id (@ordered_ids){
+    my $sentence2order = $id2sentences{$id};
+    my @ordered_sentences = sort {$sentence2order->{$a} <=> $sentence2order->{$b}} keys %$sentence2order;
+    next if scalar(@ordered_sentences) == 1; # due to self-gene only referencing filter above, we may have no matching sentences in a record.  Skip in this case.
+    if($first_record){
+      $first_record = 0;
+    }
+    else{
+      print OUT " // ";
+    }
+    my $title = shift(@ordered_sentences);
+    print OUT "$db_name $id",(defined $title ? " $title": ""),":"; # first sentence is always the record title
+    my $last_ordinal = 0;
+    for my $s (@ordered_sentences){
+      if($last_ordinal and $sentence2order->{$s} != $last_ordinal+1){
+        print OUT "..";
+      }
+      print OUT " ",$s;
+      $last_ordinal = $sentence2order->{$s};
+    }
+  }
+  print OUT "\n";
+}
+
+sub get_doc_offsets{
+  my ($db_handle, $word_stem) = @_;
+  my %doc2offsets;
+
+  my $is_uc = $word_stem =~ /^[A-Z0-9]+$/;
+  my $has_wildcard = $word_stem =~ s/\*$//;
+  my $value = 0;
+  my $cursor_key = $word_stem;
+  # retrieves the first
+  for(my $status = $db_handle->seq($cursor_key, $value, R_CURSOR);
+      $status == 0;
+      $status = $db_handle->seq($cursor_key, $value, R_NEXT)){
+    if(CORE::index($cursor_key,$word_stem) != 0){
+      last; # outside the records that have the requested stem now
+    }
+    for my $record (split /\n/s, $value){
+      my ($doc, @occ_infos) = split /:/, $record;
+      $doc2offsets{$doc} = [] if not exists $doc2offsets{$doc};
+      for my $occ_info (@occ_infos){
+        my ($term_offset, $id, $sentence_number) = split /,/, $occ_info, -1;
+        # record start and end of word to facilitate partial key consecutive word matching algorithm used in this script
+        push @{$doc2offsets{$doc}}, [$term_offset, $term_offset+length($cursor_key), $id, $sentence_number, $cursor_key];
+      }
+    }
+    last if $is_uc and not $has_wildcard; # only exact matches for upper case words like gene names
+  }
+  return \%doc2offsets;
+}
+
+sub mc{
+  if($_[0] =~ /^[A-Z][a-z]+$/){
+    return lc($_[0]); # sentence case normalization to lower case for regular words
+  }
+  else{
+    return $_[0]; # as-is for gene names, etc
+  }
+}
+
+sub fetch_sentence_key{
+  my ($id, $sentence_number, $query_term) = @_;
+
+  $sentence_number = 0 if not defined $sentence_number;
+  return ":$sentence_number" if not $id;
+  my $key = "$id:$sentence_number";
+  if(not exists $cached_sentences{$key}){
+    my @sentences = split /\n/, $sentence_index{$id};
+    $cached_sentences{$key} = $sentences[$sentence_number];
+  }
+  $cached_sentences{$key} =~ s/\b\Q$query_term\E\b(?!\*)/"*".uc($query_term)."*"/ge unless $query_term eq "";
+  #print STDERR "Highlighted $query_term in $cached_sentences{$key}\n" if $query_term =~ /cirrhosis/;
+  return $key;
+}
+
+sub fetch_sentence{
+  if(@_ == 1){ # from cache
+    return [split(/:/, $_[0]), $cached_sentences{$_[0]}];
+  }
+  else{ # if more than one arg, DIRECT FROM index key as first arg, sentence # is second arg
+    return undef if not exists $sentence_index{$_[0]};
+    my @sentences = split /\n/, $sentence_index{$_[0]};
+    if($_[1] < 0){ # all sentences request
+      return join("; ", @sentences);
+    }
+    return $sentences[$_[1]];
+  }
+}
+
+
+# boolean operator tree to flat expanded single depth "or" op query
+sub flatten_query{
+  my $tree = shift @_;
+  my @or_queries;
+
+  # Base case: the tree is just a leaf (denoted by a hash reference). Return value of the operand it represents.
+  if(ref $tree eq "HASH"){
+    return ($tree->{"operand"});
+  }
+
+  elsif(not ref $tree){
+    return $tree;
+  }
+
+  # Otherwise it's an operation array
+  if(ref $tree ne "ARRAY"){
+    die "Could not parse $tree, logic error in the query parser\n";
+  }
+
+  # Deal with AND first since it has higher precedence
+  for (my $i = 1; $i < $#{$tree}; $i++){
+    if($tree->[$i] eq "and"){
+      my @expanded_term;
+      my @t1_terms = flatten_query($tree->[$i-1]);
+      my @t2_terms = flatten_query($tree->[$i+1]);
+      #print STDERR "need to expand ", $tree->[$i-1], "(@t1_terms) AND ", $tree->[$i+1], "(@t2_terms)\n";
+      for my $term1 (@t1_terms){
+        for my $term2 (@t2_terms){
+          #print STDERR "Expanding to $term1 and $term2\n";
+          push @expanded_term, "$term1 and $term2";
+        }
+      }
+      splice(@$tree, $i-1, 3, @expanded_term);
+      $i--; # list has been shortened
+    }
+  }
+  # Should be only "OR" ops left
+  # Resolve any OR subtrees
+  for(my $i = 0; $i <= $#{$tree}; $i++){
+    next if $tree->[$i] eq "or";
+    push @or_queries, flatten_query($tree->[$i]); # otherwise recursive parse
+  }
+
+  return @or_queries;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hgvs_collapse_transcripts	Wed Mar 25 15:49:28 2015 -0600
@@ -0,0 +1,201 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+# reports the variant in transcripts separately only if the AA change is different, or distance from splicing site is different
+@ARGV == 2 or @ARGV == 3 or die "Usage: $0 <hgvs input.txt> <output.txt> [ignore splice distance diff]\n";
+
+my %ftype_rank = ( protein_coding => 100,
+              processed_transcript => 90,
+              antisense => 80,
+              retained_intron => 70,
+              lincRNA => 60,
+              nonsense_mediated_decay => 50,
+              misc_enrichment_kit_target => 0);
+
+my %mtype_rank = ( nonsense => 100,
+                      frameshift => 99,
+                      nonstop => 90,
+                      missense => 80,
+                      silent => 50,
+                      "non-coding" => 40);
+open(IN, $ARGV[0])
+  or die "Cannot open $ARGV[0] for reading: $!\n";
+open(OUT, ">$ARGV[1]")
+  or die "Cannot open $ARGV[1] for writing: $!\n";
+my $succinct = (@ARGV == 3);
+my @lines;
+my $last_chr = "";
+my $last_pos = "";
+my $last_alt = "";
+my %buffered_F;
+my %buffered_id_rank;
+my $header = <IN>;
+print OUT $header;
+
+my ($chr_column, $pos_column, $alt_column, $cdna_hgvs_column, $aa_hgvs_column, $phase_column, $transcript_column, $transcript_length_column, $exon_dist_column, $ftype_column, $mtype_column, $splicing_score_column, $splicing_effect_column, $sources_column);
+chomp $header;
+my @headers = split /\t/, $header;
+for(my $i = 0; $i <= $#headers; $i++){
+  if($headers[$i] eq "Chr"){
+    $chr_column = $i;
+  }
+  elsif($headers[$i] eq "Feature type"){
+    $ftype_column = $i;
+  }
+  elsif($headers[$i] eq "Variant type"){
+    $mtype_column = $i;
+  }
+  elsif($headers[$i] eq "DNA From" or $headers[$i] eq "DNA Pos"){
+    $pos_column = $i;
+  }
+  elsif($headers[$i] eq "Obs base"){
+    $alt_column = $i;
+  }
+  elsif($headers[$i] eq "Transcript HGVS"){
+    $cdna_hgvs_column = $i;
+  }
+  elsif($headers[$i] eq "Protein HGVS"){
+    $aa_hgvs_column = $i;
+  }
+  elsif($headers[$i] eq "Phase"){
+    $phase_column = $i;
+  }
+  elsif($headers[$i] eq "Selected transcript"){
+    $transcript_column = $i;
+  }
+  elsif($headers[$i] eq "Transcript length"){
+    $transcript_length_column = $i;
+  }
+  elsif($headers[$i] eq "Closest exon junction (AA coding variants)"){
+    $exon_dist_column = $i;
+  }
+  elsif($headers[$i] eq "Splicing alteration potential"){
+    $splicing_score_column = $i;
+  }
+  elsif($headers[$i] eq "Splicing alteration details"){
+    $splicing_effect_column = $i;
+  }
+  elsif($headers[$i] eq "Sources"){
+    $sources_column = $i;
+  }
+}
+if(not defined $chr_column){
+  die "Could not find Chr header in $ARGV[0], aborting.\n";
+}
+if(not defined $pos_column){
+  die "Could not find 'DNA From' header in $ARGV[0], aborting.\n";
+}
+if(not defined $alt_column){
+  die "Could not find 'Obs base' header in $ARGV[0], aborting.\n";
+}
+if(not defined $cdna_hgvs_column){
+  die "Could not find 'Transcript HGVS' header in $ARGV[0], aborting.\n";
+}
+if(not defined $aa_hgvs_column){
+  die "Could not find 'Protein HGVS' header in $ARGV[0], aborting.\n";
+}
+if(not defined $phase_column){
+  die "Could not find Phase header in $ARGV[0], aborting.\n";
+}
+if(not defined $transcript_column){
+  die "Could not find 'Selected transcript' header in $ARGV[0], aborting.\n";
+}
+if(not defined $transcript_length_column){
+  die "Could not find 'Transcript length' header in $ARGV[0], aborting.\n";
+}
+#if(not defined $mtype_column){
+#  die "Could not find 'Variant type' header in $ARGV[0], aborting.\n";
+#}
+if(not defined $ftype_column){
+  die "Could not find 'Feature type' header in $ARGV[0], aborting.\n";
+}
+if(not defined $exon_dist_column){
+  die "Could not find 'Closest exon junction (AA coding variants)' header in $ARGV[0], aborting.\n";
+}
+
+while(<IN>){
+  chomp;
+  my @F = split /\t/, $_, -1;
+  my $chr = $F[$chr_column];
+  my $pos = $F[$pos_column];
+  my $alt = $F[$alt_column];
+  my $cdna_hgvs = $F[$cdna_hgvs_column];
+  my $hgvs = $F[$aa_hgvs_column];
+  my $ftype = $F[$ftype_column];
+  my $mtype;
+  $mtype = $F[$mtype_column] if defined $mtype_column;
+  $hgvs =~ s/\d+//g; # look only at the non-position parts of the AA syntax
+  my $exon_edge_distance = $F[$exon_dist_column];
+  my $phase = $F[$phase_column] || "";
+  # in the case of large indels (i.e. CNVs), their positions may not be the same, but effectively they should be reported as such
+  if($phase =~ /CNV-\S+?:(\S+)/){
+    $pos = $1;
+  }
+  my $preferred_id = $succinct ? ($F[$transcript_column] =~ /^$succinct/o) : 0;
+  my $id_rank = $F[$transcript_column];
+  $id_rank =~ s/^.*?0*(\d+)(\.\d+)?$/$1/; # look at only trailing non-padded number (and no .version suffix)
+
+  my $collapse_key = $succinct ? "$chr:$pos:$alt" : "$chr:$pos:$hgvs:$exon_edge_distance:$phase";
+  # Same variant
+  if($chr eq $last_chr and $pos eq $last_pos and $alt eq $last_alt){
+    # same AA effect
+    if(not exists $buffered_F{$collapse_key}){
+      $buffered_F{$collapse_key} = \@F;
+      $buffered_id_rank{$collapse_key} = $id_rank;
+    }
+    else{
+      $buffered_F{$collapse_key}->[$sources_column] .= "; $F[$sources_column]" if defined $sources_column;
+      $ftype_rank{$ftype} = 0 if not defined $ftype_rank{$ftype};
+      $mtype_rank{$mtype} = 0 if defined $mtype and not exists $mtype_rank{$mtype};
+      my $buf_ftype = $buffered_F{$collapse_key}->[$ftype_column];
+      $ftype_rank{$buf_ftype} = 0 if not defined $ftype_rank{$buf_ftype};
+      my $buf_mtype;
+      $buf_mtype = $buffered_F{$collapse_key}->[$mtype_column] if defined $mtype_column;
+      $mtype_rank{$buf_mtype} = 0 if defined $buf_mtype and not exists $mtype_rank{$buf_mtype};
+      # see if this transcript is "earlier" than the other, based on ID #
+      if($ftype_rank{$ftype} > $ftype_rank{$buf_ftype} or
+         (defined $mtype and $mtype_rank{$mtype} > $mtype_rank{$buf_mtype}) or
+         $preferred_id or
+          ($id_rank =~ /^\d+$/ and $buffered_id_rank{$collapse_key} =~ /^\d+$/ and $id_rank < $buffered_id_rank{$collapse_key})
+         or $F[$transcript_column] lt $buffered_F{$collapse_key}->[$transcript_column]){ # alphabetical as backup
+        # make this the first one reported (and use its HGVS syntax)
+        $buffered_F{$collapse_key}->[$transcript_length_column] = $F[$transcript_length_column]."; ".$buffered_F{$collapse_key}->[$transcript_length_column];
+        $buffered_F{$collapse_key}->[$transcript_column] = $F[$transcript_column]."; ".$buffered_F{$collapse_key}->[$transcript_column];
+        $buffered_F{$collapse_key}->[$cdna_hgvs_column] = $F[$cdna_hgvs_column];
+        $buffered_F{$collapse_key}->[$aa_hgvs_column] = $F[$aa_hgvs_column];
+        $buffered_id_rank{$collapse_key} = $id_rank;
+      }
+      else{
+        # just append it to the list of IDs
+        $buffered_F{$collapse_key}->[$transcript_length_column] .= "; $F[$transcript_length_column]";
+        $buffered_F{$collapse_key}->[$transcript_column] .= "; $F[$transcript_column]";
+      }
+      if($succinct and defined $splicing_score_column and $F[$splicing_score_column] ne "NA" and $F[$splicing_score_column] > $buffered_F{$collapse_key}->[$splicing_score_column]){
+        $buffered_F{$collapse_key}->[$splicing_score_column] = $F[$splicing_score_column];
+        $buffered_F{$collapse_key}->[$splicing_effect_column] = $F[$splicing_effect_column] ." (transcript model ".$F[$transcript_column].")";
+      }
+    }
+  }
+  # Different variant from the last line, dump any buffered data
+  else{
+    for my $collapse_key (keys %buffered_F){
+      if(defined $sources_column){
+        my %seen;
+        $buffered_F{$collapse_key}->[$sources_column] = join("; ", grep {not $seen{$_}++} split(/; /, $buffered_F{$collapse_key}->[$sources_column]));
+      }
+      print OUT join("\t", @{$buffered_F{$collapse_key}}), "\n";
+    }
+    undef %buffered_F;
+    $last_chr = $chr;
+    $last_pos = $pos;
+    $last_alt = $alt;
+    $buffered_F{$collapse_key} = \@F;
+    $buffered_id_rank{$collapse_key} = $id_rank;
+  }
+}
+for my $collapse_key (keys %buffered_F){
+  print OUT join("\t", @{$buffered_F{$collapse_key}}), "\n";
+}
+close(IN);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/report_poor_coverage	Wed Mar 25 15:49:28 2015 -0600
@@ -0,0 +1,6 @@
+capturekits_directory	/export/achri_galaxy/dbs/CaptureKits/
+ref_fasta	/export/achri_galaxy/dbs/hg19.fa
+ref_flat	/export/achri_galaxy/dbs/hg19_refFlat_2014-07-10.bed
+ref_gencode	/export/achri_galaxy/dbs/hg19_refGene_gencode_ultrasensitive.gtf
+clinVar	/export/achri_galaxy/dbs/ClinVar/ClinVarFullRelease_2014-03.xml
+dgv2	/export/achri_galaxy/dbs/DGV2/hg19.2012-03-29.txt.gz
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vcf2hgvs_table	Wed Mar 25 15:49:28 2015 -0600
@@ -0,0 +1,2670 @@
+#!/usr/bin/env perl
+
+BEGIN{
+  my $prog_dir = `dirname $0`;
+  chomp $prog_dir;
+  push @INC, $prog_dir; # so DisjointSets.pm can be found no matter the working directory
+}
+
+use DisjointSets; # homebrew module
+use Bio::DB::Sam; # for FastA reference pulls
+use Bio::SeqUtils;
+use Bio::Tools::CodonTable;
+use Statistics::Zed;
+use Getopt::Long;
+use Set::IntervalTree;
+use strict;
+use warnings;
+use vars qw($min_prop $zed $codonTable $default_transl_table %transl_except %internal_prop %dbsnp_info %chr2variant_locs %chr2dbsnp_vcf_lines %chr2internal_vcf_lines %chr2caveats %chr2phase @snvs $fasta_index $max_args $quiet);
+
+if(@ARGV == 1 and $ARGV[0] eq "-v"){
+  print "Version 1.0\n";
+  exit;
+}
+
+#$max_args = `getconf ARG_MAX`; # largest number of args you can send to a system command (enviroment included, see limits.h)
+#chomp $max_args;
+$max_args = 4096; # if not defined $max_args or $max_args < 1; # the minimum since System V
+$max_args -= 50;
+
+# find out if a variant appears in the user provided data
+sub internal_prop($$$$){
+    my ($chr,$pos,$ref,$variant) = @_;
+
+    my $key = "$chr:$pos:$ref:$variant";
+    if(exists $internal_prop{$key}){
+        return $internal_prop{$key};
+    }
+
+    #print STDERR "Checking if internal_prop for $key exists: ";
+    if(exists $chr2internal_vcf_lines{$chr}->{$pos}){
+      for(@{$chr2internal_vcf_lines{$chr}->{$pos}}){
+        my @fields = split /\t/, $_;
+        if($pos == $fields[1] and length($fields[3]) == length($ref) and $fields[4] eq $variant){
+          #print STDERR "yes\n";
+          if(/MAF=(\d\.\d+)/){
+            $internal_prop{$key} = $1; # change from percent to proportion
+            return $1;
+          }
+        }
+      }
+    }
+    else{
+      #print STDERR "no\n";
+    }
+
+    $internal_prop{$key} = "NA";
+    return "NA";
+}
+
+# find out if a variant appears in the NCBI's dbSNP
+sub dbsnp_info($$$$){
+    my ($chr,$pos,$ref,$variant) = @_;
+
+    my $key = "$chr:$pos:$ref:$variant";
+    if(exists $dbsnp_info{$key}){
+        return @{$dbsnp_info{$key}};
+    }
+
+    if(exists $chr2dbsnp_vcf_lines{$chr}->{$pos}){
+      #print STDERR "Checking existing SNP data for $chr:$pos -> ", join("\n", @{$chr2dbsnp_vcf_lines{$chr}->{$pos}}), "\n";
+      for(@{$chr2dbsnp_vcf_lines{$chr}->{$pos}}){
+	my @fields = split /\t/, $_;
+	for my $var (split /,/, $fields[4]){
+            # Allows for different reference seqs between dbSNP and input, assuming patches only
+	    if(length($fields[3]) == length($ref) and ($var eq $variant or $ref eq $var and $variant eq $fields[3])){
+		my ($freq, $subpop) = ("","");
+		$freq = $1 if $fields[7] =~ /(?:\A|;)MMAF=(0\.\d+)(?:;|\Z)/;
+		$subpop = $1 if $fields[7] =~ /(?:\A|;)MMAF_SRC=(\S+?)(?:;|\Z)/;
+                $dbsnp_info{$key} = [$subpop, $freq || "NA", $fields[2]];
+		return @{$dbsnp_info{$key}};
+	    }
+	}
+      }
+    }
+    $dbsnp_info{$key} = ["novel", "NA", "NA"];
+    return @{$dbsnp_info{$key}};
+}
+
+sub record_snv{
+  my $line = join("", @_);
+  push @snvs, $line;
+
+  my @fields = split /\t/, $line;
+  my $prop_info_key = $fields[9];
+  my ($chr,$pos,$ref,$variant) = split /:/, $prop_info_key;
+  $chr2variant_locs{$chr} = {} unless exists $chr2variant_locs{$chr};
+  return unless $ref; # ref not defined for CNVs
+  # Need to grab whole range for MNPs
+  for(my $i = 0; $i < length($ref); $i++){
+    $chr2variant_locs{$chr}->{$pos+$i} = 1;
+  }
+}
+
+sub retrieve_vcf_lines($$$){
+  my ($dbsnp_file, $internal_snp_file, $chr) = @_;
+
+  my (%dbsnp_lines, %internal_snp_lines);
+
+  if(not defined $dbsnp_file or not exists $chr2variant_locs{$chr}){
+    return ({}, {}, {}, {}); # no data requested for this chromosome
+  }
+
+  # build up the request
+  my @tabix_regions;
+  my @var_locs = keys %{$chr2variant_locs{$chr}};
+  # sort by variant start location
+  for my $var_loc (sort {$a <=> $b} @var_locs){
+    push @tabix_regions, $chr.":".$var_loc."-".$var_loc;
+  }
+  for(my $i = 0; $i <= $#tabix_regions; $i += $max_args){ # chunkify tabix request if too many for the system to handle
+    my $end = $i + $max_args > $#tabix_regions ? $#tabix_regions : $i + $max_args;
+    my $regions = "'".join("' '", @tabix_regions[$i..$end])."'";
+    # From file is very slow for some reason
+    #my $regions_file =  "/tmp/vcf2hgvs_$$.bed";
+    #open(REQ_BED, ">$regions_file")
+    #  or die "Cannot open $regions_file for writing: $!\n";
+    #print REQ_BED join("\n", @tabix_regions), "\n";
+    #close(REQ_BED);
+
+    # retrieve the data
+    die "Cannot find dbSNP VCF file $dbsnp_file\n" if not -e $dbsnp_file;
+
+    open(VCF, "tabix $dbsnp_file $regions |")
+      or die "Cannot run tabix on $dbsnp_file (args ".substr($regions, 0, length($regions)>100? 100 : length($regions))."): $!\n";
+    while(<VCF>){
+      #if(/^(\S+\t(\d+)(?:\t\S+){6})/ and grep {$_ eq $2} @var_locs){ # take only main columns to save room, if possible
+      if(/^(\S+\t(\d+)(?:\t\S+){6})/ and exists $chr2variant_locs{$chr}->{$2}){ # take only main columns to save room, if possible
+        $dbsnp_lines{$2} = [] unless exists $dbsnp_lines{$2};
+        push @{$dbsnp_lines{$2}}, $1;
+      }
+    }
+    close(VCF);
+
+    if($internal_snp_file){
+      die "Cannot find internal VCF file $internal_snp_file\n" if not -e $internal_snp_file;
+      open(VCF, "tabix $internal_snp_file $regions |")
+        or die "Cannot run tabix on $internal_snp_file: $!\n";
+      while(<VCF>){
+        #if(/^(\S+\t(\d+)(?:\t\S+){6})/ and grep {$_ eq $2} @var_locs){ # take only main columns to save room, if possible
+        if(/^(\S+\t(\d+)(?:\t\S+){5})/ and exists $chr2variant_locs{$chr}->{$2}){ # take only main columns to save room, if possible
+          $internal_snp_lines{$2} = [] unless exists $internal_snp_lines{$2};
+          push @{$internal_snp_lines{$2}}, $1;
+        }
+      }
+      close(VCF);
+    }
+  }
+
+  #unlink $regions_file;
+
+  return (\%dbsnp_lines, \%internal_snp_lines);
+}
+
+sub prop_info_key{
+    my($chr,$pos,$ref,$variant,$exon_edge_dist) = @_;
+
+    $chr =~ s/^chr//;
+    if($chr eq "M"){
+      $chr = "MT"; # NCBI uses different name for mitochondrial chromosome
+      $pos-- if $pos >= 3107;  # also, doesn't keep the old positioning (historical)
+    }
+    return join(":", $chr,$pos,$ref,$variant, ($exon_edge_dist ? $exon_edge_dist : ""));
+}
+
+sub prop_info($$$){
+    my($snpfile,$internal_snps_file,$prop_info_key) = @_;
+
+    my($chr,$pos,$ref,$variant) = split /:/, $prop_info_key;
+
+    # is this the first call for this chromosome? If so, retrieve the VCF lines for it en masse
+    if(not exists $chr2dbsnp_vcf_lines{$chr}){
+      ($chr2dbsnp_vcf_lines{$chr}, $chr2internal_vcf_lines{$chr}) = retrieve_vcf_lines($snpfile,$internal_snps_file,$chr);
+    }
+    my $internal_maf = 0;
+    if($internal_snps_file){
+      $internal_maf = internal_prop($chr,$pos,$ref,$variant);
+      $internal_maf = 0 if $internal_maf eq "NA";
+    }
+
+    my @results = dbsnp_info($chr,$pos,$ref,$variant);
+
+    # Not all entries have a proportion in dbSNP
+    return $internal_snps_file ? ($ref, $variant, @results, $internal_maf) : ($ref, $variant, @results);
+}
+
+#offset a given HGVS nomenclature position (single position only) by a given number of bases
+sub hgvs_plus($$){
+   my ($hgvs, $offset) = @_;
+    if($hgvs =~ /^(\S+)(-\d+)(.*)/){
+	# all negative
+	if($2+$offset<0){
+	    return $1.($2+$offset).$3;
+	}
+	# switches to positive, need to mod
+	else{
+	    return $1+($2+$offset);
+	}
+    }
+    elsif($hgvs =~ /^(\S+)\+(\d+)(.*)/){
+	# all positive
+	if($2+$offset>0){
+	    return $1."+".($2+$offset).$3;
+	}
+	# switches to negative, need to mod
+	else{
+	    return $1+($2+$offset);
+	}
+    }
+    elsif($hgvs =~ /^(-?\d+)(.*)/){
+	# special case if offset spans -/+ since there is no position 0
+	if($1 < 0 and $1+$offset >= 0){
+	    $offset++;
+	}
+	elsif($1 > 0 and $1+$offset <= 0){
+	    $offset--;
+	}
+	return ($1+$offset).$2;
+    }
+    else{
+	die "Cannot convert $hgvs to a new offset ($offset), only single base position nomenclature is currently supported\n";
+    }
+}
+
+# offset a given position by a given number of bases,
+# taking into account that if the new offset crosses the threshold in the last argument,
+# HGVS boundary nomenclature has to be introduced
+sub hgvs_plus_exon($$$){
+    my ($pos, $offset, $boundary) = @_;
+
+    # special case if offset spans -/+ since there is no position 0
+    if($pos =~ /^(-?\d+)(.*)/){
+      if($1 < 0 and $1+$offset >= 0){
+         $offset++;
+      }
+      elsif($1 > 0 and $1+$offset <= 0){
+         $offset--;
+      }
+    }
+    my $new_pos = $pos + $offset;
+    if($new_pos > $boundary and $pos <= $boundary){
+	# just moved into an intron 3'
+	$new_pos = $boundary."+".($new_pos-$boundary);
+    }
+    elsif($new_pos < $boundary and $pos >= $boundary){
+	# just moved into an intron 5'
+	$new_pos = $boundary.($new_pos-$boundary);
+    }
+    return $new_pos;
+}
+
+# given a nucleotide position, calculates the AA there (assumes coding region)
+sub getCodonFromSeq($$$$){
+  my ($chr_ref, $location, $frame_offset, $strand) = @_;
+
+  my $codon;
+  if($strand eq "+"){
+    $codon = substr($$chr_ref, $location-1-$frame_offset, 3);
+  }
+  else{
+    $codon = substr($$chr_ref, $location-3+$frame_offset, 3);
+    $codon = reverse($codon);
+    $codon =~ tr/ACGTacgt/TGCAtgca/;
+  }
+  return $codon;
+}
+
+sub getCodonFromSeqIndex($$$$){
+  my ($chr, $location, $frame_offset, $strand) = @_;
+
+  my $codon;
+  if($strand eq "+"){
+    $codon = $fasta_index->fetch($chr.":".($location-$frame_offset)."-".($location-$frame_offset+2));
+  }
+  else{
+    $codon = $fasta_index->fetch($chr.":".($location-2+$frame_offset)."-".($location+$frame_offset));
+    $codon = reverse($codon);
+    $codon =~ tr/ACGTacgt/TGCAtgca/;
+  }
+  return $codon;
+}
+
+sub getAAFromSeq($$$$$){
+  return $_[4]->translate(getCodonFromSeq($_[0], $_[1], $_[2], $_[3]));
+}
+
+sub getAAFromSeqIndex($$$$$){
+  # convert codon to AA
+  if(exists $transl_except{"$_[0]:$_[1]"}){
+    return $transl_except{"$_[0]:$_[1]"};
+  }
+  else{
+    return $_[4]->translate(getCodonFromSeqIndex($_[0], $_[1], $_[2], $_[3]));
+  }
+}
+
+sub hgvs_protein{
+  my ($chr, $location, $ref, $variant, $cdna_pos, $strand, $transl_table) = @_;
+
+  if(substr($ref,0,1) eq substr($variant,0,1)){
+    substr($ref,0,1) = "";
+    substr($variant,0,1) = "";
+    $location++;
+    if($strand eq "-"){
+      $cdna_pos--;
+    }
+    else{
+      $cdna_pos++;
+    }
+  }
+
+  if($cdna_pos !~ /^\d+/){
+    die "Aborting: got illegal cDNA position ($cdna_pos) for protein HGVS conversion of position ",
+        "$location, ref $ref, variant $variant. Please correct the program code.\n";
+  }
+  # Get the correct frame for the protein translation, to know what codons are affected
+  my $aapos = int(($cdna_pos-1)/3)+1;
+
+  # does it destroy the start codon?
+  if($cdna_pos < 4){ # assumes animal codon usage
+    return "p.0?"; # indicates start codon missing, unsure of effect
+  }
+
+  my $table = $transl_table ne $default_transl_table ?  # non standard translation table requested
+    Bio::Tools::CodonTable->new(-id=>$transl_table) : $codonTable;
+
+  my $frame_offset = ($cdna_pos-1)%3;
+  my $origAA = getAAFromSeqIndex($chr, $location, $frame_offset, $strand, $table);
+  # take 100000 bp on either side for translation context of variant seq
+  my $five_prime_buffer = $location < 10000 ? $location-1 : 10000;
+  my $mutSeq = $fasta_index->fetch($chr.":".($location-$five_prime_buffer)."-".($location+10000));
+
+  # substitute all of the immediately adjacent variants in phase with this one to get the correct local effect
+  substr($mutSeq, $five_prime_buffer, length($ref)) = $variant;
+
+  # does it cause a frameshift?
+  my $length_diff = length($variant)-length($ref);
+  if($length_diff%3){ # insertion or deletion not a multiple of three
+    my $fs_codon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1, $frame_offset, $strand);
+    my $ext = 0;
+    my $newAA;
+    do{
+       $ext++;
+       # The "NA"s below make it so that we don't pick up any translation exceptions from the original reference annotation
+       if($strand eq "+"){
+         $newAA = getAAFromSeq(\$mutSeq, $five_prime_buffer+1+$ext*3, $frame_offset, $strand, $table);
+       }
+       else{
+         $newAA = getAAFromSeq(\$mutSeq, $five_prime_buffer+1-$ext*3, $frame_offset, $strand, $table);
+       }
+     } while($newAA ne "*");
+
+    return "p.".$origAA.$aapos.$table->translate($fs_codon)."fs*$ext";
+  }
+
+  # does it cause a stop codon to be lost?
+  if($origAA eq "*"){
+     my $stopChangeCodon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1, $frame_offset, $strand);
+     # still a stop after the mutation (ignore translation exceptions)
+     if($table->is_ter_codon($stopChangeCodon)){
+        return "p.*$aapos=";
+     }
+     # calculate the new stop, assuming there aren't mutations downstream in candidate stop codons
+     my $ext = 0;
+     my $newCodon;
+     do{
+       if($strand eq "+"){
+         $newCodon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1+(++$ext*3), $frame_offset, $strand);
+       }
+       else{
+         $newCodon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1-(++$ext*3), $frame_offset, $strand);
+       }
+     } while(not $table->is_ter_codon($newCodon));
+
+     return "p.*".$aapos.$table->translate($stopChangeCodon)."ext*".$ext;
+  }
+
+  # if we get this far, it's a "regular" AA level change
+  my $origAAs = "";
+  for(my $i = 0; $i < length($ref)+$frame_offset; $i+=3){
+    my $oldAA = getAAFromSeqIndex($chr, $location+$i, $frame_offset, $strand, $table);
+    if($strand eq "+"){
+      $origAAs .= $oldAA;
+    }
+    else{
+      $origAAs = $oldAA . $origAAs;
+    }
+  }
+  my $newAAs = "";
+  for(my $i = 0; $i < length($variant)+$frame_offset; $i+=3){
+    # NA means we don't take translation exceptions from the original
+    my $newAA = getAAFromSeq(\$mutSeq, $five_prime_buffer+1+$i, $frame_offset, $strand, $table);
+    if($strand eq "+"){
+      $newAAs .= $newAA;
+    }
+    else{
+      $newAAs = $newAA . $newAAs;
+    }
+  }
+
+  # silent
+  if($origAAs eq $newAAs){
+      return "p.".$origAAs.$aapos."=";
+  }
+
+  # minimize the difference if there are leading or trailing AAs the same
+  my $delLength = length($ref);
+  while(substr($newAAs, 0, 1) eq substr($origAAs, 0, 1)){
+    $newAAs = substr($newAAs, 1);
+    $origAAs = substr($origAAs, 1);
+    $location+=3;
+    $delLength-=3;
+    $aapos++;
+  }
+  while(substr($newAAs, -1) eq substr($origAAs, -1)){
+    $newAAs = substr($newAAs, 0, length($newAAs)-1);
+    $origAAs = substr($origAAs, 0, length($origAAs)-1);
+  }
+
+  # insertion
+  if(length($origAAs) == 0){
+    my $insAAs = getAAFromSeqIndex($chr,$location-3,$frame_offset,$strand,$table).($aapos-1)."_".
+                 getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table);
+    return "p.".$insAAs.$aapos."ins".$newAAs;
+  }
+  # deletion
+  elsif(length($newAAs) == 0){
+    my $delAAs;
+    if(length($origAAs) == 1){
+       $delAAs = getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table).$aapos; # single AA deletion
+    }
+    else{ # deleting a stretch
+       if($strand eq "+"){
+         my $endPoint = $location+$delLength-1;
+         $delAAs = getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table).$aapos."_".
+                   getAAFromSeqIndex($chr,$endPoint,$frame_offset,$strand,$table).($aapos+int(($delLength-1)/3));
+       }
+       else{
+         my $endPoint = $location-$delLength+1;
+         $delAAs = getAAFromSeqIndex($chr,$endPoint,$frame_offset,$strand,$table).($aapos-int(($delLength-1)/3))."_".
+                   getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table).$aapos;
+       }
+    }
+    return "p.".$delAAs."del";
+  }
+  else{
+    # substitution
+    if(length($origAAs) == 1 and length($newAAs) == 1){
+      return "p.".$origAAs.$aapos.$newAAs;
+    }
+    # indel
+    elsif(length($origAAs) != 1){
+       # convert ref stretch into range syntax
+       if($strand eq "+"){
+         $origAAs = substr($origAAs, 0, 1).$aapos."_".substr($origAAs, -1).($aapos+length($origAAs)-1);
+       }
+       else{
+         $origAAs = substr($origAAs, 0, 1).($aapos-length($origAAs)+1)."_".substr($origAAs, -1).$aapos;
+       }
+    }
+    return "p.".$origAAs."delins".$newAAs;
+  }
+  return ("NA", "");
+}
+
+sub z2p{
+  if(not defined $zed){
+    $zed = new Statistics::Zed;
+  }
+  my $p = $zed->z2p(value => $_[0]);
+  return $p < 0.0000000001 ? 0 : $p;
+}
+sub gq2p{
+    return $_[0] > 200 ? 0 : 10**($_[0]/-10);
+}
+
+my ($multi_phased, $min_depth, $flanking_bases, $dbsnp, $internal_snp, $genename_bed_file, $dir_1000G, $dir_esp6500, $min_pvalue, $mappability_file, $reference_file, $samtools_phasing_file, $exons_file, $input_file, $output_file, $cnv_file, $dgv_file, $which_chr, $enrichment_regions_file, $rare_variant_prop);
+$multi_phased = 0;
+$min_depth = 2;
+$flanking_bases = 30;
+$min_pvalue = 0.01;
+$min_prop = 0.14;
+$rare_variant_prop = 0.05;
+$input_file = "-"; # STDIN by default
+$output_file = "-"; # STDOUT by default
+$default_transl_table = "1"; # assumes NCBI 'Standard' table, unless it is an argument to the script...
+&GetOptions("d=i"       => \$min_depth,
+            "f=i"       => \$flanking_bases,
+            "s=s"       => \$dbsnp,
+            "t=s"       => \$dir_1000G,
+            "n=s"       => \$dir_esp6500,
+            "u=s"       => \$internal_snp,
+            "q"         => \$quiet,
+            "p=f"       => \$min_pvalue,
+            "h=f"       => \$min_prop,
+            "m=s"       => \$mappability_file,
+            "r=s"       => \$reference_file,
+            "z=s"       => \$samtools_phasing_file,
+            "e=s"       => \$exons_file,
+            "i=s"       => \$input_file,
+            "c=s"       => \$cnv_file,
+            "g=s"       => \$dgv_file,
+            "b=s"       => \$genename_bed_file,
+            "w=s"       => \$which_chr,
+            "o=s"       => \$output_file,
+            "a=i"       => \$default_transl_table,
+            "v=f"       => \$rare_variant_prop,
+            "x=s"       => \$enrichment_regions_file); # if enrichment regions are specified, variants without a transcript model but in these ranges will be reported
+
+if(($input_file ne "/dev/null" and not defined $reference_file) or
+   not defined $exons_file or
+   (defined $cnv_file and not defined $dgv_file)){
+  die "Usage: $0 [-v(ersion)] [-q(uiet)] [-w(hich) contig_to_report (default is all)] [-d(epth of variant reads req'd) #] [-v(ariant max freq to count as rare)] [-f(lanking exon bases to report) #] [-p(robability of random genotype, maximum to report) 0.#]\n",
+      "  [-h(et proportion of variant reads, minimum to report) 0.#] [-c(opy number) variants_file.bed -g(enomic structural) variants_control_db.txt.gz] [-z file_containing_samtools_phase_output.txt]\n",
+      "  [-t(housand) genomes_integrated_vcfs_gz_dir] [-n ESP6500_dir] [-u(ser) specified_population.vcf.gz] [-m(appability) crg_file.bed]\n",
+      "  [-x enrichment_regions_file.bed] [-a(mino) acid translation table number from NCBI]\n",
+      "  [-i(nput) genotypes.vcf <-r(eference) sequence_file.fasta>] [-o(utput) hgvs_file.tsv] [-s(np) database_from_ncbi.vcf.gz]\n",
+      "  <-b(ed) file of named gene regions.bed> <-e(xons) file.gtf>\n\n",
+      "Input gz files must be indexed with Tabix.\nDefault input is STDIN, default output is STDOUT.  Note: if -c is specified, polyploidies are are assume to be proximal. Other defaults: -d 2, -v 0.05, -f 30, -p 0.01, -h 0.14 -a 1\nReference sequence is not strictly necessary if only CNV are being annotated.\n";
+}
+
+print STDERR "Considering $flanking_bases flanking bases for variants as well\n" unless $quiet;
+
+$codonTable = new Bio::Tools::CodonTable(id => $default_transl_table);
+
+my %enrichment_regions;
+# Note, we assume the regions are non-overlapping
+if(defined $enrichment_regions_file){
+  print STDERR "Loading enrichment regions...\n" unless $quiet;
+  open(BED, $enrichment_regions_file)
+    or die "Cannot open $enrichment_regions_file for reading: $!\n";
+  while(<BED>){
+    chomp;
+    my @F = split /\t/, $_;
+    $enrichment_regions{$F[0]} = [] if not exists $enrichment_regions{$F[0]};
+    push @{$enrichment_regions{$F[0]}}, [$F[1], $F[2]];
+  }
+  close(BED);
+}
+for my $chr (keys %enrichment_regions){ # sort by start
+  $enrichment_regions{$chr} = [sort {$a->[0] <=> $b->[0]} @{$enrichment_regions{$chr}}];
+}
+
+if(defined $reference_file){
+  print STDERR "Scanning reference FastA info\n" unless $quiet;
+  if(not -e $reference_file){
+    die "Reference FastA file ($reference_file) does not exist.\n";
+  }
+  if(not -e $reference_file.".fai" and not -w dirname($reference_file)){
+    die "Reference FastA file ($reference_file) is not indexed, and the directory is not writable.\n";
+  }
+  $fasta_index = Bio::DB::Sam::Fai->load($reference_file);
+}
+
+my %chr2mappability;
+if(defined $mappability_file){
+  print STDERR "Reading in mappability data\n" unless $quiet;
+  my ($nmer) = $mappability_file =~ /(\d+).*?$/;
+  die "Cannot determine nmer from nmer file name $mappability_file, aborting\n" unless $nmer;
+  open(MAP, $mappability_file)
+    or die "Cannot open mappability data file $mappability_file for reading: $!\n";
+  <MAP>; # header
+  while(<MAP>){
+    next if /^#/;
+    chomp;
+    my @F = split /\t/, $_;
+    my $x = int(1/$F[3]+0.5);
+    $chr2mappability{$F[0]} = Set::IntervalTree->new() if not exists $chr2mappability{$F[0]};
+    $chr2mappability{$F[0]}->insert("non-unique mapping region (x$x)", $F[1], $F[2]+$nmer-1);
+  }
+  close(MAP);
+}
+
+# Is phasing data provided?
+if(defined $samtools_phasing_file){
+  print STDERR "Reading in phasing data\n" unless $quiet;
+  open(PHASE, $samtools_phasing_file)
+    or die "Cannot open phasing data file $samtools_phasing_file for reading: $!\n";
+  my $phase_range;
+  while(<PHASE>){
+    if(/^PS/){
+      chomp;
+      my @F = split /\t/, $_;
+      $phase_range = "$F[2]-$F[3]";
+    }
+    if(/^M[12]/){
+      chomp;
+      my @F = split /\t/, $_;
+      #ignore strange cases where haplotype reference has no cases (weird samtools call)
+      next if $F[9] == 0 or $F[7] == 0;
+      my $chr = $F[1];
+      next if defined $which_chr and not $chr eq $which_chr;
+      my $pos = $F[3];
+      #print STDERR "Recording phase for $chr:$pos:$F[4] , $chr:$pos:$F[5] as A-$chr:$phase_range and B-$chr:$phase_range\n" if $pos == 12907379;
+      if(($F[10]+$F[8])/($F[9]+$F[7]) >= $min_prop){ # error meets reporting threshold
+        $chr2caveats{"$chr:$pos"} .= "; " if exists $chr2caveats{"$chr:$pos"};
+        $chr2caveats{"$chr:$pos"} .= "inconsistent haplotype phasing";
+      }
+      else{ # appears to be a genuine phasing
+        $chr2phase{"$chr:$pos:$F[4]"} = "A-$chr:$phase_range"; # grouping for haplotype
+        $chr2phase{"$chr:$pos:$F[5]"} = "B-$chr:$phase_range"; # grouping for haplotype
+      }
+    }
+  }
+  close(PHASE);
+}
+
+# Check the VCF file to see if contains phase data
+open(VCFIN, $input_file)
+    or die "Cannot open $input_file for reading: $!\n";
+my $phase_chr = "";
+my @phase_dataA;
+my @phase_dataB;
+while(<VCFIN>){
+    if(/^\s*(?:#|$)/){ # blank or hash comment
+       next;
+    }
+    my @F = split /\t/, $_;
+    next if exists $chr2caveats{"$F[0]:$F[1]"} and $chr2caveats{"$F[0]:$F[1]"} =~ /inconsistent haplotype phasing/;
+    # | indicates phased
+    if($F[8] =~ m(^(\d+)\|(\d+):)){
+      next if $1 eq $2;  # not useful to us (actually would mess up phase combining later on), but is provided sometimes
+      # start of a phasing block
+      if($phase_chr eq ""){
+        $phase_chr = $F[0];
+      }
+      my @vars = split /,/, $F[4];
+      if($1 > @vars){
+        die "Invalid VCF file (line #$.): First haplotype listed as $1, but only ", scalar(@vars), " variants were provided (", join(",", @vars), "\n";
+      }
+      if($2 > @vars){
+        die "Invalid VCF file (line #$.): Second haplotype listed as $1, but only ", scalar(@vars), " variants were provided (", join(",", @vars), "\n";
+      }
+      unshift @vars, $F[3];
+      push @phase_dataA, [$F[1], $vars[$1]];
+      push @phase_dataB, [$F[1], $vars[$2]];
+    }
+    # non phased het call, ends any phasing block there might be
+    elsif($F[8] =~ m(^0/1)){
+      # Did we just finish a phased block? If so, output it.
+      if(@phase_dataA > 1){
+        my $phase_def = "G-$phase_chr:".$phase_dataA[0]->[0]."-".$phase_dataA[$#phase_dataA]->[0];
+        for my $d (@phase_dataA){
+          my ($p, $v) = @$d;
+          if(exists $chr2phase{"$phase_chr:$p:$v"}){
+            $chr2phase{"$phase_chr:$p:$v"} .= ",$phase_def";
+            $multi_phased ||= 1;
+          }
+          else{
+            $chr2phase{"$phase_chr:$p:$v"} = $phase_def;
+          }
+        }
+        $phase_def = "H-$phase_chr:".$phase_dataB[0]->[0]."-".$phase_dataB[$#phase_dataB]->[0];
+        for my $d (@phase_dataB){
+          my ($p, $v) = @$d;
+          if(exists $chr2phase{"$phase_chr:$p:$v"}){
+            $chr2phase{"$phase_chr:$p:$v"} = ",$phase_def";
+            $multi_phased ||= 1;
+          }
+          else{
+            $chr2phase{"$phase_chr:$p:$v"} = $phase_def;
+          }
+        }
+      }
+      if($phase_chr ne ""){
+        $phase_chr = "";
+        @phase_dataA = ();
+        @phase_dataB = ();
+      }
+    }
+}
+
+print STDERR "Reading in feature GTF data..." unless $quiet;
+my %feature_range; # chr => transcript_id => [[genomic_exon_start,genomic_exon_end,cdna_start_pos],...]
+my %feature_intervaltree; # chr => transcript_id => [[genomic_exon_start,genomic_exon_end,cdna_start_pos],...]
+my %feature_strand; # transcript_id => +|-
+my $feature_count = 0;
+my %feature_min;
+my %feature_max;
+my %feature_cds_min;
+my %feature_cds_max;
+my %feature_contig;
+my %feature_length;
+my %feature_type;
+my %feature_transl_table; # note alternate translation table usage
+my %chr_read;
+open(GTF, $exons_file)
+    or die "Cannot open $exons_file for reading: $!\n";
+while(<GTF>){
+    next if /^\s*#/;
+    my @fields = split /\t/, $_;
+    next if defined $which_chr and $fields[0] ne $which_chr and "chr$fields[0]" ne $which_chr and $fields[0] ne "chr$which_chr";
+
+    if($fields[2] eq "exon" or $fields[2] eq "CDS"){
+	next unless $fields[$#fields] =~ /transcript_id \"(.*?)\"/o;
+	my $parent = $1;
+	if(not $quiet and not exists $chr_read{$fields[0]}){
+	    print STDERR " $fields[0]";
+	    $chr_read{$fields[0]} = 1;
+	}
+	if(not exists $feature_strand{$parent}){
+	    $feature_strand{$parent} = $fields[6];
+	    $feature_contig{$parent} = $fields[0];
+            if($fields[$#fields] =~ /transcript_type \"(.*?)\"/){
+              $feature_type{$parent} = $1;
+            }
+            else{
+              $feature_type{$parent} = "NA";
+            }
+	}
+	if($fields[2] eq "CDS"){
+	    #print STDERR "CDS value for $parent is $fields[2]..$fields[3]\n";
+	    if(not exists $feature_cds_min{$parent} or $fields[3] < $feature_cds_min{$parent}){
+		$feature_cds_min{$parent} = $fields[3];
+	    }
+	    if(not exists $feature_cds_max{$parent} or $fields[4] > $feature_cds_max{$parent}){
+		$feature_cds_max{$parent} = $fields[4];
+	    }
+            if($fields[$#fields] =~ /transl_table \"(\d+)\"/){
+                $feature_transl_table{$parent} = $1; #assume one translation table per CDS, which should be reasonable
+            }
+            while($fields[$#fields] =~ /transl_except \"pos:(\S+?),aa:(\S+?)\"/g){
+                my $pos = $1;
+                my $new_aa = $2; # needs to change from three letter code to 1
+                if($new_aa =~ /^ter/i){ # can be funny so have special case (allows TERM, etc.)
+                  $new_aa = "*";
+                }
+                elsif(length($new_aa) == 3){
+                  $new_aa = Bio::SeqUtils->new()->seq3in($new_aa);
+                }
+                if($pos =~ /^(\d+)\.\.(\d+)/){
+                  for my $p ($1..$2){
+                    $transl_except{"$fields[0]:$p"} = $new_aa;
+                  }
+                }
+                else{
+                  $transl_except{"$fields[0]:$pos"} = $new_aa;
+                }
+            }
+	    next;
+	}
+	if(not exists $feature_min{$parent} or $fields[3] < $feature_min{$parent}){
+	    $feature_min{$parent} = $fields[3];
+	}
+	if(not exists $feature_max{$parent} or $fields[4] > $feature_max{$parent}){
+	    $feature_max{$parent} = $fields[4];
+	}
+
+	$feature_count++;
+	if(not exists $feature_range{$fields[0]}){
+	    $feature_range{$fields[0]} = {}; # Chr => {parentID => [start,stop]}
+            $feature_intervaltree{$fields[0]} = Set::IntervalTree->new();
+	}
+	if(not exists $feature_range{$fields[0]}->{$parent}){
+	    $feature_range{$fields[0]}->{$parent} = [];
+	}
+        push @{$feature_range{$fields[0]}->{$parent}}, [$fields[3],$fields[4]];
+        $feature_intervaltree{$fields[0]}->insert($parent, $fields[3], $fields[4]+1); # ranges need to have positive length for module to work properly
+        $feature_length{$parent} += $fields[4]-$fields[3]+1;
+    }
+}
+close(GTF);
+print STDERR "\nFound $feature_count exons on ", scalar(keys %feature_range), " contigs in the GTF file\n" unless $quiet;
+
+for my $contig (keys %feature_range){
+    for my $parent (keys %{$feature_range{$contig}}){
+	# sort by subrange start
+	my @feature_ranges = sort {$a->[0] <=> $b->[0]} @{$feature_range{$contig}->{$parent}};
+	$feature_range{$contig}->{$parent} = \@feature_ranges;
+	$feature_range{"chr".$contig}->{$parent} = \@feature_ranges if not $contig =~ /^chr/;
+	$feature_range{$1}->{$parent} = \@feature_ranges if $contig =~ /^chr(\S+)/;
+    }
+}
+
+# Calculate the cDNA position of the leftmost (reference strand) base for each exon
+for my $contig (keys %feature_range){
+    for my $parent (keys %{$feature_range{$contig}}){
+        my @feature_ranges = @{$feature_range{$contig}->{$parent}};
+        if($feature_strand{$parent} eq "-"){
+            # set up utr offset for correct CDS coordinates
+            my $feature_offset = 0;
+            for(my $i = $#feature_ranges; $i >= 0; $i--){
+                last if not $feature_cds_max{$parent};
+                # exon is completely 5' of the start
+                if($feature_ranges[$i]->[0] > $feature_cds_max{$parent}){
+                    $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1;
+                }
+                # exon with the cds start
+                elsif($feature_ranges[$i]->[1] >= $feature_cds_max{$parent} and
+                      $feature_ranges[$i]->[0] <= $feature_cds_max{$parent}){
+                    $feature_offset += $feature_cds_max{$parent} - $feature_ranges[$i]->[1];
+                    last;
+                }
+                else{
+                    die "The CDS for $parent (on negative strand) ends downstream ",
+                    "($feature_cds_max{$parent}) of the an exon",
+                    " (", $feature_ranges[$i]->[0], "), which is illogical.  Please revise the GFF file provided.\n";
+                }
+            }
+            for(my $i = $#feature_ranges; $i >= 0; $i--){
+              $feature_offset += $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1;
+              $feature_ranges[$i]->[2] = $feature_offset-1;
+            }
+       }
+       else{ # positive strand
+            # set up utr offset for correct CDS coordinates
+            my $feature_offset = 0;
+            for(my $i = 0; $i <= $#feature_ranges; $i++){
+                last if not $feature_cds_min{$parent};
+                # All 5' utr exon
+                if($feature_ranges[$i]->[1] < $feature_cds_min{$parent}){
+                    $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1;
+                }
+                # exon with the cds start
+                elsif($feature_ranges[$i]->[1] >= $feature_cds_min{$parent} and
+                      $feature_ranges[$i]->[0] <= $feature_cds_min{$parent}){
+                    $feature_offset -= $feature_cds_min{$parent} - $feature_ranges[$i]->[0];
+                    last;
+                }
+                else{
+                    die "The CDS for $parent starts upstream ($feature_cds_min{$parent}) of the first exon",
+                    " (", $feature_ranges[$i]->[0], "), which is illogical.  Please revise the GFF file provided.\n";
+                }
+            }
+            # assign cDNA coords for each exon to the third array element
+            for(my $i = 0; $i <= $#feature_ranges; $i++){
+              $feature_ranges[$i]->[2] = $feature_offset;
+              $feature_offset += $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1;
+            }
+       }
+    }
+}
+
+print STDERR "Reading in gene name definitions...\n" unless $quiet;
+die "Data file $genename_bed_file does not exist, aborting.\n" if not -e $genename_bed_file;
+my %gene_ids;
+open(TAB, $genename_bed_file)
+  or die "Cannot open gene name BED file $genename_bed_file for reading: $!\n";
+while(<TAB>){
+  chomp;
+  # format should be "chr start stop gene_name ..."
+  my @fields = split /\t/, $_;
+  next if $#fields < 3;
+  my $c = $fields[0];
+  if(not exists $gene_ids{$c}){
+    $gene_ids{$c} = Set::IntervalTree->new();
+  }
+  $gene_ids{$c}->insert($fields[3], $fields[1], $fields[2]);
+}
+
+# Print output header
+open(OUT, ">$output_file")
+  or die "Cannot open $output_file for writing: $!\n";
+
+print OUT join("\t", "Feature type", "Transcript length", "Selected transcript", "Transcript HGVS", "Strand", "Chr", "DNA From", "DNA To", "Zygosity", "P-value", "Variant Reads", "Total Reads",
+	       "Ref base", "Obs base", "Pop. freq. source", "Pop. freq.", "Variant DB ID"), "\t",
+               ($internal_snp ? "Internal pop. freq.\t" : ""),
+               join("\t", "Protein HGVS", "Closest exon junction (AA coding variants)", "Gene Name", "Caveats", "Phase", "Num rare variants in gene (MAF <= $rare_variant_prop)", "Num rare coding and splice site variants in gene (MAF <= $rare_variant_prop)"),"\n";
+
+# If there is CNV data, load it.
+# BED columns should be chr start stop caveats ploidy . ignored ignored r,g,b
+# The dot means the strand doesn't matter.
+# where the first five fields are required, others optional
+# where r,g,b is overloaded with father,mother ploidies and "b" is integer representing affected status logical AND (father bit mask 1, mother bit mask 2)
+if(defined $cnv_file){
+  print STDERR "Reading in CNV data...\n" unless $quiet;
+  open(CNV, $cnv_file)
+    or die "Cannot open $cnv_file for reading: $!\n";
+  while(<CNV>){
+    chomp;
+    my @F = split /\t/,  $_, -1;
+    if(@F < 5){
+      print STDERR "Skipping unparseable line ($cnv_file #$.): $_\n";
+      next;
+    }
+    my $ploidy = $F[4];
+    my $cnv_chr = $F[0];
+    next if defined $which_chr and $cnv_chr ne $which_chr and "chr$cnv_chr" ne $which_chr and $cnv_chr ne "chr$which_chr";
+    my $cnv_start = $F[1];
+    my $cnv_end = $F[2];
+    my $p_value = "NA";
+    if($F[3] =~ s/p-value=(\S+?)(?:;|$)//){
+      $p_value = $1;
+      next if $min_pvalue < $p_value;
+    }
+
+    # Report a variant line for each gene that is found in this CNV
+    my $target_parents = $feature_intervaltree{$cnv_chr}->fetch($cnv_start, $cnv_end+1);
+
+    my $caveats = "";
+    if(@F == 9){
+      my @parents_ploidy = split /,/, $F[8];
+      if($parents_ploidy[2] == 0){ # neither parent affected
+        if($ploidy < $parents_ploidy[0] and $ploidy < $parents_ploidy[1]){
+          if($ploidy > 2){
+            $caveats = "Polyploidy is less severe than in either unaffected parents";
+          }
+          # else: no caveats, this offspring has fewer copies than normally observed, or in unaffected parents
+          elsif($ploidy < 2){
+            if($parents_ploidy[0] == 2 and $parents_ploidy[1] == 2){
+              $caveats = "De novo copy loss, unaffected parents are diploid";
+            }
+            else{
+              $caveats = "Copy loss is greater than in either unaffected parent";
+            }
+          }
+        }
+        elsif($ploidy >= $parents_ploidy[0] and $ploidy <= $parents_ploidy[1] or
+           $ploidy >= $parents_ploidy[1] and $ploidy <= $parents_ploidy[0]){
+          $caveats = "Aneuploidy likely inherited from an unaffected parent";
+        }
+        elsif($ploidy > $parents_ploidy[0] and $ploidy > $parents_ploidy[1]){
+          if($parents_ploidy[0] > 2){
+            if($parents_ploidy[1] > 2){
+              $caveats = "Lower polyploidy already exists in both unaffected parents";
+            }
+            else{
+              $caveats = "Lower polyploidy already exists in unaffected father";
+            }
+          }
+          else{
+            if($parents_ploidy[1] > 2){
+              $caveats = "Lower polyploidy already exists in unaffected mother";
+            }
+            # else no caveats, because both parents are "normal", yet we have polyploidy in the offspring
+            else{
+              $caveats = "De novo polyploidy, unaffected parents are diploid";
+            }
+          }
+        }
+        # else
+        else{
+          die "Oops! Error in program logic...how did we get here (unaffected parents)? $_";
+        }
+      }
+      elsif($parents_ploidy[2] == 1){ # father affected
+          if($ploidy == $parents_ploidy[1]){ # just like unaffected Mom
+            if($ploidy > 2){
+              if($ploidy == $parents_ploidy[0]){
+                $caveats = "Same polyploidy present in both affected and unaffected parents";
+              }
+              else{
+                $caveats = "Polyploidy inherited from unaffected mother";
+              }
+            }
+            elsif($ploidy < 2){
+              if($ploidy == $parents_ploidy[0]){
+                $caveats = "Same copy loss in both affected and unaffected parents";
+              }
+              else{
+                $caveats = "Copy loss is shared with unaffected mother";
+              }
+            }
+            else{
+              if($ploidy == $parents_ploidy[0]){
+                # Why was this even reported? parents and child have diploid status...
+                next;
+              }
+              $caveats = "Diploidy is shared with unaffected mother";
+            }
+          }
+          elsif($ploidy > 2){ # polyploidy
+            if($parents_ploidy[0] == 2){
+              if($parents_ploidy[1] > 2){
+                $caveats = "Unaffected mother has polyploidy (".$parents_ploidy[1]."x), but affected father is diploid";
+              }
+              elsif($parents_ploidy[1] == 2){
+                $caveats = "Both unaffected mother and affected father are diploid";
+              }
+              else{
+                $caveats = "Affected father is diploid, unaffected mother has copy loss (".$parents_ploidy[1]."x)";
+              }
+            }
+            elsif($parents_ploidy[0] < 2){
+              $caveats = "Polyploidy found, but affected father had copy loss (".$parents_ploidy[0]."x)";
+            }
+            elsif($ploidy < $parents_ploidy[1]){
+              $caveats = "Polyploidy is less severe than in unaffected mother (".$parents_ploidy[1]."x), or affected father (".$parents_ploidy[0]."x)";
+            }
+            # past here the ploidy is great than in the unaffected mother
+            elsif($parents_ploidy[1] < 2){
+              $caveats = "Polyploidy is also severe in affected father (".$parents_ploidy[0]."x), but unaffected mother actually had copy loss (". $parents_ploidy[1]. "x)";
+            }
+            elsif($parents_ploidy[1] == 2){
+              $caveats = "Polyploidy is also severe in affected father (".$parents_ploidy[0]."x), and mother is diploid";
+            }
+            elsif($ploidy < $parents_ploidy[0]){
+              $caveats = "Polyploidy is less severe than in affected father (".$parents_ploidy[0]."x), but more severe than unaffected mother (". $parents_ploidy[1]. "x)";
+            }
+            elsif($ploidy > $parents_ploidy[0]){
+              $caveats = "Polyploidy is more severe than in affected father (".$parents_ploidy[0]."x)";
+            }
+            else{
+              $caveats = "Polyploidy is as severe as in affected father";
+            }
+          }
+          elsif($ploidy == 2){
+            # Don't report diploid status, any funny recombination should show up in large indel analysis
+            next;
+          }
+          else{ # copies < 2
+            if($ploidy == $parents_ploidy[0]){
+              if($ploidy > $parents_ploidy[1]){
+                $caveats = "Copy loss is the same as affected father, but less than unaffected mother (". $parents_ploidy[1]. "x)";
+              }
+              else{
+                $caveats = "Copy loss is as severe as in affected father";
+              }
+            }
+            elsif($ploidy > $parents_ploidy[0]){
+              if($ploidy > $parents_ploidy[1]){
+                if($parents_ploidy[1] == 0 and $parents_ploidy[0] == 0){
+                  $caveats = "Poor mapping, or Mendelian inheritence violation is severe: no copies of region in either parent, but present in offspring";
+                }
+                elsif($ploidy == 2){
+                  next; # child got best of both parents, ignore from CNV standpoint (may still have SNPs of course, or translocation, etc.)
+                }
+                else{
+                  $caveats = "Copy loss is less severe than in unaffected mother (".$parents_ploidy[1]."x), or affected father (".$parents_ploidy[0]."x)";
+                }
+              }
+              # else: child has less copies than unaffected mom, but more than affected Dad
+              else{
+                if($parents_ploidy[1] > 2){
+                  $caveats = "Copy loss was more severe in affected father (".$parents_ploidy[0]."x), but unaffected mother had polyploidy (".$parents_ploidy[1]."x)";
+                }
+                elsif($parents_ploidy[1] == 2){
+                  $caveats = "Copy loss was more severe in affected father (".$parents_ploidy[0]."x), but unaffected mother was diploid";
+                }
+                else{ # unaffected has loss
+                  $caveats = "Copy loss is more severe than unaffect mother (".$parents_ploidy[1]."x), but less severe than affected father (".$parents_ploidy[0]."x)";
+                }
+              }
+            }
+            # past here, ploidy is less than affected father
+            elsif($parents_ploidy[1] > 2){
+              $caveats = "Copy loss is more severe than affected father (".$parents_ploidy[0]."x), and unaffected mother had polyploidy (".$parents_ploidy[1]."x)";
+            }
+            elsif($parents_ploidy[1] == 2){
+              $caveats = "Copy loss is more severe than in affected father (".$parents_ploidy[0]."x)";
+            }
+            else{
+              $caveats = "Copy loss is more severe than in both unaffect mother (".$parents_ploidy[1]."x), and affected father (".$parents_ploidy[0]."x)";
+            }
+          }
+      }
+      elsif($parents_ploidy[2] == 2){ # mother affected
+          if($ploidy == $parents_ploidy[0]){ # just like unaffected Dad
+            if($ploidy > 2){
+              if($ploidy == $parents_ploidy[1]){
+                $caveats = "Same polyploidy present in both affected and unaffected parents";
+              }
+              else{
+                $caveats = "Polyploidy inherited from unaffected father";
+              }
+            }
+            elsif($ploidy < 2){
+              if($ploidy == $parents_ploidy[1]){
+                $caveats = "Same copy loss in both affected and unaffected parents";
+              }
+              else{
+                $caveats = "Copy loss is shared with unaffected father";
+              }
+            }
+            else{
+              if($ploidy == $parents_ploidy[1]){
+                # Why was this even reported? parents and child have diploid status...
+                next;
+              }
+              $caveats = "Diploidy is shared with unaffected father";
+            }
+          }
+          elsif($ploidy > 2){ # polyploidy
+            if($parents_ploidy[1] == 2){
+              if($parents_ploidy[0] > 2){
+                $caveats = "Unaffected father has polyploidy (".$parents_ploidy[0]."x), but affected mother is diploid";
+              }
+              elsif($parents_ploidy[0] == 2){
+                $caveats = "Both unaffected father and affected mother are diploid";
+              }
+              else{
+                $caveats = "Affected mother is diploid, unaffected father has copy loss (".$parents_ploidy[1]."x)";
+              }
+            }
+            elsif($parents_ploidy[1] < 2){
+              $caveats = "Polyploidy found, but affected mother had copy loss (".$parents_ploidy[1]."x)";
+            }
+            elsif($ploidy < $parents_ploidy[0]){
+              $caveats = "Polyploidy is less severe than in unaffected father (".$parents_ploidy[0]."x), or affected mother (".$parents_ploidy[1]."x)";
+            }
+            # past here the ploidy is great than in the unaffected father
+            elsif($parents_ploidy[0] < 2){
+              $caveats = "Polyploidy is also severe in affected mother (".$parents_ploidy[1]."x), but unaffected father actually had copy loss (". $parents_ploidy[0]. "x)";
+            }
+            elsif($parents_ploidy[0] == 2){
+              $caveats = "Polyploidy is also severe in affected mother (".$parents_ploidy[1]."x), and unaffected father is diploid";
+            }
+            elsif($ploidy < $parents_ploidy[1]){
+              $caveats = "Polyploidy is less severe than in affected mother (".$parents_ploidy[1]."x), but more severe than unaffected father (". $parents_ploidy[0]. "x)";
+            }
+            elsif($ploidy > $parents_ploidy[1]){
+              $caveats = "Polyploidy is more severe than in affected mother (".$parents_ploidy[1]."x)";
+            }
+            else{
+              $caveats = "Polyploidy is as severe as in affected mother";
+            }
+          }
+          elsif($ploidy == 2){
+            # Don't report diploid status, any funny recombination should show up in large indel analysis
+            next;
+          }
+          else{ # copies < 2
+            if($ploidy == $parents_ploidy[1]){
+              if($ploidy > $parents_ploidy[0]){
+                $caveats = "Copy loss is the same as affected mother, but less than unaffected father (". $parents_ploidy[0]. "x)";
+              }
+              else{
+                $caveats = "Copy loss is as severe as in affected mother";
+              }
+            }
+            elsif($ploidy > $parents_ploidy[1]){
+              if($ploidy > $parents_ploidy[0]){
+                if($parents_ploidy[1] == 0 and $parents_ploidy[0] == 0){
+                  $caveats = "Poor mapping, or Mendelian inheritence violation is severe: no copies of region in either parent, but present in offspring";
+                }
+                elsif($ploidy == 2){
+                  next; # child got best of both parents, ignore from CNV standpoint (may still have SNPs of course, or translocation, etc.)
+                }
+                else{
+                  $caveats = "Copy loss is less severe than in unaffected father (".$parents_ploidy[0]."x), or affected mother (".$parents_ploidy[1]."x)";
+                }
+              }
+              # else: child has less copies than unaffected Dad, but more than affected Mom
+              else{
+                if($parents_ploidy[0] > 2){
+                  $caveats = "Copy loss was more severe in affected mother (".$parents_ploidy[1]."x), but unaffected father had polyploidy (".$parents_ploidy[0]."x)";
+                }
+                elsif($parents_ploidy[0] == 2){
+                  $caveats = "Copy loss was more severe in affected mother (".$parents_ploidy[1]."x), but unaffected father was diploid";
+                }
+                else{ # unaffected has loss
+                  $caveats = "Copy loss is more severe than unaffect father (".$parents_ploidy[0]."x), but less severe than affected mother (".$parents_ploidy[1]."x)";
+                }
+              }
+            }
+            # past here, ploidy is less than affected mother
+            elsif($parents_ploidy[0] > 2){
+              $caveats = "Copy loss is more severe than affected mother (".$parents_ploidy[1]."x), and unaffected father had polyploidy (".$parents_ploidy[0]."x)";
+            }
+            elsif($parents_ploidy[0] == 2){
+              $caveats = "Copy loss is more severe than in affected mother (".$parents_ploidy[1]."x)";
+            }
+            else{
+              $caveats = "Copy loss is more severe than in both unaffect father (".$parents_ploidy[0]."x), and affected mother (".$parents_ploidy[1]."x)";
+            }
+          }
+        }
+
+    }
+    if($F[3] and $F[3] ne "-"){ # prexisting caveat from CNV caller
+      if(defined $caveats){
+        $caveats .= "; $F[3]" unless $caveats =~ /\b$F[3]\b/;
+      }
+      else{
+       $caveats  = $F[3];
+      }
+    }
+
+    # Sort by start for consistency
+    my @target_parents = sort {$feature_range{$cnv_chr}->{$a}->[0]->[0] <=> $feature_range{$cnv_chr}->{$b}->[0]->[0]} @$target_parents;
+
+    for my $target_parent (@target_parents){
+      my $target_caveats = $caveats;
+      my $strand = $feature_strand{$target_parent};
+      # report the gain/loss of each gene separately, for simplicity in downstream analysis
+      my $cnv_exon_start = 10000000000; # genomic coords
+      my $cnv_exon_end = 0;
+      my $cnv_cdna_start = 0; # cDNA coords
+      my $cnv_cdna_end = 0;
+      my $off5 = 0; # border outside the exon?
+      my $off3 = 0;
+      my @feature_ranges = @{$feature_range{$cnv_chr}->{$target_parent}};
+      # find the first and last exons in the gene that are inside the CNV
+      for my $subregion (@feature_ranges){
+        # exon overlaps CNV?
+        if($subregion->[0] <= $cnv_end and $subregion->[1] >= $cnv_start){
+          if($cnv_exon_start > $subregion->[0]){
+            if($cnv_start < $subregion->[0]){
+              $cnv_exon_start = $subregion->[0]; $off5 = 1;
+              $cnv_cdna_start = $subregion->[2];
+            }
+            else{
+              $cnv_exon_start = $cnv_start; $off5 = 0;
+              $cnv_cdna_start = $subregion->[2]+($strand eq "-" ? $subregion->[0]-$cnv_start: $cnv_start-$subregion->[0]);
+            }
+          }
+          if($cnv_exon_end < $subregion->[1]){
+            if($cnv_end > $subregion->[1]){
+              $cnv_exon_end = $subregion->[1]; $off3 = 1;
+              $cnv_cdna_end = $subregion->[2]+($strand eq "-" ? $subregion->[0]-$subregion->[1] : $subregion->[1]-$subregion->[0]);
+            }
+            else{
+              $cnv_exon_end = $cnv_end; $off3 = 0;
+              $cnv_cdna_end = $subregion->[2]+($strand eq "-" ? $subregion->[0]-$cnv_end : $cnv_end-$subregion->[0]);
+            }
+          }
+        }
+      }
+
+    my $ends_internally = 0;
+    if($cnv_exon_end == 0){ # ends inside the exon
+      $cnv_exon_end = $cnv_end;
+      $ends_internally = 1;
+    }
+    # See if it's in the structural variant database
+    my @gain_coverage; $#gain_coverage = $cnv_exon_end-$cnv_exon_start; # preallocate blanks
+    my @loss_coverage; $#loss_coverage = $cnv_exon_end-$cnv_exon_start; # preallocate blanks
+    my $dgv_loss_id; # report the DGV entry that covers most of the observed structural variant
+    my $dgv_loss_length = 0; # report the DGV entry that covers most of the observed structural variant
+    my $dgv_gain_id; # report the DGV entry that covers most of the observed structural variant
+    my $dgv_gain_length = 0; # report the DGV entry that covers most of the observed structural variant
+    my $gains;
+    my $losses;
+    my $dgv_chr = $cnv_chr;
+    $dgv_chr =~ s/^chr//; # no prefix in DGV
+    #open(DGV, "tabix $dgv_file $dgv_chr:$cnv_exon_start-$cnv_exon_end |") # check out CNV in this gene model region
+    #  or die "Cannot run tabix: $!\n";
+    open(DGV, "/dev/null");
+    while(<DGV>){
+      my @C = split /\t/, $_;
+      next if $C[4] ne "CNV"; # todo: handle indels?
+      my $dgv_start = $C[2];
+      my $dgv_end = $C[3];
+      my $dgv_direction = $C[5];
+      my $gain_cov_count = 0;
+      my $loss_cov_count = 0;
+      if($dgv_direction eq "Gain"){
+        for(my $i = ($dgv_start < $cnv_exon_start ? $cnv_exon_start : $dgv_start); $i <= $dgv_end and $i <= $cnv_exon_end; $i++){
+          $gain_coverage[$i-$cnv_exon_start] = 1 unless defined $gain_coverage[$i-$cnv_exon_start];
+          $gain_cov_count++;
+        }
+      }
+      elsif($dgv_direction eq "Loss"){
+        for(my $i = ($dgv_start < $cnv_exon_start ? $cnv_exon_start : $dgv_start); $i <= $dgv_end and $i <= $cnv_exon_end; $i++){
+          $loss_coverage[$i-$cnv_exon_start] = 1 unless defined $loss_coverage[$i-$cnv_exon_start];
+          $loss_cov_count++;
+        }
+      }
+      if($dgv_direction eq "Gain" and $gain_cov_count > $dgv_gain_length){
+        $dgv_gain_id = $C[0];
+        $dgv_gain_length = $gain_cov_count;
+      }
+      if($dgv_direction eq "Loss" and $loss_cov_count > $dgv_loss_length){
+        $dgv_loss_id = $C[0];
+        $dgv_loss_length = $loss_cov_count;
+      }
+    }
+    close(DGV);
+
+    my $gain_coverage = 0;
+    for my $count (@gain_coverage){
+      $gain_coverage++ if defined $count;
+    }
+    $gain_coverage = sprintf "%.3f", $gain_coverage/($cnv_exon_end-$cnv_exon_start+1); # make it a proportion
+    my $loss_coverage = 0;
+    for my $count (@loss_coverage){
+      $loss_coverage++ if defined $count;
+    }
+    $loss_coverage = sprintf "%.3f", $loss_coverage/($cnv_exon_end-$cnv_exon_start+1); # make it a proportion
+
+    my $src = "DGV";
+    my $dgv_id = "NA";
+    my $dgv_caveat;
+    my $dgv_coverage;
+    if($ploidy > 2){
+      if(not defined $dgv_gain_id){
+        if(defined $dgv_loss_id){
+          $dgv_id = sprintf "%s/%.3f", $dgv_loss_id, $dgv_loss_length/($cnv_exon_end-$cnv_exon_start+1);
+          $dgv_caveat = "; No gains are known in healthy controls, the DGV2 ID reported is for a loss in the same area";
+          $dgv_coverage = $loss_coverage;
+        }
+        else{
+          $dgv_id = "novel";
+          $dgv_coverage = "NA";
+          $src = "NA";
+        }
+      }
+      else{
+        $dgv_id = sprintf "%s/%.3f", $dgv_gain_id, $dgv_gain_length/($cnv_exon_end-$cnv_exon_start+1);
+        $dgv_coverage = $gain_coverage;
+      }
+    }
+    elsif($ploidy < 2){
+      if(not defined $dgv_loss_id){
+        if(defined $dgv_gain_id){
+          $dgv_id = sprintf "%s/%.3f", $dgv_gain_id, $dgv_gain_length/($cnv_exon_end-$cnv_exon_start+1);
+          $dgv_caveat = "; No losses are known in healthy controls, the DGV2 ID reported is for a gain in the same area";
+          $dgv_coverage = $gain_coverage;
+        }
+        else{
+          $dgv_id = "novel";
+          $dgv_coverage = "NA";
+          $src = "NA";
+        }
+      }
+      else{
+        $dgv_id = sprintf "%s/%.3f", $dgv_loss_id, $dgv_loss_length/($cnv_exon_end-$cnv_exon_start+1);
+        $dgv_coverage = $loss_coverage;
+      }
+    }
+
+    my $non_coding = 0;
+    if(not exists $feature_cds_max{$target_parent} or not defined $feature_cds_max{$target_parent} or $feature_cds_max{$target_parent} eq ""){
+        $non_coding = 1;
+    }
+    $target_caveats .= $dgv_caveat if defined $dgv_caveat and $dgv_id ne "novel" and $target_caveats !~ /\Q$dgv_caveat\E/;
+    #print "Recorded $cnv_chr:$cnv_start caveat $caveats\n";
+      # if it doesn't overlap an exon, we need to find out which two exons it's between
+    if($ends_internally){
+        my $intron_found = 0;
+        for(my $i = 0; $i < $#feature_ranges; $i++){
+          if($feature_ranges[$i]->[1] < $cnv_start and $feature_ranges[$i+1]->[0] > $cnv_end){
+            if($ploidy > 2){ # gain
+              if($strand eq "-"){
+                record_snv("$target_parent\t",
+                   ($non_coding ? "g.$cnv_start\_$cnv_end" :
+                    "c.".($feature_ranges[$i+1]->[2])."+".($feature_ranges[$i+1]->[0]-$cnv_end)."_".($feature_ranges[$i+1]->[2]+1)."-".($cnv_start-$feature_ranges[$i]->[1])),
+                # pos	Zygosity	P-value	Variant Reads	Total Reads	Ref Bases	Var Bases	Population Frequency Source	Pop. freq. or DGV2 gain/loss coverage	dbSNP or DGV2 ID
+                   "[".($ploidy-1)."]\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\tNA\t$p_value\tNA\tNA\t",
+                   "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n");
+              }
+              else{
+                record_snv("$target_parent\t",
+                   ($non_coding ? "g.$cnv_start\_$cnv_end" :
+                   "c.".($feature_ranges[$i+1]->[2]-1)."+".($cnv_start-$feature_ranges[$i]->[1])."_".$feature_ranges[$i+1]->[2]."-".($feature_ranges[$i+1]->[0]-$cnv_end)),
+                   "[".($ploidy-1)."]\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\tNA\t$p_value\tNA\tNA\t",
+                   "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n");
+              }
+            }
+            else{ # loss
+              if($strand eq "-"){
+                record_snv("$target_parent\t",
+                   ($non_coding ? "g.$cnv_start\_$cnv_end" :
+                   "c.".($feature_ranges[$i+1]->[2])."+".($feature_ranges[$i+1]->[0]-$cnv_end)."_".($feature_ranges[$i+1]->[2]+1)."-".($cnv_start-$feature_ranges[$i]->[1])),
+                   "del\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\t", ($ploidy == 1 ? "heterozygote" : "homozygote"), "\t$p_value\tNA\tNA\t",
+                   "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n");
+              }
+              else{
+                record_snv("$target_parent\t",
+                   ($non_coding ? "g.$cnv_start\_$cnv_end" :
+                   "c.".($feature_ranges[$i+1]->[2]-1)."+".($cnv_start-$feature_ranges[$i]->[1])."_".$feature_ranges[$i+1]->[2]."-".($feature_ranges[$i+1]->[0]-$cnv_end)),
+                   "del\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\t", ($ploidy == 1 ? "heterozygote" : "homozygote"), "\t$p_value\tNA\tNA\t",
+                   "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n");
+              }
+            }
+            $intron_found = 1; last;
+          }
+        }
+        warn "Logic error: CNV overlaps a gene ($target_parent), but is neither intronic nor exonic. Offending CNV was $_\n" unless $intron_found;
+        next;
+      }
+      if($strand eq "-"){
+        my $tmp = $cnv_cdna_start;
+        $cnv_cdna_start = $cnv_cdna_end;
+        $cnv_cdna_end = $tmp;
+      }
+      # Make the location label pretty descriptive
+      my $cnv_phase = "";
+      if($cnv_exon_start > $cnv_start or $cnv_exon_end < $cnv_end){
+        $cnv_phase = "CNV-$cnv_chr:$cnv_start-$cnv_end"; # Use phase to note that it's part of a bigger CNV than just the range of this feature
+      }
+      # if we get here, we're in a gained/deleted exon category
+      # CNVs are fuzzy-edged (as opposed to large indels), so produce HGVS syntax that reflect this
+      if($ploidy > 2){ # gain
+        # find the exons encompassed by the CNV. NOTE that we assume that polyploidies are proximal
+        record_snv("$target_parent\t",
+                   ($non_coding ? "g.".($cnv_exon_start > $cnv_start ? "$cnv_exon_start-?" : $cnv_start)."_".($cnv_exon_end < $cnv_end ? "$cnv_exon_end+?" : $cnv_end) :
+                   "c.$cnv_cdna_start".($off5?"-?":"")."_$cnv_cdna_end".($off3?"+?":"")),
+                   "[".($ploidy-1)."]\t$strand\t$cnv_chr\t$cnv_exon_start\t$cnv_exon_end\tNA\t$p_value\tNA\tNA\t",
+                   "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t$cnv_phase\n");
+      }
+      else{ # loss
+        #translate genome coordinates into cDNA coordinates
+        record_snv("$target_parent\t",
+                   ($non_coding ? "g.".($cnv_exon_start > $cnv_start ? "$cnv_exon_start-?" : $cnv_start)."_".($cnv_exon_end < $cnv_end ? "$cnv_exon_end+?" : $cnv_end) :
+                   "c.$cnv_cdna_start".($off5?"-?":"")."_$cnv_cdna_end".($off3?"+?":"")),
+                   "del\t$strand\t$cnv_chr\t$cnv_exon_start\t$cnv_exon_end\t", ($ploidy == 1 ? "heterozygote" : "homozygote"), "\t$p_value\tNA\tNA\t",
+                   "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t$cnv_phase\n");
+      }
+    }
+  }
+  close(CNV);
+
+}
+
+
+#sort genes by start, then longest if tied
+my %rc = qw(A T T A G C C G N N S W W S K M M K Y R R Y X X);
+print STDERR "Processing variant calls..." unless $quiet;
+%chr_read = ();
+open(VCFIN, $input_file)
+    or die "Cannot open $input_file for reading: $!\n";
+while(<VCFIN>){
+    if(/^\s*(?:#|$)/){ # blank or hash comment lines
+       next;
+    }
+    chomp;
+    my @fields = split /\t/, $_;
+
+    next unless exists $feature_range{$fields[0]};
+    if(not $quiet and not exists $chr_read{$fields[0]}){
+	print STDERR " $fields[0]";
+	$chr_read{$fields[0]} = 1;
+	#print STDERR "(not in reference file!)" unless exists $feature_range{$fields[0]};
+    }
+
+    next if $fields[4] eq "<NON_REF>"; # GVCF background stuff
+    next if $fields[9] eq "./." or $fields[9] eq "."; # no call
+    my $chr = $fields[0];
+    next if defined $which_chr and $chr ne $which_chr and $chr ne "chr$which_chr" and "chr$chr" ne $which_chr;
+    print STDERR "passes chr and field # test" if grep /dataset_7684.dat/, @ARGV;
+    $chr = "chr$chr" if $chr !~ /^chr/;
+    $fields[8] =~ s/\s+$//;
+    my @values = split /:/, $fields[9];
+    #print STDERR join(" / ", @values), "\n" if $. == 84;
+    my @keys = split /:/, $fields[8];
+    my $zygosity = "n/a";
+    my $quality = "n/a";
+    my $read_depth = "n/a";
+    my $variant_depths = "n/a";
+    for(my $i = 0; $i <= $#keys and $i <= $#values; $i++){ # values max index check because some genotypers are nasty and don't provide as many fields as they say they will
+	if($keys[$i] eq "GT"){
+            if($values[$i] =~ /\./ or $values[$i] =~ /0\/0/){ # one genotype not described
+              $zygosity = "none";
+              last;
+            }
+	    else{ # genotypes described
+              $zygosity = $values[$i] =~ /[02]/ ? "heterozygote" : "homozygote";
+            }
+	}
+        elsif($keys[$i] eq "DNM_CONFIG" and $zygosity eq "n/a"){ # hack
+            $zygosity = $values[$i] =~ /^(.)\1/ ? "homozygote" : "heterozygote";
+        }
+    	elsif($keys[$i] eq "GQ" and $values[$i] ne "."){
+            #print "Checking GQ (index $i) $values[$i] gq2p\n" if $. == 84;
+	    $quality = gq2p($values[$i]);
+	}
+    	elsif($keys[$i] eq "RD"){ # from some tools like denovo variant finders
+	    $read_depth = $values[$i];
+	}
+    	elsif($keys[$i] eq "DP"){
+	    $read_depth = $values[$i];
+	}
+        # the frequency of the variant can go by many names, here we have freebayes (A* are new and old versions) and atlas2_indel
+    	elsif($keys[$i] eq "AA" or $keys[$i] eq "VR" or $keys[$i] eq "AO"){
+	    $variant_depths = $values[$i];
+	}
+        # here we have GATK variant freq of form ref#,var#
+    	elsif($keys[$i] eq "AD"){
+	    $variant_depths = $values[$i];
+            $variant_depths =~ s/^\d+,//;
+	}
+        else{
+            #print STDERR "Ignoring field $keys[$i]\n";
+        }
+    }
+    next if $zygosity eq "none"; # GVCF non-ref for example or when multiple samples are reported simultaneously
+    $quality = z2p($1) if $fields[7] =~ /Z=(\d+\.\d+)/;
+    if($quality eq "n/a" and $fields[5] ne "."){
+      $quality = gq2p($fields[5]);
+    }
+    if($fields[5] eq "0" and $fields[6] eq "PASS"){ # not qual and a PASS in the filter column
+      $quality = 1;
+    }
+    elsif($quality ne "n/a" and $quality > $min_pvalue){ # p-value cutoff
+      #print "Checking call quality $fields[5]  gq2p\n" if $. == 84;
+      next unless gq2p($fields[5]) <= $min_pvalue; # in some cases, programs like FreeBayes give low genotype quality such as when a call is borderline homo/het
+                                                   # in these cases it would be silly to reject the call if their are many supporting reads.
+    }
+
+    # Some tools like GATK don't report number of variant reads...infer from other data if possible
+    if($variant_depths eq "n/a"){
+      my @key_value_pairs = split /;/, $fields[7];
+      for my $key_value_pair (@key_value_pairs){
+        if($key_value_pair !~ /^(.*?)=(.*)$/){
+          next;
+          #next if $key_value_pair eq "INDEL"; # samtools peculiarity
+          #die "Key-value pair field (column #8) does not have the format key=value for entry $key_value_pair (line #$. of ), please fix the VCF file\n";
+        }
+        if($1 eq "AB"){ # GATK: for het calls, AB is ref/(ref+var), only one variant reported per line
+          $variant_depths = "";
+          for my $ab (split /,/, $2){
+            $variant_depths .= int((1-$ab)*$read_depth).",";
+          }
+          chop $variant_depths;
+        }
+        elsif($1 eq "MLEAC"){
+        }
+        elsif($1 eq "DP4"){ # samtools: high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases
+          my @rds = split /,/, $2;
+          $variant_depths = $rds[2]+$rds[3];
+          $read_depth = $rds[0]+$rds[1]+$variant_depths;
+          if($fields[4] =~ /,/){ # samtools doesn't break down compound het calls into individual frequencies
+            my $num_alt_genotypes = $fields[4] =~ tr/,/,/;
+            $num_alt_genotypes++;
+            my $even_prop = sprintf "%.2f", $variant_depths/$read_depth/$num_alt_genotypes;
+            $variant_depths = join(",", ($even_prop) x $num_alt_genotypes);
+            if(not exists $chr2caveats{"$chr:$fields[1]"}){
+              $chr2caveats{"$chr:$fields[1]"} = "compound het var freq n/a in orig VCF file, auto set to $even_prop";
+            }
+            else{
+              $chr2caveats{"$chr:$fields[1]"} .= "; compound het var freq n/a in orig VCF file, auto set to $even_prop";
+            }
+          }
+        }
+      }
+    }
+    if($variant_depths eq "n/a"){ # usually homo var calls, can only assume all reads are variant
+      if($zygosity eq "homozygote"){
+        $variant_depths = $read_depth;
+        if($read_depth ne "n/a"){
+          if(not exists $chr2caveats{"$chr:$fields[1]"}){
+            $chr2caveats{"$chr:$fields[1]"} = "homo var freq n/a in orig VCF file, auto set to 1.0";
+          }
+          else{
+            $chr2caveats{"$chr:$fields[1]"} = "; homo var freq n/a in orig VCF file, auto set to 1.0";
+          }
+        }
+      }
+      else{
+        if($read_depth ne "n/a"){
+          $variant_depths = int($read_depth/2);
+          if(not exists $chr2caveats{"$chr:$fields[1]"}){
+            $chr2caveats{"$chr:$fields[1]"} = "het var freq n/a in orig VCF file, auto set to 0.5";
+          }
+          else{
+            $chr2caveats{"$chr:$fields[1]"} = "; het var freq n/a in orig VCF file, auto set to 0.5";
+          }
+        }
+        else{
+          $variant_depths = $read_depth;
+        }
+      }
+    }
+
+    my $target_parents = $feature_intervaltree{$chr}->fetch($fields[1]-$flanking_bases, $fields[1]+length($fields[3])+$flanking_bases);
+    # Last ditch, if not in a gene model, is it at least in an enrichment region?
+    if(not @$target_parents){
+       next if not exists $enrichment_regions{$chr};
+       my $regions_ref = $enrichment_regions{$chr};
+       my $location = $fields[1];
+       my $strand = "+"; # for lack of a better choice
+       for(my $i = find_earliest_index($location-$flanking_bases, $regions_ref);
+           $i < $#$regions_ref and $location <= $regions_ref->[$i]->[1]+$flanking_bases;
+           $i++){
+          next unless $regions_ref->[$i]->[0]-$flanking_bases <= $location and $regions_ref->[$i]->[1]+$flanking_bases >= $location;
+
+          my $feature_name = "enrichment_target_$chr:".$regions_ref->[$i]->[0]."-".$regions_ref->[$i]->[1];
+          $feature_type{$feature_name} = "misc_enrichment_kit_target";
+          $feature_length{$feature_name} = $regions_ref->[$i]->[1]-$regions_ref->[$i]->[0]+1;
+          my @variants = split /,/, $fields[4];
+          my @variant_depths = split /,/, $variant_depths;
+          my $ref = uc($fields[3]);
+          for(my $j = 0; $j <= $#variants; $j++){
+            my $variant = $variants[$j];
+            next if $variant eq "<NON_REF>" or $variant_depths[$j] eq "0"; # GVCF stuff
+            my $variant_depth = $variant_depths[$j];
+            if($min_prop){
+               next unless $variant_depth >= $min_depth and $read_depth ne "n/a" and $variant_depth/$read_depth >= $min_prop;
+            }
+            if(length($ref) == 1 and length($variant) == 1){ # SNP
+              record_snv("$feature_name\tg.$location",
+                         "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+                         join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
+            }
+            elsif(length($ref) == 1 and length($variant) > 1){ # insertion
+              record_snv("$feature_name\tg.$location\_",($location+1),
+                         "ins",substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+                         join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
+            }
+            elsif(length($variant) == 1 and length($ref) > 1){ # deletion
+              record_snv("$feature_name\tg.$location\_",($location+length($ref)-1),
+                         "del",substr($ref, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+                         join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
+            }
+            else{ # indel
+              record_snv("$feature_name\tg.$location\_",($location+length($ref)-1),
+                         "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+                         join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
+            }
+          } # end for variants
+          next; # process next record, we've done all we can with a non-coding-gene SNP
+       }
+    }
+
+    for my $target_parent (@$target_parents){
+
+        print STDERR "Checking parent $target_parent for on $chr:$fields[1] $fields[3] -> $fields[4]\n" if grep /dataset_7684.dat/, @ARGV;
+    my @feature_ranges = @{$feature_range{$chr}->{$target_parent}};
+    # Calculate the position of the change within the feature range position
+    my $strand = $feature_strand{$target_parent};
+    my $trans_table = exists $feature_transl_table{$target_parent} ? $feature_transl_table{$target_parent} : $default_transl_table;
+    $fields[4]=~tr/"//d; # sometime strangely surroundsed by quotes
+    my @variants = split /,/, $fields[4];
+    my @variant_depths = split /,/, $variant_depths;
+    my @refs = (uc($fields[3])) x scalar(@variants);
+    my @locations = ($fields[1]) x scalar(@variants);
+
+    for(my $j = 0; $j <= $#variants; $j++){
+	my $ref = $refs[$j];
+	my $location = $locations[$j];
+	my $feature_offset = 0;
+	my $feature_num = 0;
+	my $variant = uc($variants[$j]);
+        next if $variant eq "<NON_REF>" or $variant_depths[$j] eq "0"; # GVCF stuff
+	my $variant_depth = $variant_depths[$j] || "n/a";
+        #print STDERR "Evaluating target parent $target_parent for $chr:$fields[1]-".($fields[1]+length($fields[3]))." -> ",join("/", @$target_parents) , "\n" if $fields[1] == 982994;
+
+        # Break down MNPs into individual SNPs that are phased (TODO: skip if it's an inversion? would require amalgamating SNPs for tools that call them individually, phased :-P)
+        if(length($variant) > 1 and length($variant) == length($ref)){
+           my @subvariants;
+           my @subrefs;
+           my @sublocations;
+           my $phase_range = $location."-".($location+length($ref)-1);
+           for(my $k = 0; $k < length($variant); $k++){
+             my $r = substr($ref, $k, 1);
+             my $v = substr($variant, $k, 1);
+             if($r ne $v){
+               push @subvariants, $v;
+               push @subrefs, $r;
+               push @sublocations, $location+$k;
+             }
+             elsif(@variants == 1){
+               next; # homo ref call
+             }
+             if($zygosity eq "heterozygote"){
+               # need to ignore case where a homozygous call (variant or ref) is included in a double non-ref het MNP
+               if(@variants > 1){
+                 my $homo = 1;
+                 for(my $l = 0; $l <= $#variants; $l++){ # using loop in case we handle oligoploid genomes in the future
+                   if(length($variants[$l]) <= $k or substr($variants[$l], $k, 1) ne $v){
+                     $homo = 0;
+                     last;
+                   }
+                 }
+                 next if $homo;
+               }
+               my $phase_key = "$chr:".($location+$k).":$v";
+               my $phase_label = "M$j-$chr:$phase_range";
+               if(exists $chr2phase{$phase_key}){
+                 if($chr2phase{$phase_key} !~ /$phase_label/){
+                   $chr2phase{$phase_key} .= ",$phase_label";
+                 }
+               }
+               else{
+                 $chr2phase{$phase_key} = $phase_label;
+               }
+             }
+           }
+           # recycle this MNP variant loop to start processing the individual SNPs
+           splice(@refs, $j, 1, @subrefs);
+           splice(@variants, $j, 1, @subvariants);
+           splice(@locations, $j, 1, @sublocations);
+           splice(@variant_depths, $j, 1, ($variant_depth) x scalar(@subvariants));
+           $j--;
+           next;
+        }
+
+        if($min_prop != 0 and $variant_depth eq "n/a" or $variant_depth eq "."){
+          print STDERR "Could not parse variant depth from $_\n" unless $quiet;
+          next;
+        }
+        next unless $min_prop == 0 or $min_prop and $variant_depth >= $min_depth and $read_depth ne "n/a" and $variant_depth/$read_depth >= $min_prop;
+        if($zygosity eq "NA"){
+          # make the call ourselves
+          $zygosity = $variant_depths/$read_depth > 1-$min_prop ? "homozygote" : "heterozygote";
+        }
+	# e.g. chr22   47857671   . CAAAG   AAGAT,AAAAG    29.04  .
+	if(length($variant) > 1 and length($variant) == length($ref)){
+	    for(my $k = 0; $k < length($variant); $k++){
+		my $fixed_variant = $variant;
+		substr($fixed_variant, $k, 1) = substr($ref, $k, 1);
+		if($fixed_variant eq $ref){ # single base difference at base k between the two
+		    $ref = substr($ref, $k, 1);
+		    $variant = substr($variant, $k, 1);
+		    $location += $k;
+		    last;
+		}
+	    }
+	}
+
+        # samtools reports indels with long common tails, causing non-canonical HGVS reporting and a problem when looking up the variant in dbSNP etc.
+        # remove common tails to variant calls in order to try to rectify this
+        else{
+            while(length($ref) > 1 and length($variant) > 1 and substr($ref, length($ref)-1) eq substr($variant, length($variant)-1)){
+                chop $ref; chop $variant;
+            }
+        }
+
+        # See if a caveat should be added because the indel was in a polyhomomer region
+        if(length($ref) > length($variant) and index($ref, $variant) == 0 and $fasta_index->fetch("$chr:".($location+1)."-".($location+length($ref)+1)) =~ /^([ACGT])\1+$/i){
+          if(not exists $chr2caveats{"$chr:$location"}){
+            $chr2caveats{"$chr:$location"} = "poly".uc($1)." region deletion";
+          }
+          elsif($chr2caveats{"$chr:$location"} !~ /poly/){
+            $chr2caveats{"$chr:$location"} .= "; poly".uc($1)." region deletion";
+          }
+        }
+        elsif(length($ref) < length($variant) and index($variant, $ref) == 0 and substr($variant, 1) =~ /^([ACGT])\1+$/i){
+          if(not exists $chr2caveats{"$chr:$location"}){
+            $chr2caveats{"$chr:$location"} .= "poly".uc($1)." region insertion";
+          }
+          elsif($chr2caveats{"$chr:$location"} !~ /poly/){
+            $chr2caveats{"$chr:$location"} .= "; poly".uc($1)." region insertion";
+          }
+        }
+
+        # Not a protein-coding gene?  Report genomic cooordinates for HGVS
+        if(not exists $feature_cds_max{$target_parent} or not defined $feature_cds_max{$target_parent} or $feature_cds_max{$target_parent} eq ""){
+            if(length($ref) == 1 and length($variant) == 1){ # SNP
+              record_snv("$target_parent\tg.$location",
+                         "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+                         join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
+            }
+            elsif(length($ref) == 1 and length($variant) > 1){ # insertion
+              record_snv("$target_parent\tg.$location\_",($location+1),
+                         "ins",substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+                         join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
+            }
+            elsif(length($variant) == 1 and length($ref) > 1){ # deletion
+              record_snv("$target_parent\tg.$location\_",($location+length($ref)-1),
+                         "del",substr($ref, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+                         join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
+            }
+            else{ # indel
+              record_snv("$target_parent\tg.$location\_",($location+length($ref)-1),
+                         "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+                         join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n");
+            }
+            next; # process next record, we've done all we can with a non-coding-gene SNP
+        }
+
+	if($strand eq "-"){
+	    # set up utr offset for correct CDS coordinates
+	    for(my $i = $#feature_ranges; $i >= 0; $i--){
+		# exon is completely 5' of the start
+		if($feature_ranges[$i]->[0] > $feature_cds_max{$target_parent}){
+		    #print STDERR "Whole 5' UTR exon $i: ",$feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1,"\n";
+		    $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1;
+		}
+		# exon with the cds start
+		elsif($feature_ranges[$i]->[1] >= $feature_cds_max{$target_parent} and
+		      $feature_ranges[$i]->[0] <= $feature_cds_max{$target_parent}){
+		    #print STDERR "Start codon in exon $i: ", $feature_cds_max{$target_parent} - $feature_ranges[$i]->[1],"\n";
+		    $feature_offset += $feature_cds_max{$target_parent} - $feature_ranges[$i]->[1];
+		    last;
+		}
+		else{
+		    die "The CDS for $target_parent (on negative strand) ends downstream ",
+		    "($feature_cds_max{$target_parent}) of the an exon",
+		    " (", $feature_ranges[$i]->[0], "), which is illogical.  Please revise the GFF file provided.\n";
+		}
+	    }
+	    for(my $i = $#feature_ranges; $i >= 0; $i--){
+		my $feature = $feature_ranges[$i];
+		# in the 3' UTR region of the gene
+		if($location < $feature_cds_min{$target_parent}){
+		    my $feature_exon = 0;
+		    $feature = $feature_ranges[$feature_exon];
+		    while($location > $feature->[1]+$flanking_bases and
+			  $feature_exon < $#feature_ranges){
+			$feature = $feature_ranges[++$feature_exon]; # find the 3' utr exon in which the variant is located
+		    }
+		    # utr exons passed entirely
+		    my $stop_exon = $feature_exon;
+		    while($feature_ranges[$stop_exon]->[1] < $feature_cds_min{$target_parent}){
+			$stop_exon++;
+		    }
+		    my $post_offset = $feature_cds_min{$target_parent}-$feature_ranges[$stop_exon]->[0]; # offset from the stop codon in the final coding exon
+		    while($stop_exon > $feature_exon){
+			$post_offset += $feature_ranges[$stop_exon]->[1]-$feature_ranges[$stop_exon]->[0]+1;
+			$stop_exon--;
+		    }
+
+		    my $pos = $feature->[1]-$location+1+$post_offset;
+                    my $junction_dist;
+		    if($location < $feature->[0]){ # after a UTR containing exon? set as .*DD+DD
+                        $junction_dist = ($feature->[0]-$location);
+			$pos = ($post_offset+$feature->[1]-$feature->[0]+1)."+".$junction_dist;
+		    }
+		    elsif($location > $feature->[1]){ # before a total UTR exon? set as .*DD-DD
+                        $junction_dist = -($location-$feature->[1]);
+			$pos = ($post_offset+1).$junction_dist;
+		    }
+                    else{ # in the UTR exon
+                        if($location - $feature->[0] < $feature->[1] - $location){ # location is closer to exon donor site
+                            $junction_dist = -($location - $feature->[0]+1); # +1 for HGVS syntax compatibility (there is no position 0, direct skip from -1 to +1)
+                        }
+                        else{
+                            $junction_dist = $feature->[1] - $location +1;
+                        }
+                    }
+		    if(length($ref) == 1 and length($variant) == 1){
+                        my $rc = join("",map({$rc{$_}} split(//,reverse($variant))));
+			# 3' UTR SNP
+			record_snv("$target_parent\tc.*$pos",
+			"$rc{$ref}>$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			#"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			join("\t",prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n");
+		    }
+		    elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){
+                        my $rc = join("",map({$rc{$_}} split(//,reverse(substr($variant,1)))));
+			# 3' UTR insertion
+			record_snv("$target_parent\tc.*",
+			hgvs_plus($pos,-1),"_*",$pos,
+			"ins$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			#"ins",substr($variant,1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n");
+		    }
+		    elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){
+                        my $rc = join("",map({$rc{$_}} split(//,reverse($ref))));
+                        my $delBases = substr($rc,0,length($rc)-1);
+			if(length($ref) == 2){
+			    # 3' UTR single base deletion
+			    record_snv("$target_parent\tc.*",hgvs_plus($pos,-1),
+			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n");
+			}
+			else{
+			    # longer 3' UTR deletion
+			    record_snv("$target_parent\tc.*",
+			    hgvs_plus($pos,-length($ref)+1),"_*",hgvs_plus($pos, -1),
+			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n");
+			}
+		    }
+		    else{
+			my $rc = join("",map({$rc{$_}} split(//,reverse($variant))));
+			if($rc eq $ref and length($variant) > 3){
+			    # 3' UTR inversion
+			    record_snv("$target_parent\tc.*",
+			    hgvs_plus($pos,-length($ref)+1),"_*",$pos,
+			    "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n");
+			    last;
+			}
+
+			# complex substitution in 3' UTR
+			# Will break if stop codon is modified
+			record_snv("$target_parent\tc.*",
+			hgvs_plus($pos,-length($ref)+1),"_*", $pos,
+			"delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n");
+		    }
+		    last;
+		}
+		# in the feature
+		elsif($location >= $feature->[0] and $location <= $feature->[1]){
+		    my $pos = $feature_offset+$feature->[1]-$location+1;
+		    if($location > $feature_cds_max{$target_parent}){ #since there is no position 0, the pos is in UTR, subtract one
+			$pos = hgvs_plus($pos, -1);
+		    }
+		    my $first_exon_base = $feature_offset+1;
+                    my $exon_edge_dist = $feature->[1]-$location+1; # equiv of HGVS +X or -X intron syntax, but for exons
+                    $exon_edge_dist = $feature->[0]-$location-1 if abs($feature->[0]-$location-1) < $exon_edge_dist; # pick closer of donor and acceptor sites
+                    #print STDERR "Got ", $feature->[1]-$location+1, "vs. ", $feature->[0]-$location-1, ": chose $exon_edge_dist\n";
+		    if(length($ref) == 1 and length($variant) == 1){
+			# SNP
+			record_snv("$target_parent\tc.",
+			$pos,
+			"$rc{$ref}>$rc{$variant}\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
+                        ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
+                        #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
+		    }
+		    elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){
+			my $rc = join("",map({$rc{$_}} split(//,reverse($variant))));
+                        my $insBases = substr($rc,1);
+			# insertion
+			record_snv("$target_parent\tc.",
+			hgvs_plus_exon($pos, -1, $first_exon_base),"_",$pos,"ins$insBases",
+			"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
+                        ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
+                        #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
+		    }
+		    elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){
+			my $rc = join("",map({$rc{$_}} split(//,reverse($ref))));
+                        my $delBases = substr($rc,0,length($rc)-1);
+			# single nucleotide deletion
+			if(length($ref) == 2){
+			    record_snv("$target_parent\tc.",
+			    hgvs_plus_exon($pos, -1, $first_exon_base),
+			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
+                            ($pos-1 < 1 ? "NA" : $pos-1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
+                            #($pos-1 < 1 ? "NA" : $pos-1 < $first_exon_base ? "p.?" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
+			}
+			# longer deletion
+			else{
+                            $exon_edge_dist = $feature->[1]-$location-length($ref)+1 if abs($feature->[1]-$location-length($ref)+1) < $exon_edge_dist;
+			    record_snv("$target_parent\tc.",
+			    hgvs_plus_exon($pos, -length($ref)+1, $first_exon_base),"_",
+			    hgvs_plus_exon($pos, -1, $first_exon_base),
+			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
+                            ($pos-1 < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
+                            #($pos-1 < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
+			}
+		    }
+		    else{
+			# complex substitution
+                        $exon_edge_dist = $feature->[1]-$location-length($ref)+1 if abs($feature->[1]-$location-length($ref)+1) < $exon_edge_dist;
+			my $rc = join("",map({$rc{$_}} split(//,reverse($variant))));
+                        if($rc eq $variant and length($variant) > 3){
+                            # inversion
+                            record_snv("$target_parent\tc.",
+                                       hgvs_plus_exon($pos,-length($ref)+1,$first_exon_base),"_",
+                                       $pos,
+                                       "inv",
+                                       "\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+                                       join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
+                                       ($pos < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
+
+                            last;
+                        }
+			record_snv("$target_parent\tc.",
+			hgvs_plus_exon($pos,-length($ref)+1,$first_exon_base),"_",
+			$pos,
+			"delins$rc",
+			"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
+                        ($pos < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
+                        #($pos < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
+		    }
+		    last;
+		}
+		# 5' of feature (on negative strand)
+		elsif($location > $feature->[1]){
+		    if(length($ref) == 1 and length($variant) == 1){
+			# intronic SNP
+			if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){
+			    # Closer to acceptor site
+			    record_snv("$target_parent\tc.",$feature_offset+1,
+			    ($feature->[1]-$location),
+			    "$rc{$ref}>$rc{$variant}\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    #"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    join("\t", prop_info_key($chr,$location,$ref,$variant, $feature->[1]-$location)),"\tNA\n");
+			}
+			else{
+			    # Closer to donor site
+			    record_snv("$target_parent\tc.",$feature_offset,"+",
+			    ($feature_ranges[$i+1]->[0]-$location),
+			    "$rc{$ref}>$rc{$variant}\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    #"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    join("\t", prop_info_key($chr,$location,$ref,$variant, $feature_ranges[$i+1]->[0]-$location)),"\tNA\n");
+			}
+		    }
+		    elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){
+                        my $rc = join("",map({$rc{$_}} split(//,reverse(substr($variant,1)))));
+			if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){
+			    # intronic insertion near acceptor
+			    my $pos = ($feature_offset+1).($feature->[1]-$location);
+			    record_snv("$target_parent\tc.",
+			    hgvs_plus($pos,-1),"_",$pos,
+			    "ins",
+			    $rc,"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    #substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    join("\t", prop_info_key($chr,$location,$ref,$variant,$feature->[1]-$location-1)),"\tNA\n");
+			}
+			else{
+			    # intronic insertion near donor
+			    my $pos = $feature_offset."+".($feature_ranges[$i+1]->[0]-$location);
+			    record_snv("$target_parent\tc.",
+			    hgvs_plus($pos,-1),"_",$pos,
+			    "ins",
+			    $rc,"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    #substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    join("\t", prop_info_key($chr,$location,$ref,$variant,$feature_ranges[$i+1]->[0]-$location+1)),"\tNA\n");
+			}
+		    }
+		    elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){
+			# intronic deletion
+			# single nucleotide deletion
+			my $rc = reverse($ref);
+			$rc=~tr/ACGT/TGCA/;
+                        my $delBases = substr($rc, 0, length($rc)-1);
+			if(length($ref) == 2){
+			    # single intronic deletion near acceptor
+			    if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){
+                                my $off = $feature->[1]-$location-1;
+				record_snv("$target_parent\tc.",
+				($feature_offset+1),$off,
+				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off >= -2 ? "p.?" : "NA"),"\n");
+			    }
+			    # single intronic deletion near donor
+			    else{
+				my $pos = $feature_offset;
+                                my $off = $feature_ranges[$i+1]->[0]-$location+1;
+				record_snv("$target_parent\tc.",
+				hgvs_plus_exon($pos, $off, $pos),
+				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off <= 2 ? "p.?" : "NA"),"\n");
+			    }
+			}
+			# longer deletion
+			else{
+			    if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){
+				# long intronic deletion near acceptor
+                                my $off = $feature->[1]-$location-1;
+				my $pos = ($feature_offset+1).$off;
+				record_snv("$target_parent\tc.",
+				hgvs_plus($pos,-length($ref)+2),"_",$pos,
+				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off >= -2 ? "p.?" : "NA"),"\n");
+			    }
+			    else{
+				# long intronic deletion near donor
+                                my $off = $feature_ranges[$i+1]->[0]-$location+1;
+				my $pos = ($feature_offset)."+".$off;
+				record_snv("$target_parent\tc.",
+				$pos,"_",hgvs_plus($pos,-length($ref)-1),
+				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off-length($ref)+1 <= 2 ? "p.?" : "NA"),"\n");
+			    }
+			    last;
+			}
+		    }
+		    else{
+			my $rc = reverse($ref);
+			$rc=~tr/ACGT/TGCA/;
+			if($rc eq $variant and length($variant) > 3){
+			    # intronic inversion near acceptor site
+			    if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){
+				my $pos = ($feature_offset+1).($feature->[1]-$location);
+				record_snv("$target_parent\tc.",
+				hgvs_plus($pos,-length($ref)+1),"_",$pos,
+				"inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+				join("\t", prop_info_key($chr,$location,$ref,$variant,$feature->[1]-$location)),"\tNA\n");
+			    }
+			    else{
+				my $pos = ($feature_offset)."+".($feature_ranges[$i+1]->[0]-$location);
+				record_snv("$target_parent\tc.",
+				$pos,"_",hgvs_plus($pos, length($ref)-1),
+				"inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+				join("\t", prop_info_key($chr,$location,$ref,$variant,$feature_ranges[$i+1]->[0]-$location)),"\tNA\n");
+			    }
+			    last;
+			}
+                        $rc = reverse($variant);
+                        $rc=~tr/ACGT/TGCA/;
+			# Intronic complex substitution
+			if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){
+			    # complex intronic substitution near acceptor site
+			    my $pos = ($feature_offset+1).($feature->[1]-$location);
+			    record_snv("$target_parent\tc.",
+			    hgvs_plus($pos, -length($ref)+1),"_",$pos,
+			    "delins$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    #"delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    join("\t", prop_info_key($chr,$location,$ref,$variant,$feature->[1]-$location)),"\tNA\n");
+			}
+			else{
+			    # complex intronic substitution near donor site
+			    my $pos = $feature_offset."+".($feature_ranges[$i+1]->[0]-$location);
+			    record_snv("$target_parent\tc.",
+			    $pos,"_",hgvs_plus($pos, length($ref)-1),
+			    "delins$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    #"delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    join("\t", prop_info_key($chr,$location,$ref,$variant,$feature_ranges[$i+1]->[0]-$location)),"\tNA\n");
+			}
+		    }
+		    last;
+		}
+		else{
+		    #print STDERR "Offset going from ", $feature_offset, " to ", $feature_offset+$feature->[1]-$feature->[0]+1,"\n";
+		    $feature_offset += $feature->[1]-$feature->[0]+1;
+		    #print STDERR "Set feature offset to  $feature_offset by adding ",$feature->[1],"-", $feature->[0],"+1\n";
+		}
+	    }
+	}
+	else{
+	    # forward strand
+
+	    # set up utr offset for correct CDS coordinates
+	    for(my $i = 0; $i <= $#feature_ranges; $i++){
+		# All 5' utr exon
+		if($feature_ranges[$i]->[1] < $feature_cds_min{$target_parent}){
+		    $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1;
+		}
+		# exon with the cds start
+		elsif($feature_ranges[$i]->[1] >= $feature_cds_min{$target_parent} and
+		      $feature_ranges[$i]->[0] <= $feature_cds_min{$target_parent}){
+		    $feature_offset -= $feature_cds_min{$target_parent} - $feature_ranges[$i]->[0];
+		    last;
+		}
+		else{
+		    die "The CDS for $target_parent starts upstream ($feature_cds_max{$target_parent}) of the first exon",
+		    " (", $feature_ranges[$i]->[0], "), which is illogical.  Please revise the GFF file provided.\n";
+		}
+	    }
+	    for(my $i = 0; $i <= $#feature_ranges; $i++){
+		my $feature = $feature_ranges[$i];
+		# 3' of last coding position
+		if($location > $feature_cds_max{$target_parent}){
+		    # find the exon with the stop codon
+		    while($feature->[1] < $feature_cds_max{$target_parent}){
+			$feature = $feature_ranges[++$i];
+		    }
+		    my $post_offset = $feature->[0]-$feature_cds_max{$target_parent};
+		    while($location > $feature->[1]+$flanking_bases and
+			  $i < $#feature_ranges){
+			$post_offset += $feature->[1]-$feature->[0]+1;
+			$feature = $feature_ranges[++$i]; # find the 3' utr exon in which the variant is located
+		    }
+		    my $pos = $location-$feature->[0]+$post_offset;
+		    #print STDERR "Positive strand: got $pos for $location, exon #$i of $#feature_ranges, post_offset is $post_offset\n" if $location-$feature->[1] > $flanking_bases;
+                    my $off;
+		    if($location > $feature->[1]){ # after a UTR containing exon? set as .*DD+DD
+                        $off = $location-$feature->[1];
+			$pos = ($post_offset+$feature->[1]-$feature->[0]+1)."+".$off;
+		    }
+		    elsif($location < $feature->[0]){ # before a total UTR exon? set as .*DD-DD
+                        $off = -($feature->[0]-$location);
+			$pos = ($post_offset+1).$off;
+		    }
+                    else{
+                        if($location-$feature->[0] < $feature->[1]-$location){
+                           $off = $location-$feature->[0]+1; # +1 since HGVS skips right from -1 to +1 at exon boundary
+                        }
+                        else{
+                           $off = $location-$feature->[1]-1; # will be negative
+                        }
+                    }
+		    if(length($ref) == 1 and length($variant) == 1){
+			# 3' UTR SNP
+			record_snv("$target_parent\tc.*$pos",
+			"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			join("\t", prop_info_key($chr,$location,$ref,$variant, $off)),"\tNA\n");
+		    }
+		    elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){
+			# 3' UTR insertion
+			record_snv("$target_parent\tc.*",
+			hgvs_plus($pos,1),"_*",hgvs_plus($pos,2),
+			"ins",substr($variant,1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			join("\t", prop_info_key($chr,$location,$ref,$variant, $off)),"\tNA\n");
+		    }
+		    elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){
+                        my $delBases = substr($ref, 1);
+			if(length($ref) == 2){
+			    # 3' UTR single base deletion
+			    record_snv("$target_parent\tc.*",hgvs_plus($pos,1),
+			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\tNA\n");
+			}
+			else{
+			    # longer 3' UTR deletion
+			    record_snv("$target_parent\tc.*",
+			    hgvs_plus($pos,1),"_*",hgvs_plus($pos,length($ref)-1),
+			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    join("\t", prop_info_key($chr,$location,$ref,$variant, $off)),"\tNA\n");
+			}
+		    }
+		    else{
+			my $rc = reverse($ref);
+			$rc=~tr/ACGT/TGCA/;
+			if($rc eq $variant and length($variant) > 3){
+			    # 3' UTR inversion
+			    record_snv("$target_parent\tc.*$pos",
+			    "_*",hgvs_plus($pos,length($ref)-1),
+			    "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\tNA\n");
+			    last;
+			}
+			# complex substitution in 3' UTR
+			record_snv("$target_parent\tc.*$pos",
+			"_*",hgvs_plus($pos,length($ref)-1),
+			"delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\tNA\n");
+		    }
+		    last;
+		}
+		# in the exon
+		elsif($location >= $feature->[0] and $location <= $feature->[1]){
+		    my $pos = $feature_offset+$location-$feature->[0]+1;
+		    my $last_exon_base = $feature_offset+$feature->[1]-$feature->[0]+1;
+                    my $exon_edge_dist = $location-$feature->[0]+1; # equiv of HGVS +X or -X intron syntax, but for exons
+                    $exon_edge_dist = $location-$feature->[1]-1 if abs($location-$feature->[1]-1) < $exon_edge_dist; # pick closer of donor and acceptor sites
+                    #print STDERR "Got ", $location-$feature->[0]+1, "vs. ", $location-$feature->[1]-1, ": chose $exon_edge_dist\n";
+		    if($location < $feature_cds_min{$target_parent}){ #since there is no position 0, the pos is in UTR, subtract one
+			$pos = hgvs_plus($pos, -1);
+		    }
+		    if(length($ref) == 1 and length($variant) == 1){
+			# SNP
+			record_snv("$target_parent\tc.$pos",
+			"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
+                        ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
+                        #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
+		    }
+		    elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){
+			# insertion
+			record_snv("$target_parent\tc.$pos",
+			"_",hgvs_plus_exon($pos,1,$last_exon_base),"ins",
+			substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
+                        ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
+                        #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
+		    }
+		    elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){
+                        my $delBases = substr($ref, 1);
+			# single nucleotide deletion
+			if(length($delBases) == 1){
+			    record_snv("$target_parent\tc.",
+			    hgvs_plus_exon($pos,1,$last_exon_base),
+			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
+                            ($pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
+                            #($pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
+			}
+			# longer deletion
+			else{
+                            $exon_edge_dist = $feature->[1]-$location-length($ref)-1 if abs($feature->[1]-$location-length($ref)-1) < $exon_edge_dist;
+			    record_snv("$target_parent\tc.",
+			    hgvs_plus_exon($pos,1,$last_exon_base),"_",
+			    hgvs_plus_exon($pos,length($ref)-1,$last_exon_base),
+			    "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
+                            ($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
+                            #($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
+			}
+		    }
+		    else{
+                        $exon_edge_dist = $feature->[1]-$location-length($ref)-1 if abs($feature->[1]-$location-length($ref)-1) < $exon_edge_dist;
+			my $rc = reverse($ref);
+			$rc=~tr/ACGT/TGCA/;
+			if($rc eq $variant and length($variant) > 3){
+			    # inversion
+			    record_snv("$target_parent\tc.$pos",
+			    "_",hgvs_plus_exon($pos,length($ref)-1, $last_exon_base),
+			    "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
+                            ($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
+                            #($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
+			    last;
+			}
+			# complex substitution
+			record_snv("$target_parent\tc.$pos",
+			"_",hgvs_plus_exon($pos, length($ref)-1, $last_exon_base),
+			"delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t",
+                        ($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
+                        #($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n");
+		    }
+		    last;
+		}
+		# 5' of feature
+		elsif($location < $feature->[0]){
+		    if(length($ref) == 1 and length($variant) == 1){
+			# intronic SNP
+			if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){
+			    # Closer to donor site
+			    record_snv("$target_parent\tc.",$feature_offset,"+",
+			    ($location-$feature_ranges[$i-1]->[1]),
+			    "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n");
+			}
+			else{
+			    # Closer to acceptor site
+			    record_snv("$target_parent\tc.",$feature_offset+1,
+			    ($location-$feature->[0]),
+			    "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n");
+			}
+		    }
+		    elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){
+			if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){
+			    # intronic insertion near donor
+			    my $pos = $feature_offset."+".($location-$feature_ranges[$i-1]->[1]);
+			    record_snv("$target_parent\tc.",
+			    $pos,"_",hgvs_plus($pos,1),
+			    "ins",
+			    substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n");
+			}
+			else{
+			    # intronic insertion near acceptor
+			    my $pos = ($feature_offset+1).($location-$feature->[0]);
+			    record_snv("$target_parent\tc.",
+			    $pos,"_",hgvs_plus($pos,1),
+			    "ins",
+			    substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n");
+			}
+		    }
+		    elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){
+			# intronic deletion
+			# single nucleotide deletion
+                        my $delBases = substr($ref, 1);
+			if(length($ref) == 2){
+			    # single intronic deletion near donor
+			    if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){
+                                my $off = $location-$feature_ranges[$i-1]->[1]+1;
+				record_snv("$target_parent\tc.",
+				$feature_offset,"+",$off,
+				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off <= 2 ? "p.?" : "NA"),"\n");
+			    }
+			    # single intronic deletion near acceptor
+			    else{
+				my $pos = ($feature_offset+1);
+                                my $off = $location-$feature->[0];
+				record_snv("$target_parent\tc.",
+				hgvs_plus_exon($pos, $off, $pos),
+				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off >= -2 ? "p.?" : "NA"),"\n");
+			    }
+			}
+			# longer deletion
+			else{
+			    if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){
+				# long intronic deletion near donor
+                                my $off = $location-$feature_ranges[$i-1]->[1]+1;
+				my $pos = $feature_offset."+".$off;
+				record_snv("$target_parent\tc.",
+				$pos,"_",hgvs_plus($pos,length($ref)-2),
+				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off <= 2 ? "p.?" : "NA"),"\n");
+			    }
+			    else{
+				# long intronic deletion near acceptor
+                                my $off = $location-$feature->[0]+1;
+				my $pos = ($feature_offset+1).$off;
+				record_snv("$target_parent\tc.",
+				$pos,"_",hgvs_plus($pos,length($ref)-2),
+				"del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+				join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off+length($ref)-2 >= -2 ? "p.?" : "NA"),"\n");
+			    }
+			}
+		    }
+		    else{
+			my $rc = reverse($ref);
+			$rc=~tr/ACGT/TGCA/;
+			if($rc eq $variant and length($variant) > 3){
+			    # intronic inversion near donor site
+			    if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){
+				my $pos = $feature_offset."+".($location-$feature_ranges[$i-1]->[1]);
+				record_snv("$target_parent\tc.",
+				$pos,"_",hgvs_plus($pos,length($ref)-1),
+				"inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+				join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n");
+			    }
+			    else{
+				my $pos = ($feature_offset+1).($location-$feature->[0]);
+				record_snv("$target_parent\tc.",
+				$pos,"_",hgvs_plus($pos, length($ref)-1),
+				"inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+				join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n");
+			    }
+			    last;
+			}
+			# Intronic complex substitution
+			# Note: sub maybe have  comma in it to denote two possible variants
+			if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){
+			    # complex intronic substitution near donor site
+			    my $pos = $feature_offset."+".($location-$feature_ranges[$i-1]->[1]);
+			    record_snv("$target_parent\tc.",
+			    $pos,"_",hgvs_plus($pos, length($ref)-1),
+			    "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n");
+			}
+			else{
+			    # complex intronic substitution near acceptor site
+			    my $pos = ($feature_offset+1).($location-$feature->[0]);
+			    record_snv("$target_parent\tc.",
+			    $pos,"_",hgvs_plus($pos, length($ref)-1),
+			    "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t",
+			    join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n");
+			}
+		    }
+		    last;
+		}
+		else{
+		    # feature is past this exon
+		    $feature_offset += $feature->[1]-$feature->[0]+1;
+		}
+	    }
+	}
+    }  # for each variant on the line
+  } # for each  gene overlapping the site the VCF describes
+} # for each VCF line
+print STDERR "\n" unless $quiet;
+close(VCFIN);
+
+# Before we can start printing the variants, we need to look at the phase information and calculate the real haplotype HGVS changes
+#if(keys %chr2phase){
+  # Note that we could have samtools read-backed haplotype info, MNPs in the VCF, and pre-existing haplotypes in the input VCF (e.g. imputed or based on Mendelian inheritance where trios exist)
+  # We need to create new disjoint sets of phased blocks from the (consistent) union of these data.
+#  my $chr2phase2variants = combine_phase_data(\%chr2phase);
+
+  # TODO: Calculate protein HGVS syntax for each variant, now that all phase data has been incorporated
+  #for my $chr (keys %$chr2phase2variants){
+  #  for my $phase (keys %{$chr2phase2variants{$chr}){
+  #    # apply all phased changes to the reference chromosomal seq
+  #    my $phased_seq = $seq{$chr}; #reference
+  #    # sort the variants from 3' to 5' so that edits after indels don't need adjustment in their ref coordinate
+  #    my @sorted_variants = sort {my($a_pos) = $a =~ /:(\d+):/; my($b_pos) = $b =~ /:(\d+):/; $b_pos <=> $a_pos} @{$chr2phase2variants{$chr}->{$phase}};
+  #    for my $variant (@sorted_variants){
+  #    }
+  #  }
+  #}
+#}
+
+# retrieve the MAF info en masse for each chromosome, as this is much more efficient
+my @out_lines;
+for my $snv (@snvs){
+  chomp $snv;
+  my @fields = split /\t/, $snv;
+  # For CNVs, all the fields are already filled out
+  if(@fields > 13){
+    push @out_lines, join("\t", $feature_type{$fields[0]}, ($fields[0] =~ /\S/ ? $feature_length{$fields[0]} : "NA"), @fields);
+    next;
+  }
+  my $variant_key = $fields[9];
+  $fields[9] = join("\t", prop_info($dbsnp,$internal_snp,$variant_key));
+  my $from = $fields[4];
+  my $chr_pos_key = $fields[3].":".$from;
+  my $to = $fields[4]; # true for SNPs and insertions
+  my @variant_key = split /:/, $variant_key;
+  # For deletions and complex variants, calculate the affected reference genome range and set the 'to'
+  if(length($variant_key[2]) > 1){
+    $to += length($variant_key[2])-1;
+  }
+  splice(@fields, 5, 0, $to);
+
+  # Otherwise expand the key into the relevant MAF values
+  $fields[0] =~ s/\/chr.*$//;  # some transcript ids are repeated... we expect "id/chr#" in the GTF file to distinguish these, but should get rid of them at reporting time
+  # the offset from the nearest exon border if coding
+  push @fields, ($#variant_key > 3 ? $variant_key[4] : "");
+  # add gene name(s)
+  push @fields, range2genes($fields[3], $from, $to+1);
+  # add caveats
+  my $c = $fields[3];
+  if(not exists $chr2mappability{$c}){
+    if($c =~ s/^chr//){
+      # nothing more
+    }
+    else{
+      $c = "chr$c";
+    }
+  }
+  my $mappability_caveats = exists $chr2mappability{$c} ? $chr2mappability{$c}->fetch($fields[4], $fields[4]+1) : [];
+  if(ref $mappability_caveats eq "ARRAY" and @$mappability_caveats){
+    my %h;
+    if(exists $chr2caveats{$chr_pos_key}){
+      if($chr2caveats{$chr_pos_key} !~ /non-unique/){
+         $chr2caveats{$chr_pos_key} = join("; ", grep {not $h{$_}++} @$mappability_caveats)."; ".$chr2caveats{$chr_pos_key};
+      }
+    }
+    else{
+      $chr2caveats{$chr_pos_key} = join("; ", grep {not $h{$_}++} @$mappability_caveats);
+    }
+  }
+  push @fields, (exists $chr2caveats{$chr_pos_key} ? $chr2caveats{$chr_pos_key} : "");
+  # add phase
+  push @fields, find_phase($variant_key);
+  push @out_lines, join("\t", $feature_type{$fields[0]}, $feature_length{$fields[0]}, @fields);
+}
+
+# Now tabulate the rare variant numbers
+my %gene2rares;
+my %gene2aa_rares;
+for my $line (@out_lines){
+  my @F = split /\t/, $line, -1;
+  if($F[15] eq "NA" or $F[15] < $rare_variant_prop and (!$internal_snp or $F[17] < $rare_variant_prop)){
+    my $gene_list = $internal_snp ? $F[20] : $F[19];
+    next unless defined $gene_list;
+    for my $g (split /; /, $gene_list){
+      $gene2rares{$g}++;
+      # Check the cDNA HGVS syntax for relevance
+      if($F[3] =~ /c.\d+/ or # coding
+         $F[3] =~ /c.\d+.*-[12]/ or # splicing acceptor
+         $F[3] =~ /c.\d+\+[12345]/){ # splicing donor
+        $gene2aa_rares{$g}++;
+      }
+    }
+  }
+}
+
+# Print the results
+for my $line (@out_lines){
+  my @F = split /\t/, $line, -1;
+  my $gene_list = $internal_snp ? $F[20] : $F[19];
+  if(not defined $gene_list){
+    print OUT join("\t", @F, "", ""), "\n"; next;
+  }
+
+  my $max_gene_rare = 0;
+  my $max_gene_aa_rare = 0;
+  for my $g (split /; /, $gene_list){
+    next unless exists $gene2rares{$g};
+    if($gene2rares{$g} > $max_gene_rare){
+      $max_gene_rare = $gene2rares{$g};
+    }
+    next unless exists $gene2aa_rares{$g};
+    if($gene2aa_rares{$g} > $max_gene_aa_rare){
+      $max_gene_aa_rare = $gene2aa_rares{$g};
+    }
+  }
+  print OUT join("\t", @F, $max_gene_rare, $max_gene_aa_rare), "\n";
+}
+close(OUT);
+
+sub range2genes{
+  my ($c, $from, $to) = @_;
+  if(not exists $gene_ids{$c}){
+    if($c =~ s/^chr//){
+      # nothing more
+    }
+    else{
+      $c = "chr$c";
+    }
+  }
+  if(exists $gene_ids{$c}){
+    my %have;
+    return join("; ", grep {not $have{$_}++} @{$gene_ids{$c}->fetch($from, $to+1)});
+  }
+  else{
+    return "";
+  }
+}
+sub combine_phase_data{
+  my ($phases) = @_; # map from variant to its phase data
+
+  # Create a reverse mapping from phase regions to their variants
+  my %chr2phase_region2variants;
+  my @variants = keys %$phases;
+  for my $variant (@variants){
+    my ($chr) = $variant =~ /^\S+?-(\S+):/;
+    $chr2phase_region2variants{$chr} = {} if not exists $chr2phase_region2variants{$chr};
+    for my $phase_region (split /,/, $phases->{$variant}){
+      $chr2phase_region2variants{$chr}->{$phase_region} = [] if not exists $chr2phase_region2variants{$chr}->{$phase_region};
+      push @{$chr2phase_region2variants{$phase_region}}, $variant;
+    }
+  }
+
+  # Now for each phased block known so far, see if any variant in it is also part of another block
+  # If so, do a union since phasing is both transitive and commutative.
+  # The quickest way to do this is to check for overlapping intervals, then check for common members amongst those that do overlap
+  for my $chr (keys %chr2phase_region2variants){
+    my @ordered_phase_regions = sort {my($a_pos) = $a =~ /:(\d+)/; my($b_pos) = $b =~ /:(\d+)/; $a_pos <=> $b_pos} keys %{$chr2phase_region2variants{$chr}};
+    my $sets = new DisjointSets(scalar(@ordered_phase_regions));
+
+    for (my $i = 0; $i < $#ordered_phase_regions; $i++){
+      my ($start, $stop, $variant) = $ordered_phase_regions[$i];
+      for (my $j = $i+1; $j <= $#ordered_phase_regions; $j++){
+        my ($start2, $stop2, $variant2) = $ordered_phase_regions[$j];
+        if($start2 > $stop){ # won't overlap any regions after this in the sorted list
+          last;
+        }
+        # If we get here, it is implicit that $stop >= $start2 and $start < $stop2, i.e. there is overlap
+        # Now check if there is a shared variant (otherwise we might erroneously join blocks from different physical chromosomal arms)
+        my $have_shared_variant = 0;
+        my $overlapping_phase_region = $ordered_phase_regions[$j];
+        for my $variant (@{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$i]}}){
+          if($phases->{$variant} =~ /\b$overlapping_phase_region\b/){
+            $have_shared_variant = 1; last;
+          }
+        }
+        # sanity check that there aren't conflicting variants in the new block (i.e. two different variants in the same position)
+        my %pos2base;
+        my $have_conflicting_variant = 0;
+        for my $variant (@{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$i]}}, @{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$j]}}){
+          my ($pos, $base) = $variant =~ /(\d+):(.+?)$/;
+          if(exists $pos2base{$pos} and $pos2base{$pos} ne $base){
+            # conflict, note with a caveat
+            if(exists $chr2caveats{"$chr:$pos"}){
+              $chr2caveats{"$chr:$pos"} .= "; inconsistent haplotype phasing" unless $chr2caveats{"$chr:$pos"} =~ /inconsistent haplotype phasing/;
+            }
+            else{
+              $chr2caveats{"$chr:$pos"} = "inconsistent haplotype phasing";
+            }
+            $have_conflicting_variant ||= 1;
+          }
+          elsif(not exists $pos2base{$pos}){
+            $pos2base{$pos} = $base;
+          }
+        }
+
+        $sets->union($i+1, $j+1) if $have_shared_variant and not $have_conflicting_variant; # indexes are one-based for sets rather than 0-based
+      }
+    }
+    my $phase_sets = $sets->sets; #disjoint haplotype sets
+    my %region_counts;
+    for my $phase_set (@$phase_sets){
+      next if scalar(@$phase_set) == 1; # No change to existing phase region (is disjoint from all others)
+      # Generate a new haploblock to replace the old ones that are being merged
+      my $merged_start = 10000000000;
+      my $merged_end = 0;
+      for my $ordered_phase_region_index (@$phase_set){
+        my ($start, $end) = $ordered_phase_regions[$ordered_phase_region_index-1] =~ /(\d+)-(\d+)$/;
+        $merged_start = $start if $start < $merged_start;
+        $merged_end = $end if $end > $merged_end;
+      }
+      # At the start of the region is a unique prefix so we can tell the arms apart if two haploblocks have the exact same boundary
+      my $region_count = $region_counts{"$merged_start-$merged_end"}++;
+      my $merged_haploblock_name = "Y$region_count-$chr:$merged_start-$merged_end";
+      # Assign this new name to overwrite the premerge values for each variant contained within
+      for my $ordered_phase_region_index (@$phase_set){
+        for my $variant (@{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$ordered_phase_region_index-1]}}){ # incl. one-based set correction in 0-based array index
+          print STDERR "Merging $variant from ", $phases->{$variant}, " into new block $merged_haploblock_name\n";
+          $phases->{$variant} = $merged_haploblock_name;
+        }
+      }
+    }
+    # TODO: if there are overlapping phase blocks still, but with different variants in the same position, we can infer that they are on the opposite strands...
+  }
+}
+
+# Sees if the positions of the variant are in the range of a phased haplotype, returning which allele it belongs to
+sub find_phase{
+  my ($chr,$pos,$ref,$variant) = split /:/, $_[0];
+  return "" if length($ref) != length($variant); # Can only deal with SNPs (and broken down MNPs) for now
+  for(my $i = 0; $i < length($ref); $i++){
+    my $key = "$chr:".($pos+$i).":".substr($variant, $i, 1);
+    #print STDERR "Checking phase for $key\n" if $pos == 12907379;
+    if(exists $chr2phase{$key}){
+      #print STDERR "returning phase data $chr2phase{$key}\n";
+      return $chr2phase{$key};
+    }
+    elsif(exists $chr2phase{"chr".$key}){
+      #print STDERR "returning phase data ", $chr2phase{"chr".$key}, "\n";
+      return $chr2phase{"chr".$key};
+    }
+  }
+  return "";
+}
+
+sub find_earliest_index{
+  # employs a binary search to find the smallest index that must be the starting point of a search of [start,end] elements sorted in an array by start
+   my ($query, $array) = @_;
+
+   return 0 if $query < $array->[0]->[0];
+
+   my ($left, $right, $prevCenter) = (0, $#$array, -1);
+
+   while(1){
+      my $center = int (($left + $right)/2);
+
+      my $cmp = $query <=> $array->[$center]->[0] || ($center == 0 || $query != $array->[$center-1]->[0] ? 0 : -1);
+
+      return $center if $cmp == 0;
+      if ($center == $prevCenter) {
+         return $left;
+      }
+      $right = $center if $cmp < 0;
+      $left  = $center if $cmp > 0;
+      $prevCenter = $center;
+   }
+}
+