Mercurial > repos > yusuf > miseq_bam_variants
changeset 0:1a23ea467feb default tip
intial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Thu, 26 Mar 2015 09:36:17 -0600 |
parents | |
children | |
files | GenomeAnalysisTK.jar MiSeqBAMtoVariants.xml call_miseq_variants capture_kits.py combine_hgvs_tables freebayes_parallel gatk_haplotypecaller_parallel hgvs_collapse_transcripts split_hgvs_by_confidence tool-data/miseq_bam_variants.loc vcf2hgvs_table |
diffstat | 11 files changed, 3606 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MiSeqBAMtoVariants.xml Thu Mar 26 09:36:17 2015 -0600 @@ -0,0 +1,27 @@ +<?xml version="1.0"?> + +<tool id="call_miseq_vars" name="Call miSeq variants"> + <description>using a BAM file</description> + <version_string>echo 1.0.0</version_string> + <command interpreter="perl">call_miseq_variants $__tool_data_path__ geno . $inputBAM ${targetBED}.bed ${ref_genome}.bed ${ref_genome}.fa 12</command> + <inputs> + <param format="bam" name="inputBAM" type="data" label="Source BAM (mapped and realigned reads) file"/> + <param name="ref_genome" type="genomebuild" label="Reference genome" help="against which the reads were aligned"/> + <param name="targetBED" type="select" display="radio" dynamic_options="kit_fileOptions()" label="The enrichment (capture) kit used for the sequencing experiment"/> + </inputs> + <outputs> + <data format="achri_snp_table" label="Confident genotypes" name="confident_genotypes" from_work_dir="geno.combined.collapsed.confident.hgvs.txt"/> + <data format="achri_snp_table" label="Marginal genotypes" name="marginal_genotypes" from_work_dir="geno.combined.collapsed.marginal.hgvs.txt"/> + <data format="text" label="Genotyping log" name="genotyping log" from_work_dir="geno.call_variants.log.txt"/> + </outputs> + + <!-- the following code populates the selection from the public dbSNP datasets available in the local Galaxy installation --> + <code file="capture_kits.py"/> + <tests/> + + <help> +This tool runs a BAM file against multiple genotypers (FreeBayes 0.8.7, GATKHaplotypeCaller 3.1) and generates a consensus (confident) table of variant calls, +including caveats about haplotype phase inconsistencies and other potential false positive indicators. + </help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/call_miseq_variants Thu Mar 26 09:36:17 2015 -0600 @@ -0,0 +1,122 @@ +#!/usr/bin/env perl + +# Locked down procedure for variant calling +use strict; +use warnings; +use File::Basename; +#@ARGV == 7 or die "Usage: $0 <sample name> <output dir> <deduped_realigned_input.bam> <target_regions.bed> <gene_regions.bed> <reference.fasta> <max num processes>\n"; + +# parse configuration file +my $dirname= dirname(__FILE__); +my $tool_data = shift @ARGV; +my %config; +if( not -e "$tool_data/miseq_bam_variants.loc" ){ + system("cp $dirname/tool-data/miseq_bam_variants.loc $tool_data/miseq_bam_variants.loc"); +} +open CONFIG, '<',"$tool_data/miseq_bam_variants.loc"; +while(<CONFIG>){ + (my $key, my $value) = split(/\s+/, $_ ); + $config{$key} = $value; +} +my $dbs_dir = $config{"reference_dbs"}; +my $name_dir = $config{"named_regions_dir"}; +my $capt_dir = $config{"capture_kits_dir"}; +close CONFIG; + +my $sample_name = shift @ARGV; +my $outdir = shift @ARGV; +my $bam = shift @ARGV; +my $bed = $capt_dir . (shift @ARGV); +my $gene_regions = $name_dir . (shift @ARGV); +my $fasta = $dbs_dir . (shift @ARGV); +my $max_processes = shift @ARGV; + +my $log = "$outdir/$sample_name.call_variants.log.txt"; +my $gatk = "$outdir/$sample_name.gatk_haplotypecaller"; +my $freeBayes = "$outdir/$sample_name.freeBayes"; +my $atlas2_indel = "$outdir/$sample_name.atlas2_indel"; +my $phase = "$outdir/$sample_name.phase"; +my $cover = "$outdir/$sample_name.read_coverage"; +my $flagstat = "$outdir/$sample_name.flagstat"; + +my @cmds; +@cmds = ( "$dirname/gatk_haplotypecaller_parallel $bam ".($max_processes > 5 ? int($max_processes*2/3+0.5) : 2)." $gatk.vcf -R $fasta --emitRefConfidence GVCF -variant_index_type LINEAR -variant_index_parameter 128000 &>> $log", + "$dirname/freebayes_parallel $bam ".($max_processes > 5 ? int(($max_processes-1)/3+0.5) : 2)." $freeBayes.vcf -j -C 2 -i -f $fasta -X &>> $log", + "samtools phase -F $bam > $phase.txt 2>> $log"); +&run_cmds($max_processes, @cmds); + +@cmds = ("$dirname/vcf2hgvs_table -q -d 2 -p 0.05 -h 0.1 -u $dbs_dir/internal_runs.sorted.vcf.gz -m $dbs_dir/mappability/hg19.crgRefSeqExomeMappability75mer.txt -x $bed -i $gatk.vcf -o $gatk.hgvs.txt -s $dbs_dir/dbsnp_1000g_esp6500.latest.vcf.gz -r $dbs_dir/hg19.fa -e $dbs_dir/hg19_refGene_gencode_ultrasensitive.gtf -b $gene_regions -z $phase.txt", + "$dirname/vcf2hgvs_table -q -d 2 -p 0.05 -h 0.1 -u $dbs_dir/internal_runs.sorted.vcf.gz -m $dbs_dir/mappability/hg19.crgRefSeqExomeMappability75mer.txt -x $bed -i $freeBayes.vcf -o $freeBayes.hgvs.txt -s $dbs_dir/dbsnp_1000g_esp6500.latest.vcf.gz -r $dbs_dir/hg19.fa -e $dbs_dir/hg19_refGene_gencode_ultrasensitive.gtf -b $gene_regions -z $phase.txt"); +&run_cmds($max_processes, @cmds); + +my $combined = "$outdir/$sample_name.combined"; +system "$dirname/combine_hgvs_tables -q true $combined.hgvs.txt GATKHaplotypeCaller $gatk.hgvs.txt FreeBayes $freeBayes.hgvs.txt >> $log"; +system "$dirname/hgvs_collapse_transcripts $combined.hgvs.txt $combined.collapsed.hgvs.txt 1 >> $log"; +system "$dirname/split_hgvs_by_confidence $combined.collapsed.hgvs.txt $combined.collapsed.confident.hgvs.txt $combined.collapsed.marginal.hgvs.txt 2 &>> $log"; + +# Check the logs for anything untowards +# e.g. FreeBayes: jumping is disabled +# or any line saying "error" +my @errors; +open(LOG, $log) + or die "Could not check log file for error message, therefore the validity of the genotypes cannot be guaranteed\n"; +while(<LOG>){ + if(/error/i or /jumping is disabled/){ + push @errors, $_; + } +} +close(LOG); + +if(@errors){ + print "Errors were detected in the genotyping log:\n", @errors; + exit 1; # failure +} +exit 0; #success + +sub run_cmds{ + + my ($max_cmds, @cmd) = @_; + + my ($num_children, $pid); + + for($num_children = 1; $num_children < $max_cmds && @cmds; $num_children++){ + # initialize the number of child processes at 1, and increment it by one + #while it is less than $max_cmds + + my $cmd = shift (@cmds); + if($pid = fork) { + # do nothing if parent + } elsif(defined $pid) { # $pid is zero here if defined + #print STDERR $cmd, "\n"; + system $cmd; + exit; + } else { + #weird fork error + die "Can't fork: $!\n"; + } + } + + while(@cmds) { + undef $pid; + FORK: { + my $cmd = shift (@cmds); + if($pid = fork) { + # parent here + $num_children++; + wait; + $num_children--; + next; + + } elsif(defined $pid) { # $pid is zero here if defined + #print STDERR $cmd, "\n"; + system $cmd; + exit; + + } else { + #weird fork error + die "Can't fork: $!\n"; + } + } + } + wait while $num_children--; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/capture_kits.py Thu Mar 26 09:36:17 2015 -0600 @@ -0,0 +1,29 @@ +import os +import re +import sys +import operator +import csv +from galaxy import config + +# get tool-data path +configur = config.Configuration() +kitDir = configur.resolve_path("tool-data") + +# determine if config file exists +if not os.path.exists( kitDir + "/miseq_bam_variants.loc" ): + kitDir = "/export/achri_galaxy/dbs/CaptureKits/"; +else: + with open(kitDir + "/miseq_bam_variants.loc", "r") as tsv: + for line in csv.reader(tsv, delimiter="\t"): + if line[0] == 'capture_kits_dir': + kitDir = line[1] + +def kit_fileOptions(): + list = os.listdir(kitDir); + list.sort() + pattern = re.compile('(.*)$') + fileOptions = [(s) for s in list if os.path.exists(kitDir + s)] + ds = [pattern.match(s) for s in fileOptions] + datasets = [(m.group(1), m.group(1), False) for m in ds if m] + return datasets +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/combine_hgvs_tables Thu Mar 26 09:36:17 2015 -0600 @@ -0,0 +1,215 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +if(@ARGV == 1 and $ARGV[0] eq "-v"){ + print "Version 1.0\n"; + exit; +} + +my $quiet = 0; +if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){ + $quiet = 1; + shift @ARGV; +} + +@ARGV >= 6 and not @ARGV%2 or die "Usage: $0 [-q(uiet)] <true|false> <output file> <method_name1> <hgvs_file_1.txt> <method_name2> <hgvs_file_2.txt> ...\n", + "Where true or false determines if the genotypes calls should all be reported (if false, they are collapsed to the unique set of values observed)\n"; + +my %zygosity; +my %quality; +my %var_count; +my %tot_count; +my %methods; +my %data; +my $chr_prefix_exists = 0; # do any of the tools report chr1, if so we need to ensure 1 get reported as chr1 later for consistency across tools + +my $collapse = $ARGV[0] =~ /^true$/i; + +open(OUT, ">$ARGV[1]") + or die "Cannot open $ARGV[1] for writing: $!\n"; + +my @headers; +my ($chr_column, $zygosity_column, $pvalue_column, $alt_cnt_column, $tot_cnt_column, $transcript_column, $cdna_hgvs_column, $pos_column, $to_column, $srcs_column); +for(my $i=2; $i<$#ARGV; $i+=2){ + my $method_name = $ARGV[$i]; + my $infile = $ARGV[$i+1]; + print STDERR "Processing $infile...\n" unless $quiet; + open(IN, $infile) + or die "Cannot open $infile for reading: $!\n"; + my $header = <IN>; # header + chomp $header; + if($i == 2){ + @headers = split /\t/, $header; + for(my $j = 0; $j <= $#headers; $j++){ + if($headers[$j] eq "Chr"){ + $chr_column = $j; + } + elsif($headers[$j] eq "Zygosity"){ + $zygosity_column = $j; + } + elsif($headers[$j] eq "P-value"){ + $pvalue_column = $j; + } + elsif($headers[$j] eq "Variant Reads"){ + $alt_cnt_column = $j; + } + elsif($headers[$j] eq "Total Reads"){ + $tot_cnt_column = $j; + } + elsif($headers[$j] eq "Selected transcript"){ + $transcript_column = $j; + } + elsif($headers[$j] eq "Transcript HGVS"){ + $cdna_hgvs_column = $j; + } + elsif($headers[$j] eq "DNA From"){ + $pos_column = $j; + } + elsif($headers[$j] eq "DNA To"){ + $to_column = $j; + } + elsif($headers[$j] eq "Sources"){ + $srcs_column = $j; + } + } + if(defined $srcs_column){ + # print header as-is + print OUT "$header\n"; + } + else{ + # print header from the first file, with extra column for methods at the end + push @headers, "Sources"; + $srcs_column = $#headers; + print OUT join("\t", @headers), "\n"; + } + } + else{ + # Make sure the columns are the same in the various files + my @latest_headers = split /\t/, $header; + for(my $j = 0; $j <= $#latest_headers && $j <= $#headers; $j++){ + if($latest_headers[$j] ne $headers[$j]){ + die "Header column ", $j+1, "($latest_headers[$j]) in $ARGV[$i] is different from the equivalent column label ($headers[$j]) in $ARGV[2]. Aborting, cannot merge the files.\n"; + } + } + } + while(<IN>){ + chomp; + my @F = split /\t/, $_, -1; # -1 to keep trailing blank fields + if(not $chr_prefix_exists and $F[$chr_column] =~ /^chr/){ + $chr_prefix_exists = 1; + } + $F[$chr_column] =~ s/^chr//; # for consistency + my $key = "$F[$transcript_column]:$F[$cdna_hgvs_column]"; # transcript_id:cdna_hgvs is shared id for common variants amongst files + # record disagreement on zygosity if any + if(not defined $zygosity{$key}){ + $zygosity{$key} = []; + $quality{$key} = []; + $var_count{$key} = []; + $tot_count{$key} = []; + $data{$key} = \@F; + #print STDERR "Missing fields (only have ", scalar(@F), " for input '$_'\n" if $#F < $#headers; + } + push @{$zygosity{$key}}, split(/; /,$F[$zygosity_column]); + push @{$quality{$key}}, $F[$pvalue_column]; + push @{$var_count{$key}}, $F[$alt_cnt_column]; + push @{$tot_count{$key}}, $F[$tot_cnt_column]; + push @{$methods{$key}}, $method_name; + } + close(IN); +} + +if($chr_prefix_exists){ + for my $key (keys %data){ + # assuming there is a mix of chr1 and 1, always add the chr for consistency + $data{$key}->[$chr_column] = "chr".$data{$key}->[$chr_column] unless $data{$key}->[$chr_column] =~ /^chr/; + } +} + +# Sort by chr, then pos, then transcript_id +for my $key (sort {$data{$a}->[$chr_column] cmp $data{$b}->[$chr_column] or + $data{$a}->[$pos_column] <=> $data{$b}->[$pos_column] or + $data{$a}->[$to_column] <=> $data{$b}->[$to_column] or + $data{$a}->[$transcript_column] cmp $data{$b}->[$transcript_column]} keys %data){ + # Before printing a line, merge the zygosity, variant quality, read count data if requested + my %seen; + if($collapse){ + my @zygosities = grep {$_ ne "NA" and not $seen{$_}++} @{$zygosity{$key}}; + if(@zygosities == 0){ + # do nothing, existing NA in 7th column will be a fine answer as they are all like tha + } + elsif(@zygosities == 1){ + # agreement by all methods, just print the one not NA + $data{$key}->[$zygosity_column] = $zygosities[0] if $data{$key}->[$zygosity_column] eq "NA"; + } + else{ + $data{$key}->[$zygosity_column] = join("/", sort keys %seen); # list all unique values that occur + } + } + else{ + $data{$key}->[$zygosity_column] = join("; ", @{$zygosity{$key}}); + } + + if($collapse){ + undef %seen; + my @qualities = grep {$_ ne "NA" and not $seen{$_}++} @{$quality{$key}}; + if(@qualities == 0){ + # do nothing, existing NA in 8th column will be a fine answer as they are all like that + } + elsif(@qualities == 1){ + # agreement by all methods, just print the one not NA + $data{$key}->[$pvalue_column] = $qualities[0] if $data{$key}->[8] eq "NA"; + } + else{ + # calculate the median for the collapsed value + my @sorted_qualities = sort {$a <=> $b} grep({$_ ne "NA"} @{$quality{$key}}); + my $median_quality = $sorted_qualities[int($#sorted_qualities/2)]; # rounds down (actually better score as this is a p-value) + $data{$key}->[$pvalue_column] = $median_quality; + } + } + else{ + $data{$key}->[$pvalue_column] = join("; ", @{$quality{$key}}); + } + + if($collapse){ + undef %seen; + my @var_counts = grep {$_ ne "NA" and not $seen{$_}++} @{$var_count{$key}}; + undef %seen; + my @tot_counts = grep {$_ ne "NA" and not $seen{$_}++} @{$tot_count{$key}}; + if(@var_counts == 0 and @tot_counts == 0){ + # do nothing, existing NAs okay + } + elsif(@var_counts == 1 and @tot_counts == 1){ + # agreement by all methods, just print the one in %data unless it's NA + $data{$key}->[$alt_cnt_column] = $var_counts[0] if $data{$key}->[$alt_cnt_column] eq "NA"; + $data{$key}->[$tot_cnt_column] = $tot_counts[0] if $data{$key}->[$tot_cnt_column] eq "NA"; + } + else{ + # calculate the median for the collapsed value + my @sorted_var_counts = sort {$a <=> $b} grep({$_ ne "NA"} @{$var_count{$key}}); + my $pivot = $#sorted_var_counts/2; + my $median_var_count = $pivot == int($pivot) ? $sorted_var_counts[$pivot] : # arithmetic mean + int(($sorted_var_counts[int($pivot)]+$sorted_var_counts[int($pivot)+1])/2); + $data{$key}->[$alt_cnt_column] = $median_var_count; + my @sorted_tot_counts = sort {$a <=> $b} grep({$_ ne "NA"} @{$tot_count{$key}}); + $pivot = $#sorted_tot_counts/2; + my $median_tot_count = $pivot == int($pivot) ? $sorted_tot_counts[$pivot] : # arithmetic mean + int(($sorted_tot_counts[int($pivot)]+$sorted_tot_counts[int($pivot)+1])/2); + $data{$key}->[$tot_cnt_column] = $median_tot_count; + } + } + else{ + # list the raw numbers + $data{$key}->[$alt_cnt_column] = join("; ", @{$var_count{$key}}); + $data{$key}->[$tot_cnt_column] = join("; ", @{$tot_count{$key}}); + } + + # to facilitate multiple rounds of combining, slash the extra column from the last round if it exists + $data{$key}->[$srcs_column] = join("; ", @{$methods{$key}}); + for(my $i = 0; $i <= $#{$data{$key}}; $i++){ + $data{$key}->[$i] = "" if not defined $data{$key}->[$i]; + } + print OUT join("\t", @{$data{$key}}), "\n"; +} +close(OUT);
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/freebayes_parallel Thu Mar 26 09:36:17 2015 -0600 @@ -0,0 +1,132 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use File::Temp; + +@ARGV >= 3 or die "Usage: $0 <input.bam> <num processes> <output.vcf> [freebayes options]\nRuns FreeBayes 0.8.7 separately for each reference chromosome, with as many concurrent processes as specified\n"; + +my $in_bam = shift @ARGV; +my $num_procs = shift @ARGV; +my $out_vcf = shift @ARGV; + +$SIG{__DIE__} = $SIG{INT} = $SIG{HUP} = $SIG{TERM} = $SIG{ABRT} = \&cleanup; +# Lines look some thing like "@SQ SN:chr1 LN:249250621" +open(SAM_HEADERS, "samtools view -H $in_bam |") + or die "Cannot run samtools on $in_bam: $!\n"; +my %seq2length; +my %seqname2orig_order; +while(<SAM_HEADERS>){ + if(/^\@SQ\s+SN:(\S+)\s+LN:(\d+)/){ + $seq2length{$1} = $2; + $seqname2orig_order{$1} = $.; + } +} +close(SAM_HEADERS); + +# Make sure the index is available and has the name FreeBayes expects +my $bamfile_stem = $in_bam; +$bamfile_stem =~ s/\.[^.]+$//; # remove last file extension, if there is one +if(not -e "$bamfile_stem.bai"){ + if(not -e "$in_bam.bai"){ + system("samtools index $in_bam") >> 8 and die "Cannot index $in_bam: samtools had exit startus $?\n"; + } + system("ln -s $in_bam.bai $bamfile_stem.bai")>> 8 and die "Cannot link exsiting BAM index $in_bam.bai to FreeBayes required file name $bamfile_stem.bai: ln had exit status $?\n"; +} + +my @cmds; +my @tmp_outfiles; +my %tmpfile2orig_order; +# Sort contigs from largest to smallest for scheduling efficiency +for my $seq_name (sort {$seq2length{$b} <=> $seq2length{$a}} keys %seq2length){ + my ($tmp_fh, $tmp_filename) = tmpnam(); + push @cmds, "/export/achri_data/programs/freebayes0.8.7 -b $in_bam -v $tmp_filename " . join(" ", @ARGV) . " -r $seq_name:1..$seq2length{$seq_name}"; + push @tmp_outfiles, $tmp_filename; + $tmpfile2orig_order{$tmp_filename} = $seqname2orig_order{$seq_name}; +} + +open(OUT_VCF, ">$out_vcf") + or die "Cannot open $out_vcf for writing: $!\n"; + +run_cmds($num_procs, @cmds); + +# Grab output header from first temp output file +open(H, $tmp_outfiles[0]) + or die "Cannot open $tmp_outfiles[0] for reading: $!\n"; +while(<H>){ + last if not /^#/; # end of headers + # mod for self-referencing meta-data + if(/^##commandline=/){ + print OUT_VCF "##commandline=$0 $in_bam $num_procs $out_vcf ".join(" ", @ARGV)."\n"; + } + else{ + # Otherwise verbatim + print OUT_VCF $_; + } + +} +close(H); + +# Concatenate the temporary results into a final outfile +for my $tmp_outfile (sort {$tmpfile2orig_order{$a} <=> $tmpfile2orig_order{$b}} @tmp_outfiles){ + open(TMP_VCF, $tmp_outfile) + or die "Cannot open $tmp_outfile for reading: $!\n"; + while(<TMP_VCF>){ + print OUT_VCF unless /^#/; + } + close(TMP_VCF); +} +close(OUT_VCF); +&cleanup; + +sub cleanup{ + unlink @tmp_outfiles; +} +sub run_cmds{ + + my ($max_cmds, @cmd) = @_; + + my ($num_children, $pid); + + for($num_children = 1; $num_children < $max_cmds && @cmds; $num_children++){ + # initialize the number of child processes at 1, and increment it by one + #while it is less than $max_cmds + + my $cmd = shift (@cmds); + if($pid = fork) { + # do nothing if parent + } elsif(defined $pid) { # $pid is zero here if defined + print STDERR $cmd, "\n"; + system $cmd; + exit; + } else { + #weird fork error + die "Can't fork: $!\n"; + } + } + + while(@cmds) { + undef $pid; + FORK: { + my $cmd = shift (@cmds); + if($pid = fork) { + # parent here + $num_children++; + wait; + $num_children--; + next; + + } elsif(defined $pid) { # $pid is zero here if defined + print STDERR $cmd, "\n"; + system $cmd; + exit; + + } else { + #weird fork error + die "Can't fork: $!\n"; + } + } + } + wait while $num_children--; +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gatk_haplotypecaller_parallel Thu Mar 26 09:36:17 2015 -0600 @@ -0,0 +1,165 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use File::Temp; +use vars qw($tmpbam_fh $tmpbam_filename); +use File::Basename; +@ARGV >= 3 or die "Usage: $0 <input.bam> <num processes> <output.vcf> [freebayes options]\nRuns GATK 3.1 separately for each reference chromosome, with as many concurrent processes as specified\n"; + +my $in_bam = shift @ARGV; +my $num_procs = shift @ARGV; +my $out_vcf = shift @ARGV; +my $dirname = dirname(__FILE__); +$SIG{__DIE__} = $SIG{INT} = $SIG{HUP} = $SIG{TERM} = $SIG{ABRT} = \&cleanup; + +# Lines look some thing like "@SQ SN:chr1 LN:249250621" +open(SAM_HEADERS, "samtools view -H $in_bam |") + or die "Cannot run samtools on $in_bam: $!\n"; +my %seq2length; +my %seqname2orig_order; +while(<SAM_HEADERS>){ + if(/^\@SQ\s+SN:(\S+)\s+LN:(\d+)/){ + $seq2length{$1} = $2; + $seqname2orig_order{$1} = $.; + } +} +close(SAM_HEADERS); + +# GATK will complain of the file doesn't have a .bam ending, which will happen if the file was uploaded to Galaxy (###.dat) +# Also, we need to make sure there is an index file +if($in_bam !~ /\.bam$/){ + ($tmpbam_fh, $tmpbam_filename) = tmpnam(); + system("ln -s $in_bam $tmpbam_filename.bam")>> 8 and die "Cannot create symbolic link from $in_bam to $tmpbam_filename.bam: exit status $?\n"; + if(not -e "$in_bam.bai"){ + system("samtools index $in_bam")>> 8 and die "Cannot create index for BAM file $in_bam: exit status $?\n"; + } + system("ln -s $in_bam.bai $tmpbam_filename.bam.bai")>> 8 and die "Cannot create symbolic link from $in_bam.bai to $tmpbam_filename.bam.bai: exit status $?\n"; + $in_bam = "$tmpbam_filename.bam"; +} +elsif(not -e "$in_bam.bai"){ + system("samtools index $in_bam")>> 8 and die "Cannot create index for BAM file $in_bam: exit status $?\n"; +} + +# Peak into the BAM to see if it's got Illumina encoded quality values, which will require an extra flag for GATK +my $is_illumina_encoded_qual = 1; +my $num_mapped = 0; +if(open(SAMTOOLS, "samtools view $in_bam |")){ + while(<SAMTOOLS>){ + my @F = split /\t/, $_; + next unless $F[2] ne "*"; # ignore unmapped reads + my @quals = map {ord($_)} split(//, $F[10]); + if(grep {$_ < 64} @quals){ + $is_illumina_encoded_qual = 0; last; + } + last if ++$num_mapped > 100; # only look at the first 100 mapped reads + } +} # take our chances if samtools call failed that it's not illumina encoded, which will fail quickly in GATK anyways +close(SAMTOOLS); + +my @cmds; +my @tmp_outfiles; +my %tmpfile2orig_order; +# Sort contigs from largest to smallest for scheduling efficiency +for my $seq_name (sort {$seq2length{$b} <=> $seq2length{$a}} keys %seq2length){ + my ($tmp_fh, $tmp_filename) = tmpnam(); + my $intervals_tmpfile = $tmp_filename.".intervals"; + open(INTERVAL, ">$intervals_tmpfile") or die "Cannot open $intervals_tmpfile for writing: $!\n"; + print INTERVAL "$seq_name:1-$seq2length{$seq_name}\n"; + close(INTERVAL); + push @cmds, "java -Xmx24G -jar $dirname/GenomeAnalysisTK.jar -I $in_bam -T HaplotypeCaller -o $tmp_filename ".join(" ", @ARGV)." -L $intervals_tmpfile -nct 2". + ($is_illumina_encoded_qual?" --fix_misencoded_quality_scores":""); + push @tmp_outfiles, $tmp_filename; + $tmpfile2orig_order{$tmp_filename} = $seqname2orig_order{$seq_name}; +} + +open(OUT_VCF, ">$out_vcf") + or die "Cannot open $out_vcf for writing: $!\n"; + +# div 2 because assigning 2 threads per process for efficiency +run_cmds(int($num_procs/2+0.5), @cmds); + +# Grab output header from first temp output file +open(H, $tmp_outfiles[0]) + or die "Cannot open $tmp_outfiles[0] for reading: $!\n"; +while(<H>){ + last if not /^#/; # end of headers + # mod for self-referencing meta-data + if(/^##commandline=/){ + print OUT_VCF "##commandline=$0 $in_bam $num_procs $out_vcf ".join(" ", @ARGV)."\n"; + } + else{ + # Otherwise verbatim + print OUT_VCF $_; + } + +} +close(H); + +# Concatenate the temporary results into a final outfile +for my $tmp_outfile (sort {$tmpfile2orig_order{$a} <=> $tmpfile2orig_order{$b}} @tmp_outfiles){ + open(TMP_VCF, $tmp_outfile) + or die "Cannot open $tmp_outfile for reading: $!\n"; + while(<TMP_VCF>){ + print OUT_VCF unless /^#/; + } + close(TMP_VCF); +} +close(OUT_VCF); +&cleanup; + +sub cleanup{ + for my $t (@tmp_outfiles){ + unlink $t, "$t.intervals"; + } + unlink "$tmpbam_filename.bam", "$tmpbam_filename.bam.bai" if defined $tmpbam_filename; +} + +sub run_cmds{ + + my ($max_cmds, @cmd) = @_; + + my ($num_children, $pid); + + for($num_children = 1; $num_children < $max_cmds && @cmds; $num_children++){ + # initialize the number of child processes at 1, and increment it by one + #while it is less than $max_cmds + + my $cmd = shift (@cmds); + if($pid = fork) { + # do nothing if parent + } elsif(defined $pid) { # $pid is zero here if defined + print STDERR $cmd, "\n"; + system $cmd; + exit; + } else { + #weird fork error + die "Can't fork: $!\n"; + } + } + + while(@cmds) { + undef $pid; + FORK: { + my $cmd = shift (@cmds); + if($pid = fork) { + # parent here + $num_children++; + wait; + $num_children--; + next; + + } elsif(defined $pid) { # $pid is zero here if defined + print STDERR $cmd, "\n"; + system $cmd; + exit; + + } else { + #weird fork error + die "Can't fork: $!\n"; + } + } + } + wait while $num_children--; +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgvs_collapse_transcripts Thu Mar 26 09:36:17 2015 -0600 @@ -0,0 +1,201 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +# reports the variant in transcripts separately only if the AA change is different, or distance from splicing site is different +@ARGV == 2 or @ARGV == 3 or die "Usage: $0 <hgvs input.txt> <output.txt> [ignore splice distance diff]\n"; + +my %ftype_rank = ( protein_coding => 100, + processed_transcript => 90, + antisense => 80, + retained_intron => 70, + lincRNA => 60, + nonsense_mediated_decay => 50, + misc_enrichment_kit_target => 0); + +my %mtype_rank = ( nonsense => 100, + frameshift => 99, + nonstop => 90, + missense => 80, + silent => 50, + "non-coding" => 40); +open(IN, $ARGV[0]) + or die "Cannot open $ARGV[0] for reading: $!\n"; +open(OUT, ">$ARGV[1]") + or die "Cannot open $ARGV[1] for writing: $!\n"; +my $succinct = (@ARGV == 3); +my @lines; +my $last_chr = ""; +my $last_pos = ""; +my $last_alt = ""; +my %buffered_F; +my %buffered_id_rank; +my $header = <IN>; +print OUT $header; + +my ($chr_column, $pos_column, $alt_column, $cdna_hgvs_column, $aa_hgvs_column, $phase_column, $transcript_column, $transcript_length_column, $exon_dist_column, $ftype_column, $mtype_column, $splicing_score_column, $splicing_effect_column, $sources_column); +chomp $header; +my @headers = split /\t/, $header; +for(my $i = 0; $i <= $#headers; $i++){ + if($headers[$i] eq "Chr"){ + $chr_column = $i; + } + elsif($headers[$i] eq "Feature type"){ + $ftype_column = $i; + } + elsif($headers[$i] eq "Variant type"){ + $mtype_column = $i; + } + elsif($headers[$i] eq "DNA From" or $headers[$i] eq "DNA Pos"){ + $pos_column = $i; + } + elsif($headers[$i] eq "Obs base"){ + $alt_column = $i; + } + elsif($headers[$i] eq "Transcript HGVS"){ + $cdna_hgvs_column = $i; + } + elsif($headers[$i] eq "Protein HGVS"){ + $aa_hgvs_column = $i; + } + elsif($headers[$i] eq "Phase"){ + $phase_column = $i; + } + elsif($headers[$i] eq "Selected transcript"){ + $transcript_column = $i; + } + elsif($headers[$i] eq "Transcript length"){ + $transcript_length_column = $i; + } + elsif($headers[$i] eq "Closest exon junction (AA coding variants)"){ + $exon_dist_column = $i; + } + elsif($headers[$i] eq "Splicing alteration potential"){ + $splicing_score_column = $i; + } + elsif($headers[$i] eq "Splicing alteration details"){ + $splicing_effect_column = $i; + } + elsif($headers[$i] eq "Sources"){ + $sources_column = $i; + } +} +if(not defined $chr_column){ + die "Could not find Chr header in $ARGV[0], aborting.\n"; +} +if(not defined $pos_column){ + die "Could not find 'DNA From' header in $ARGV[0], aborting.\n"; +} +if(not defined $alt_column){ + die "Could not find 'Obs base' header in $ARGV[0], aborting.\n"; +} +if(not defined $cdna_hgvs_column){ + die "Could not find 'Transcript HGVS' header in $ARGV[0], aborting.\n"; +} +if(not defined $aa_hgvs_column){ + die "Could not find 'Protein HGVS' header in $ARGV[0], aborting.\n"; +} +if(not defined $phase_column){ + die "Could not find Phase header in $ARGV[0], aborting.\n"; +} +if(not defined $transcript_column){ + die "Could not find 'Selected transcript' header in $ARGV[0], aborting.\n"; +} +if(not defined $transcript_length_column){ + die "Could not find 'Transcript length' header in $ARGV[0], aborting.\n"; +} +#if(not defined $mtype_column){ +# die "Could not find 'Variant type' header in $ARGV[0], aborting.\n"; +#} +if(not defined $ftype_column){ + die "Could not find 'Feature type' header in $ARGV[0], aborting.\n"; +} +if(not defined $exon_dist_column){ + die "Could not find 'Closest exon junction (AA coding variants)' header in $ARGV[0], aborting.\n"; +} + +while(<IN>){ + chomp; + my @F = split /\t/, $_, -1; + my $chr = $F[$chr_column]; + my $pos = $F[$pos_column]; + my $alt = $F[$alt_column]; + my $cdna_hgvs = $F[$cdna_hgvs_column]; + my $hgvs = $F[$aa_hgvs_column]; + my $ftype = $F[$ftype_column]; + my $mtype; + $mtype = $F[$mtype_column] if defined $mtype_column; + $hgvs =~ s/\d+//g; # look only at the non-position parts of the AA syntax + my $exon_edge_distance = $F[$exon_dist_column]; + my $phase = $F[$phase_column] || ""; + # in the case of large indels (i.e. CNVs), their positions may not be the same, but effectively they should be reported as such + if($phase =~ /CNV-\S+?:(\S+)/){ + $pos = $1; + } + my $preferred_id = $succinct ? ($F[$transcript_column] =~ /^$succinct/o) : 0; + my $id_rank = $F[$transcript_column]; + $id_rank =~ s/^.*?0*(\d+)(\.\d+)?$/$1/; # look at only trailing non-padded number (and no .version suffix) + + my $collapse_key = $succinct ? "$chr:$pos:$alt" : "$chr:$pos:$hgvs:$exon_edge_distance:$phase"; + # Same variant + if($chr eq $last_chr and $pos eq $last_pos and $alt eq $last_alt){ + # same AA effect + if(not exists $buffered_F{$collapse_key}){ + $buffered_F{$collapse_key} = \@F; + $buffered_id_rank{$collapse_key} = $id_rank; + } + else{ + $buffered_F{$collapse_key}->[$sources_column] .= "; $F[$sources_column]" if defined $sources_column; + $ftype_rank{$ftype} = 0 if not defined $ftype_rank{$ftype}; + $mtype_rank{$mtype} = 0 if defined $mtype and not exists $mtype_rank{$mtype}; + my $buf_ftype = $buffered_F{$collapse_key}->[$ftype_column]; + $ftype_rank{$buf_ftype} = 0 if not defined $ftype_rank{$buf_ftype}; + my $buf_mtype; + $buf_mtype = $buffered_F{$collapse_key}->[$mtype_column] if defined $mtype_column; + $mtype_rank{$buf_mtype} = 0 if defined $buf_mtype and not exists $mtype_rank{$buf_mtype}; + # see if this transcript is "earlier" than the other, based on ID # + if($ftype_rank{$ftype} > $ftype_rank{$buf_ftype} or + (defined $mtype and $mtype_rank{$mtype} > $mtype_rank{$buf_mtype}) or + $preferred_id or + ($id_rank =~ /^\d+$/ and $buffered_id_rank{$collapse_key} =~ /^\d+$/ and $id_rank < $buffered_id_rank{$collapse_key}) + or $F[$transcript_column] lt $buffered_F{$collapse_key}->[$transcript_column]){ # alphabetical as backup + # make this the first one reported (and use its HGVS syntax) + $buffered_F{$collapse_key}->[$transcript_length_column] = $F[$transcript_length_column]."; ".$buffered_F{$collapse_key}->[$transcript_length_column]; + $buffered_F{$collapse_key}->[$transcript_column] = $F[$transcript_column]."; ".$buffered_F{$collapse_key}->[$transcript_column]; + $buffered_F{$collapse_key}->[$cdna_hgvs_column] = $F[$cdna_hgvs_column]; + $buffered_F{$collapse_key}->[$aa_hgvs_column] = $F[$aa_hgvs_column]; + $buffered_id_rank{$collapse_key} = $id_rank; + } + else{ + # just append it to the list of IDs + $buffered_F{$collapse_key}->[$transcript_length_column] .= "; $F[$transcript_length_column]"; + $buffered_F{$collapse_key}->[$transcript_column] .= "; $F[$transcript_column]"; + } + if($succinct and defined $splicing_score_column and $F[$splicing_score_column] ne "NA" and $F[$splicing_score_column] > $buffered_F{$collapse_key}->[$splicing_score_column]){ + $buffered_F{$collapse_key}->[$splicing_score_column] = $F[$splicing_score_column]; + $buffered_F{$collapse_key}->[$splicing_effect_column] = $F[$splicing_effect_column] ." (transcript model ".$F[$transcript_column].")"; + } + } + } + # Different variant from the last line, dump any buffered data + else{ + for my $collapse_key (keys %buffered_F){ + if(defined $sources_column){ + my %seen; + $buffered_F{$collapse_key}->[$sources_column] = join("; ", grep {not $seen{$_}++} split(/; /, $buffered_F{$collapse_key}->[$sources_column])); + } + print OUT join("\t", @{$buffered_F{$collapse_key}}), "\n"; + } + undef %buffered_F; + $last_chr = $chr; + $last_pos = $pos; + $last_alt = $alt; + $buffered_F{$collapse_key} = \@F; + $buffered_id_rank{$collapse_key} = $id_rank; + } +} +for my $collapse_key (keys %buffered_F){ + print OUT join("\t", @{$buffered_F{$collapse_key}}), "\n"; +} +close(IN);
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/split_hgvs_by_confidence Thu Mar 26 09:36:17 2015 -0600 @@ -0,0 +1,42 @@ +#!/usr/bin/env perl + +@ARGV > 3 or die "Usage: $0 <input.combined.hgvs.txt> <confident_out.hgvs.txt> <marginal_out.hgvs.txt> <min num sources> [alt min regex]\n"; + +my $infile = shift @ARGV; +my $confident_outfile = shift @ARGV; +my $marginal_outfile = shift @ARGV; +my $min_sources = shift @ARGV; +my $alt_regex = @ARGV ? shift @ARGV : undef; + +open(IN, $infile) + or die "Cannot open $infile for reading: $!\n"; +open(CONFIDENT, ">$confident_outfile") + or die "Cannot open $confident_outfile for writing: $!\n"; +open(MARGINAL, ">$marginal_outfile") + or die "Cannot open $marginal_outfile for writing: $!\n"; + +my $header = <IN>; +print CONFIDENT $header; +print MARGINAL $header; +chomp $header; +my @headers = split /\t/, $header; +my $srcs_column; +for(my $i = 0; $i <= $#headers; $i++){ + if($headers[$i] eq "Sources"){ + $srcs_column = $i; + } +} +die "Cannot find Sources column in header of $infile, aborting.\n" if not defined $srcs_column; + +while(<IN>){ + my @F = split /\t/, $_; + my @sources = split /; /, $F[$#F]; + if(@sources >= $min_sources or defined $alt_regex and $F[$#F] =~ /$alt_regex/o){ + print CONFIDENT $_; + } + else{ + print MARGINAL $_; + } +} +close(CONFIDENT); +close(MARGINAL);
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/miseq_bam_variants.loc Thu Mar 26 09:36:17 2015 -0600 @@ -0,0 +1,3 @@ +capture_kits_dir /export/achri_galaxy/dbs/CaptureKits/ +named_regions_dir /export/achri_galaxy/dbs/namedRegions/ +reference_dbs /export/achri_galaxy/dbs/
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vcf2hgvs_table Thu Mar 26 09:36:17 2015 -0600 @@ -0,0 +1,2670 @@ +#!/usr/bin/env perl + +BEGIN{ + my $prog_dir = `dirname $0`; + chomp $prog_dir; + push @INC, $prog_dir; # so DisjointSets.pm can be found no matter the working directory +} + +use DisjointSets; # homebrew module +use Bio::DB::Sam; # for FastA reference pulls +use Bio::SeqUtils; +use Bio::Tools::CodonTable; +use Statistics::Zed; +use Getopt::Long; +use Set::IntervalTree; +use strict; +use warnings; +use vars qw($min_prop $zed $codonTable $default_transl_table %transl_except %internal_prop %dbsnp_info %chr2variant_locs %chr2dbsnp_vcf_lines %chr2internal_vcf_lines %chr2caveats %chr2phase @snvs $fasta_index $max_args $quiet); + +if(@ARGV == 1 and $ARGV[0] eq "-v"){ + print "Version 1.0\n"; + exit; +} + +#$max_args = `getconf ARG_MAX`; # largest number of args you can send to a system command (enviroment included, see limits.h) +#chomp $max_args; +$max_args = 4096; # if not defined $max_args or $max_args < 1; # the minimum since System V +$max_args -= 50; + +# find out if a variant appears in the user provided data +sub internal_prop($$$$){ + my ($chr,$pos,$ref,$variant) = @_; + + my $key = "$chr:$pos:$ref:$variant"; + if(exists $internal_prop{$key}){ + return $internal_prop{$key}; + } + + #print STDERR "Checking if internal_prop for $key exists: "; + if(exists $chr2internal_vcf_lines{$chr}->{$pos}){ + for(@{$chr2internal_vcf_lines{$chr}->{$pos}}){ + my @fields = split /\t/, $_; + if($pos == $fields[1] and length($fields[3]) == length($ref) and $fields[4] eq $variant){ + #print STDERR "yes\n"; + if(/MAF=(\d\.\d+)/){ + $internal_prop{$key} = $1; # change from percent to proportion + return $1; + } + } + } + } + else{ + #print STDERR "no\n"; + } + + $internal_prop{$key} = "NA"; + return "NA"; +} + +# find out if a variant appears in the NCBI's dbSNP +sub dbsnp_info($$$$){ + my ($chr,$pos,$ref,$variant) = @_; + + my $key = "$chr:$pos:$ref:$variant"; + if(exists $dbsnp_info{$key}){ + return @{$dbsnp_info{$key}}; + } + + if(exists $chr2dbsnp_vcf_lines{$chr}->{$pos}){ + #print STDERR "Checking existing SNP data for $chr:$pos -> ", join("\n", @{$chr2dbsnp_vcf_lines{$chr}->{$pos}}), "\n"; + for(@{$chr2dbsnp_vcf_lines{$chr}->{$pos}}){ + my @fields = split /\t/, $_; + for my $var (split /,/, $fields[4]){ + # Allows for different reference seqs between dbSNP and input, assuming patches only + if(length($fields[3]) == length($ref) and ($var eq $variant or $ref eq $var and $variant eq $fields[3])){ + my ($freq, $subpop) = ("",""); + $freq = $1 if $fields[7] =~ /(?:\A|;)MMAF=(0\.\d+)(?:;|\Z)/; + $subpop = $1 if $fields[7] =~ /(?:\A|;)MMAF_SRC=(\S+?)(?:;|\Z)/; + $dbsnp_info{$key} = [$subpop, $freq || "NA", $fields[2]]; + return @{$dbsnp_info{$key}}; + } + } + } + } + $dbsnp_info{$key} = ["novel", "NA", "NA"]; + return @{$dbsnp_info{$key}}; +} + +sub record_snv{ + my $line = join("", @_); + push @snvs, $line; + + my @fields = split /\t/, $line; + my $prop_info_key = $fields[9]; + my ($chr,$pos,$ref,$variant) = split /:/, $prop_info_key; + $chr2variant_locs{$chr} = {} unless exists $chr2variant_locs{$chr}; + return unless $ref; # ref not defined for CNVs + # Need to grab whole range for MNPs + for(my $i = 0; $i < length($ref); $i++){ + $chr2variant_locs{$chr}->{$pos+$i} = 1; + } +} + +sub retrieve_vcf_lines($$$){ + my ($dbsnp_file, $internal_snp_file, $chr) = @_; + + my (%dbsnp_lines, %internal_snp_lines); + + if(not defined $dbsnp_file or not exists $chr2variant_locs{$chr}){ + return ({}, {}, {}, {}); # no data requested for this chromosome + } + + # build up the request + my @tabix_regions; + my @var_locs = keys %{$chr2variant_locs{$chr}}; + # sort by variant start location + for my $var_loc (sort {$a <=> $b} @var_locs){ + push @tabix_regions, $chr.":".$var_loc."-".$var_loc; + } + for(my $i = 0; $i <= $#tabix_regions; $i += $max_args){ # chunkify tabix request if too many for the system to handle + my $end = $i + $max_args > $#tabix_regions ? $#tabix_regions : $i + $max_args; + my $regions = "'".join("' '", @tabix_regions[$i..$end])."'"; + # From file is very slow for some reason + #my $regions_file = "/tmp/vcf2hgvs_$$.bed"; + #open(REQ_BED, ">$regions_file") + # or die "Cannot open $regions_file for writing: $!\n"; + #print REQ_BED join("\n", @tabix_regions), "\n"; + #close(REQ_BED); + + # retrieve the data + die "Cannot find dbSNP VCF file $dbsnp_file\n" if not -e $dbsnp_file; + + open(VCF, "tabix $dbsnp_file $regions |") + or die "Cannot run tabix on $dbsnp_file (args ".substr($regions, 0, length($regions)>100? 100 : length($regions))."): $!\n"; + while(<VCF>){ + #if(/^(\S+\t(\d+)(?:\t\S+){6})/ and grep {$_ eq $2} @var_locs){ # take only main columns to save room, if possible + if(/^(\S+\t(\d+)(?:\t\S+){6})/ and exists $chr2variant_locs{$chr}->{$2}){ # take only main columns to save room, if possible + $dbsnp_lines{$2} = [] unless exists $dbsnp_lines{$2}; + push @{$dbsnp_lines{$2}}, $1; + } + } + close(VCF); + + if($internal_snp_file){ + die "Cannot find internal VCF file $internal_snp_file\n" if not -e $internal_snp_file; + open(VCF, "tabix $internal_snp_file $regions |") + or die "Cannot run tabix on $internal_snp_file: $!\n"; + while(<VCF>){ + #if(/^(\S+\t(\d+)(?:\t\S+){6})/ and grep {$_ eq $2} @var_locs){ # take only main columns to save room, if possible + if(/^(\S+\t(\d+)(?:\t\S+){5})/ and exists $chr2variant_locs{$chr}->{$2}){ # take only main columns to save room, if possible + $internal_snp_lines{$2} = [] unless exists $internal_snp_lines{$2}; + push @{$internal_snp_lines{$2}}, $1; + } + } + close(VCF); + } + } + + #unlink $regions_file; + + return (\%dbsnp_lines, \%internal_snp_lines); +} + +sub prop_info_key{ + my($chr,$pos,$ref,$variant,$exon_edge_dist) = @_; + + $chr =~ s/^chr//; + if($chr eq "M"){ + $chr = "MT"; # NCBI uses different name for mitochondrial chromosome + $pos-- if $pos >= 3107; # also, doesn't keep the old positioning (historical) + } + return join(":", $chr,$pos,$ref,$variant, ($exon_edge_dist ? $exon_edge_dist : "")); +} + +sub prop_info($$$){ + my($snpfile,$internal_snps_file,$prop_info_key) = @_; + + my($chr,$pos,$ref,$variant) = split /:/, $prop_info_key; + + # is this the first call for this chromosome? If so, retrieve the VCF lines for it en masse + if(not exists $chr2dbsnp_vcf_lines{$chr}){ + ($chr2dbsnp_vcf_lines{$chr}, $chr2internal_vcf_lines{$chr}) = retrieve_vcf_lines($snpfile,$internal_snps_file,$chr); + } + my $internal_maf = 0; + if($internal_snps_file){ + $internal_maf = internal_prop($chr,$pos,$ref,$variant); + $internal_maf = 0 if $internal_maf eq "NA"; + } + + my @results = dbsnp_info($chr,$pos,$ref,$variant); + + # Not all entries have a proportion in dbSNP + return $internal_snps_file ? ($ref, $variant, @results, $internal_maf) : ($ref, $variant, @results); +} + +#offset a given HGVS nomenclature position (single position only) by a given number of bases +sub hgvs_plus($$){ + my ($hgvs, $offset) = @_; + if($hgvs =~ /^(\S+)(-\d+)(.*)/){ + # all negative + if($2+$offset<0){ + return $1.($2+$offset).$3; + } + # switches to positive, need to mod + else{ + return $1+($2+$offset); + } + } + elsif($hgvs =~ /^(\S+)\+(\d+)(.*)/){ + # all positive + if($2+$offset>0){ + return $1."+".($2+$offset).$3; + } + # switches to negative, need to mod + else{ + return $1+($2+$offset); + } + } + elsif($hgvs =~ /^(-?\d+)(.*)/){ + # special case if offset spans -/+ since there is no position 0 + if($1 < 0 and $1+$offset >= 0){ + $offset++; + } + elsif($1 > 0 and $1+$offset <= 0){ + $offset--; + } + return ($1+$offset).$2; + } + else{ + die "Cannot convert $hgvs to a new offset ($offset), only single base position nomenclature is currently supported\n"; + } +} + +# offset a given position by a given number of bases, +# taking into account that if the new offset crosses the threshold in the last argument, +# HGVS boundary nomenclature has to be introduced +sub hgvs_plus_exon($$$){ + my ($pos, $offset, $boundary) = @_; + + # special case if offset spans -/+ since there is no position 0 + if($pos =~ /^(-?\d+)(.*)/){ + if($1 < 0 and $1+$offset >= 0){ + $offset++; + } + elsif($1 > 0 and $1+$offset <= 0){ + $offset--; + } + } + my $new_pos = $pos + $offset; + if($new_pos > $boundary and $pos <= $boundary){ + # just moved into an intron 3' + $new_pos = $boundary."+".($new_pos-$boundary); + } + elsif($new_pos < $boundary and $pos >= $boundary){ + # just moved into an intron 5' + $new_pos = $boundary.($new_pos-$boundary); + } + return $new_pos; +} + +# given a nucleotide position, calculates the AA there (assumes coding region) +sub getCodonFromSeq($$$$){ + my ($chr_ref, $location, $frame_offset, $strand) = @_; + + my $codon; + if($strand eq "+"){ + $codon = substr($$chr_ref, $location-1-$frame_offset, 3); + } + else{ + $codon = substr($$chr_ref, $location-3+$frame_offset, 3); + $codon = reverse($codon); + $codon =~ tr/ACGTacgt/TGCAtgca/; + } + return $codon; +} + +sub getCodonFromSeqIndex($$$$){ + my ($chr, $location, $frame_offset, $strand) = @_; + + my $codon; + if($strand eq "+"){ + $codon = $fasta_index->fetch($chr.":".($location-$frame_offset)."-".($location-$frame_offset+2)); + } + else{ + $codon = $fasta_index->fetch($chr.":".($location-2+$frame_offset)."-".($location+$frame_offset)); + $codon = reverse($codon); + $codon =~ tr/ACGTacgt/TGCAtgca/; + } + return $codon; +} + +sub getAAFromSeq($$$$$){ + return $_[4]->translate(getCodonFromSeq($_[0], $_[1], $_[2], $_[3])); +} + +sub getAAFromSeqIndex($$$$$){ + # convert codon to AA + if(exists $transl_except{"$_[0]:$_[1]"}){ + return $transl_except{"$_[0]:$_[1]"}; + } + else{ + return $_[4]->translate(getCodonFromSeqIndex($_[0], $_[1], $_[2], $_[3])); + } +} + +sub hgvs_protein{ + my ($chr, $location, $ref, $variant, $cdna_pos, $strand, $transl_table) = @_; + + if(substr($ref,0,1) eq substr($variant,0,1)){ + substr($ref,0,1) = ""; + substr($variant,0,1) = ""; + $location++; + if($strand eq "-"){ + $cdna_pos--; + } + else{ + $cdna_pos++; + } + } + + if($cdna_pos !~ /^\d+/){ + die "Aborting: got illegal cDNA position ($cdna_pos) for protein HGVS conversion of position ", + "$location, ref $ref, variant $variant. Please correct the program code.\n"; + } + # Get the correct frame for the protein translation, to know what codons are affected + my $aapos = int(($cdna_pos-1)/3)+1; + + # does it destroy the start codon? + if($cdna_pos < 4){ # assumes animal codon usage + return "p.0?"; # indicates start codon missing, unsure of effect + } + + my $table = $transl_table ne $default_transl_table ? # non standard translation table requested + Bio::Tools::CodonTable->new(-id=>$transl_table) : $codonTable; + + my $frame_offset = ($cdna_pos-1)%3; + my $origAA = getAAFromSeqIndex($chr, $location, $frame_offset, $strand, $table); + # take 100000 bp on either side for translation context of variant seq + my $five_prime_buffer = $location < 10000 ? $location-1 : 10000; + my $mutSeq = $fasta_index->fetch($chr.":".($location-$five_prime_buffer)."-".($location+10000)); + + # substitute all of the immediately adjacent variants in phase with this one to get the correct local effect + substr($mutSeq, $five_prime_buffer, length($ref)) = $variant; + + # does it cause a frameshift? + my $length_diff = length($variant)-length($ref); + if($length_diff%3){ # insertion or deletion not a multiple of three + my $fs_codon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1, $frame_offset, $strand); + my $ext = 0; + my $newAA; + do{ + $ext++; + # The "NA"s below make it so that we don't pick up any translation exceptions from the original reference annotation + if($strand eq "+"){ + $newAA = getAAFromSeq(\$mutSeq, $five_prime_buffer+1+$ext*3, $frame_offset, $strand, $table); + } + else{ + $newAA = getAAFromSeq(\$mutSeq, $five_prime_buffer+1-$ext*3, $frame_offset, $strand, $table); + } + } while($newAA ne "*"); + + return "p.".$origAA.$aapos.$table->translate($fs_codon)."fs*$ext"; + } + + # does it cause a stop codon to be lost? + if($origAA eq "*"){ + my $stopChangeCodon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1, $frame_offset, $strand); + # still a stop after the mutation (ignore translation exceptions) + if($table->is_ter_codon($stopChangeCodon)){ + return "p.*$aapos="; + } + # calculate the new stop, assuming there aren't mutations downstream in candidate stop codons + my $ext = 0; + my $newCodon; + do{ + if($strand eq "+"){ + $newCodon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1+(++$ext*3), $frame_offset, $strand); + } + else{ + $newCodon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1-(++$ext*3), $frame_offset, $strand); + } + } while(not $table->is_ter_codon($newCodon)); + + return "p.*".$aapos.$table->translate($stopChangeCodon)."ext*".$ext; + } + + # if we get this far, it's a "regular" AA level change + my $origAAs = ""; + for(my $i = 0; $i < length($ref)+$frame_offset; $i+=3){ + my $oldAA = getAAFromSeqIndex($chr, $location+$i, $frame_offset, $strand, $table); + if($strand eq "+"){ + $origAAs .= $oldAA; + } + else{ + $origAAs = $oldAA . $origAAs; + } + } + my $newAAs = ""; + for(my $i = 0; $i < length($variant)+$frame_offset; $i+=3){ + # NA means we don't take translation exceptions from the original + my $newAA = getAAFromSeq(\$mutSeq, $five_prime_buffer+1+$i, $frame_offset, $strand, $table); + if($strand eq "+"){ + $newAAs .= $newAA; + } + else{ + $newAAs = $newAA . $newAAs; + } + } + + # silent + if($origAAs eq $newAAs){ + return "p.".$origAAs.$aapos."="; + } + + # minimize the difference if there are leading or trailing AAs the same + my $delLength = length($ref); + while(substr($newAAs, 0, 1) eq substr($origAAs, 0, 1)){ + $newAAs = substr($newAAs, 1); + $origAAs = substr($origAAs, 1); + $location+=3; + $delLength-=3; + $aapos++; + } + while(substr($newAAs, -1) eq substr($origAAs, -1)){ + $newAAs = substr($newAAs, 0, length($newAAs)-1); + $origAAs = substr($origAAs, 0, length($origAAs)-1); + } + + # insertion + if(length($origAAs) == 0){ + my $insAAs = getAAFromSeqIndex($chr,$location-3,$frame_offset,$strand,$table).($aapos-1)."_". + getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table); + return "p.".$insAAs.$aapos."ins".$newAAs; + } + # deletion + elsif(length($newAAs) == 0){ + my $delAAs; + if(length($origAAs) == 1){ + $delAAs = getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table).$aapos; # single AA deletion + } + else{ # deleting a stretch + if($strand eq "+"){ + my $endPoint = $location+$delLength-1; + $delAAs = getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table).$aapos."_". + getAAFromSeqIndex($chr,$endPoint,$frame_offset,$strand,$table).($aapos+int(($delLength-1)/3)); + } + else{ + my $endPoint = $location-$delLength+1; + $delAAs = getAAFromSeqIndex($chr,$endPoint,$frame_offset,$strand,$table).($aapos-int(($delLength-1)/3))."_". + getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table).$aapos; + } + } + return "p.".$delAAs."del"; + } + else{ + # substitution + if(length($origAAs) == 1 and length($newAAs) == 1){ + return "p.".$origAAs.$aapos.$newAAs; + } + # indel + elsif(length($origAAs) != 1){ + # convert ref stretch into range syntax + if($strand eq "+"){ + $origAAs = substr($origAAs, 0, 1).$aapos."_".substr($origAAs, -1).($aapos+length($origAAs)-1); + } + else{ + $origAAs = substr($origAAs, 0, 1).($aapos-length($origAAs)+1)."_".substr($origAAs, -1).$aapos; + } + } + return "p.".$origAAs."delins".$newAAs; + } + return ("NA", ""); +} + +sub z2p{ + if(not defined $zed){ + $zed = new Statistics::Zed; + } + my $p = $zed->z2p(value => $_[0]); + return $p < 0.0000000001 ? 0 : $p; +} +sub gq2p{ + return $_[0] > 200 ? 0 : 10**($_[0]/-10); +} + +my ($multi_phased, $min_depth, $flanking_bases, $dbsnp, $internal_snp, $genename_bed_file, $dir_1000G, $dir_esp6500, $min_pvalue, $mappability_file, $reference_file, $samtools_phasing_file, $exons_file, $input_file, $output_file, $cnv_file, $dgv_file, $which_chr, $enrichment_regions_file, $rare_variant_prop); +$multi_phased = 0; +$min_depth = 2; +$flanking_bases = 30; +$min_pvalue = 0.01; +$min_prop = 0.14; +$rare_variant_prop = 0.05; +$input_file = "-"; # STDIN by default +$output_file = "-"; # STDOUT by default +$default_transl_table = "1"; # assumes NCBI 'Standard' table, unless it is an argument to the script... +&GetOptions("d=i" => \$min_depth, + "f=i" => \$flanking_bases, + "s=s" => \$dbsnp, + "t=s" => \$dir_1000G, + "n=s" => \$dir_esp6500, + "u=s" => \$internal_snp, + "q" => \$quiet, + "p=f" => \$min_pvalue, + "h=f" => \$min_prop, + "m=s" => \$mappability_file, + "r=s" => \$reference_file, + "z=s" => \$samtools_phasing_file, + "e=s" => \$exons_file, + "i=s" => \$input_file, + "c=s" => \$cnv_file, + "g=s" => \$dgv_file, + "b=s" => \$genename_bed_file, + "w=s" => \$which_chr, + "o=s" => \$output_file, + "a=i" => \$default_transl_table, + "v=f" => \$rare_variant_prop, + "x=s" => \$enrichment_regions_file); # if enrichment regions are specified, variants without a transcript model but in these ranges will be reported + +if(($input_file ne "/dev/null" and not defined $reference_file) or + not defined $exons_file or + (defined $cnv_file and not defined $dgv_file)){ + die "Usage: $0 [-v(ersion)] [-q(uiet)] [-w(hich) contig_to_report (default is all)] [-d(epth of variant reads req'd) #] [-v(ariant max freq to count as rare)] [-f(lanking exon bases to report) #] [-p(robability of random genotype, maximum to report) 0.#]\n", + " [-h(et proportion of variant reads, minimum to report) 0.#] [-c(opy number) variants_file.bed -g(enomic structural) variants_control_db.txt.gz] [-z file_containing_samtools_phase_output.txt]\n", + " [-t(housand) genomes_integrated_vcfs_gz_dir] [-n ESP6500_dir] [-u(ser) specified_population.vcf.gz] [-m(appability) crg_file.bed]\n", + " [-x enrichment_regions_file.bed] [-a(mino) acid translation table number from NCBI]\n", + " [-i(nput) genotypes.vcf <-r(eference) sequence_file.fasta>] [-o(utput) hgvs_file.tsv] [-s(np) database_from_ncbi.vcf.gz]\n", + " <-b(ed) file of named gene regions.bed> <-e(xons) file.gtf>\n\n", + "Input gz files must be indexed with Tabix.\nDefault input is STDIN, default output is STDOUT. Note: if -c is specified, polyploidies are are assume to be proximal. Other defaults: -d 2, -v 0.05, -f 30, -p 0.01, -h 0.14 -a 1\nReference sequence is not strictly necessary if only CNV are being annotated.\n"; +} + +print STDERR "Considering $flanking_bases flanking bases for variants as well\n" unless $quiet; + +$codonTable = new Bio::Tools::CodonTable(id => $default_transl_table); + +my %enrichment_regions; +# Note, we assume the regions are non-overlapping +if(defined $enrichment_regions_file){ + print STDERR "Loading enrichment regions...\n" unless $quiet; + open(BED, $enrichment_regions_file) + or die "Cannot open $enrichment_regions_file for reading: $!\n"; + while(<BED>){ + chomp; + my @F = split /\t/, $_; + $enrichment_regions{$F[0]} = [] if not exists $enrichment_regions{$F[0]}; + push @{$enrichment_regions{$F[0]}}, [$F[1], $F[2]]; + } + close(BED); +} +for my $chr (keys %enrichment_regions){ # sort by start + $enrichment_regions{$chr} = [sort {$a->[0] <=> $b->[0]} @{$enrichment_regions{$chr}}]; +} + +if(defined $reference_file){ + print STDERR "Scanning reference FastA info\n" unless $quiet; + if(not -e $reference_file){ + die "Reference FastA file ($reference_file) does not exist.\n"; + } + if(not -e $reference_file.".fai" and not -w dirname($reference_file)){ + die "Reference FastA file ($reference_file) is not indexed, and the directory is not writable.\n"; + } + $fasta_index = Bio::DB::Sam::Fai->load($reference_file); +} + +my %chr2mappability; +if(defined $mappability_file){ + print STDERR "Reading in mappability data\n" unless $quiet; + my ($nmer) = $mappability_file =~ /(\d+).*?$/; + die "Cannot determine nmer from nmer file name $mappability_file, aborting\n" unless $nmer; + open(MAP, $mappability_file) + or die "Cannot open mappability data file $mappability_file for reading: $!\n"; + <MAP>; # header + while(<MAP>){ + next if /^#/; + chomp; + my @F = split /\t/, $_; + my $x = int(1/$F[3]+0.5); + $chr2mappability{$F[0]} = Set::IntervalTree->new() if not exists $chr2mappability{$F[0]}; + $chr2mappability{$F[0]}->insert("non-unique mapping region (x$x)", $F[1], $F[2]+$nmer-1); + } + close(MAP); +} + +# Is phasing data provided? +if(defined $samtools_phasing_file){ + print STDERR "Reading in phasing data\n" unless $quiet; + open(PHASE, $samtools_phasing_file) + or die "Cannot open phasing data file $samtools_phasing_file for reading: $!\n"; + my $phase_range; + while(<PHASE>){ + if(/^PS/){ + chomp; + my @F = split /\t/, $_; + $phase_range = "$F[2]-$F[3]"; + } + if(/^M[12]/){ + chomp; + my @F = split /\t/, $_; + #ignore strange cases where haplotype reference has no cases (weird samtools call) + next if $F[9] == 0 or $F[7] == 0; + my $chr = $F[1]; + next if defined $which_chr and not $chr eq $which_chr; + my $pos = $F[3]; + #print STDERR "Recording phase for $chr:$pos:$F[4] , $chr:$pos:$F[5] as A-$chr:$phase_range and B-$chr:$phase_range\n" if $pos == 12907379; + if(($F[10]+$F[8])/($F[9]+$F[7]) >= $min_prop){ # error meets reporting threshold + $chr2caveats{"$chr:$pos"} .= "; " if exists $chr2caveats{"$chr:$pos"}; + $chr2caveats{"$chr:$pos"} .= "inconsistent haplotype phasing"; + } + else{ # appears to be a genuine phasing + $chr2phase{"$chr:$pos:$F[4]"} = "A-$chr:$phase_range"; # grouping for haplotype + $chr2phase{"$chr:$pos:$F[5]"} = "B-$chr:$phase_range"; # grouping for haplotype + } + } + } + close(PHASE); +} + +# Check the VCF file to see if contains phase data +open(VCFIN, $input_file) + or die "Cannot open $input_file for reading: $!\n"; +my $phase_chr = ""; +my @phase_dataA; +my @phase_dataB; +while(<VCFIN>){ + if(/^\s*(?:#|$)/){ # blank or hash comment + next; + } + my @F = split /\t/, $_; + next if exists $chr2caveats{"$F[0]:$F[1]"} and $chr2caveats{"$F[0]:$F[1]"} =~ /inconsistent haplotype phasing/; + # | indicates phased + if($F[8] =~ m(^(\d+)\|(\d+):)){ + next if $1 eq $2; # not useful to us (actually would mess up phase combining later on), but is provided sometimes + # start of a phasing block + if($phase_chr eq ""){ + $phase_chr = $F[0]; + } + my @vars = split /,/, $F[4]; + if($1 > @vars){ + die "Invalid VCF file (line #$.): First haplotype listed as $1, but only ", scalar(@vars), " variants were provided (", join(",", @vars), "\n"; + } + if($2 > @vars){ + die "Invalid VCF file (line #$.): Second haplotype listed as $1, but only ", scalar(@vars), " variants were provided (", join(",", @vars), "\n"; + } + unshift @vars, $F[3]; + push @phase_dataA, [$F[1], $vars[$1]]; + push @phase_dataB, [$F[1], $vars[$2]]; + } + # non phased het call, ends any phasing block there might be + elsif($F[8] =~ m(^0/1)){ + # Did we just finish a phased block? If so, output it. + if(@phase_dataA > 1){ + my $phase_def = "G-$phase_chr:".$phase_dataA[0]->[0]."-".$phase_dataA[$#phase_dataA]->[0]; + for my $d (@phase_dataA){ + my ($p, $v) = @$d; + if(exists $chr2phase{"$phase_chr:$p:$v"}){ + $chr2phase{"$phase_chr:$p:$v"} .= ",$phase_def"; + $multi_phased ||= 1; + } + else{ + $chr2phase{"$phase_chr:$p:$v"} = $phase_def; + } + } + $phase_def = "H-$phase_chr:".$phase_dataB[0]->[0]."-".$phase_dataB[$#phase_dataB]->[0]; + for my $d (@phase_dataB){ + my ($p, $v) = @$d; + if(exists $chr2phase{"$phase_chr:$p:$v"}){ + $chr2phase{"$phase_chr:$p:$v"} = ",$phase_def"; + $multi_phased ||= 1; + } + else{ + $chr2phase{"$phase_chr:$p:$v"} = $phase_def; + } + } + } + if($phase_chr ne ""){ + $phase_chr = ""; + @phase_dataA = (); + @phase_dataB = (); + } + } +} + +print STDERR "Reading in feature GTF data..." unless $quiet; +my %feature_range; # chr => transcript_id => [[genomic_exon_start,genomic_exon_end,cdna_start_pos],...] +my %feature_intervaltree; # chr => transcript_id => [[genomic_exon_start,genomic_exon_end,cdna_start_pos],...] +my %feature_strand; # transcript_id => +|- +my $feature_count = 0; +my %feature_min; +my %feature_max; +my %feature_cds_min; +my %feature_cds_max; +my %feature_contig; +my %feature_length; +my %feature_type; +my %feature_transl_table; # note alternate translation table usage +my %chr_read; +open(GTF, $exons_file) + or die "Cannot open $exons_file for reading: $!\n"; +while(<GTF>){ + next if /^\s*#/; + my @fields = split /\t/, $_; + next if defined $which_chr and $fields[0] ne $which_chr and "chr$fields[0]" ne $which_chr and $fields[0] ne "chr$which_chr"; + + if($fields[2] eq "exon" or $fields[2] eq "CDS"){ + next unless $fields[$#fields] =~ /transcript_id \"(.*?)\"/o; + my $parent = $1; + if(not $quiet and not exists $chr_read{$fields[0]}){ + print STDERR " $fields[0]"; + $chr_read{$fields[0]} = 1; + } + if(not exists $feature_strand{$parent}){ + $feature_strand{$parent} = $fields[6]; + $feature_contig{$parent} = $fields[0]; + if($fields[$#fields] =~ /transcript_type \"(.*?)\"/){ + $feature_type{$parent} = $1; + } + else{ + $feature_type{$parent} = "NA"; + } + } + if($fields[2] eq "CDS"){ + #print STDERR "CDS value for $parent is $fields[2]..$fields[3]\n"; + if(not exists $feature_cds_min{$parent} or $fields[3] < $feature_cds_min{$parent}){ + $feature_cds_min{$parent} = $fields[3]; + } + if(not exists $feature_cds_max{$parent} or $fields[4] > $feature_cds_max{$parent}){ + $feature_cds_max{$parent} = $fields[4]; + } + if($fields[$#fields] =~ /transl_table \"(\d+)\"/){ + $feature_transl_table{$parent} = $1; #assume one translation table per CDS, which should be reasonable + } + while($fields[$#fields] =~ /transl_except \"pos:(\S+?),aa:(\S+?)\"/g){ + my $pos = $1; + my $new_aa = $2; # needs to change from three letter code to 1 + if($new_aa =~ /^ter/i){ # can be funny so have special case (allows TERM, etc.) + $new_aa = "*"; + } + elsif(length($new_aa) == 3){ + $new_aa = Bio::SeqUtils->new()->seq3in($new_aa); + } + if($pos =~ /^(\d+)\.\.(\d+)/){ + for my $p ($1..$2){ + $transl_except{"$fields[0]:$p"} = $new_aa; + } + } + else{ + $transl_except{"$fields[0]:$pos"} = $new_aa; + } + } + next; + } + if(not exists $feature_min{$parent} or $fields[3] < $feature_min{$parent}){ + $feature_min{$parent} = $fields[3]; + } + if(not exists $feature_max{$parent} or $fields[4] > $feature_max{$parent}){ + $feature_max{$parent} = $fields[4]; + } + + $feature_count++; + if(not exists $feature_range{$fields[0]}){ + $feature_range{$fields[0]} = {}; # Chr => {parentID => [start,stop]} + $feature_intervaltree{$fields[0]} = Set::IntervalTree->new(); + } + if(not exists $feature_range{$fields[0]}->{$parent}){ + $feature_range{$fields[0]}->{$parent} = []; + } + push @{$feature_range{$fields[0]}->{$parent}}, [$fields[3],$fields[4]]; + $feature_intervaltree{$fields[0]}->insert($parent, $fields[3], $fields[4]+1); # ranges need to have positive length for module to work properly + $feature_length{$parent} += $fields[4]-$fields[3]+1; + } +} +close(GTF); +print STDERR "\nFound $feature_count exons on ", scalar(keys %feature_range), " contigs in the GTF file\n" unless $quiet; + +for my $contig (keys %feature_range){ + for my $parent (keys %{$feature_range{$contig}}){ + # sort by subrange start + my @feature_ranges = sort {$a->[0] <=> $b->[0]} @{$feature_range{$contig}->{$parent}}; + $feature_range{$contig}->{$parent} = \@feature_ranges; + $feature_range{"chr".$contig}->{$parent} = \@feature_ranges if not $contig =~ /^chr/; + $feature_range{$1}->{$parent} = \@feature_ranges if $contig =~ /^chr(\S+)/; + } +} + +# Calculate the cDNA position of the leftmost (reference strand) base for each exon +for my $contig (keys %feature_range){ + for my $parent (keys %{$feature_range{$contig}}){ + my @feature_ranges = @{$feature_range{$contig}->{$parent}}; + if($feature_strand{$parent} eq "-"){ + # set up utr offset for correct CDS coordinates + my $feature_offset = 0; + for(my $i = $#feature_ranges; $i >= 0; $i--){ + last if not $feature_cds_max{$parent}; + # exon is completely 5' of the start + if($feature_ranges[$i]->[0] > $feature_cds_max{$parent}){ + $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1; + } + # exon with the cds start + elsif($feature_ranges[$i]->[1] >= $feature_cds_max{$parent} and + $feature_ranges[$i]->[0] <= $feature_cds_max{$parent}){ + $feature_offset += $feature_cds_max{$parent} - $feature_ranges[$i]->[1]; + last; + } + else{ + die "The CDS for $parent (on negative strand) ends downstream ", + "($feature_cds_max{$parent}) of the an exon", + " (", $feature_ranges[$i]->[0], "), which is illogical. Please revise the GFF file provided.\n"; + } + } + for(my $i = $#feature_ranges; $i >= 0; $i--){ + $feature_offset += $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1; + $feature_ranges[$i]->[2] = $feature_offset-1; + } + } + else{ # positive strand + # set up utr offset for correct CDS coordinates + my $feature_offset = 0; + for(my $i = 0; $i <= $#feature_ranges; $i++){ + last if not $feature_cds_min{$parent}; + # All 5' utr exon + if($feature_ranges[$i]->[1] < $feature_cds_min{$parent}){ + $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1; + } + # exon with the cds start + elsif($feature_ranges[$i]->[1] >= $feature_cds_min{$parent} and + $feature_ranges[$i]->[0] <= $feature_cds_min{$parent}){ + $feature_offset -= $feature_cds_min{$parent} - $feature_ranges[$i]->[0]; + last; + } + else{ + die "The CDS for $parent starts upstream ($feature_cds_min{$parent}) of the first exon", + " (", $feature_ranges[$i]->[0], "), which is illogical. Please revise the GFF file provided.\n"; + } + } + # assign cDNA coords for each exon to the third array element + for(my $i = 0; $i <= $#feature_ranges; $i++){ + $feature_ranges[$i]->[2] = $feature_offset; + $feature_offset += $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1; + } + } + } +} + +print STDERR "Reading in gene name definitions...\n" unless $quiet; +die "Data file $genename_bed_file does not exist, aborting.\n" if not -e $genename_bed_file; +my %gene_ids; +open(TAB, $genename_bed_file) + or die "Cannot open gene name BED file $genename_bed_file for reading: $!\n"; +while(<TAB>){ + chomp; + # format should be "chr start stop gene_name ..." + my @fields = split /\t/, $_; + next if $#fields < 3; + my $c = $fields[0]; + if(not exists $gene_ids{$c}){ + $gene_ids{$c} = Set::IntervalTree->new(); + } + $gene_ids{$c}->insert($fields[3], $fields[1], $fields[2]); +} + +# Print output header +open(OUT, ">$output_file") + or die "Cannot open $output_file for writing: $!\n"; + +print OUT join("\t", "Feature type", "Transcript length", "Selected transcript", "Transcript HGVS", "Strand", "Chr", "DNA From", "DNA To", "Zygosity", "P-value", "Variant Reads", "Total Reads", + "Ref base", "Obs base", "Pop. freq. source", "Pop. freq.", "Variant DB ID"), "\t", + ($internal_snp ? "Internal pop. freq.\t" : ""), + join("\t", "Protein HGVS", "Closest exon junction (AA coding variants)", "Gene Name", "Caveats", "Phase", "Num rare variants in gene (MAF <= $rare_variant_prop)", "Num rare coding and splice site variants in gene (MAF <= $rare_variant_prop)"),"\n"; + +# If there is CNV data, load it. +# BED columns should be chr start stop caveats ploidy . ignored ignored r,g,b +# The dot means the strand doesn't matter. +# where the first five fields are required, others optional +# where r,g,b is overloaded with father,mother ploidies and "b" is integer representing affected status logical AND (father bit mask 1, mother bit mask 2) +if(defined $cnv_file){ + print STDERR "Reading in CNV data...\n" unless $quiet; + open(CNV, $cnv_file) + or die "Cannot open $cnv_file for reading: $!\n"; + while(<CNV>){ + chomp; + my @F = split /\t/, $_, -1; + if(@F < 5){ + print STDERR "Skipping unparseable line ($cnv_file #$.): $_\n"; + next; + } + my $ploidy = $F[4]; + my $cnv_chr = $F[0]; + next if defined $which_chr and $cnv_chr ne $which_chr and "chr$cnv_chr" ne $which_chr and $cnv_chr ne "chr$which_chr"; + my $cnv_start = $F[1]; + my $cnv_end = $F[2]; + my $p_value = "NA"; + if($F[3] =~ s/p-value=(\S+?)(?:;|$)//){ + $p_value = $1; + next if $min_pvalue < $p_value; + } + + # Report a variant line for each gene that is found in this CNV + my $target_parents = $feature_intervaltree{$cnv_chr}->fetch($cnv_start, $cnv_end+1); + + my $caveats = ""; + if(@F == 9){ + my @parents_ploidy = split /,/, $F[8]; + if($parents_ploidy[2] == 0){ # neither parent affected + if($ploidy < $parents_ploidy[0] and $ploidy < $parents_ploidy[1]){ + if($ploidy > 2){ + $caveats = "Polyploidy is less severe than in either unaffected parents"; + } + # else: no caveats, this offspring has fewer copies than normally observed, or in unaffected parents + elsif($ploidy < 2){ + if($parents_ploidy[0] == 2 and $parents_ploidy[1] == 2){ + $caveats = "De novo copy loss, unaffected parents are diploid"; + } + else{ + $caveats = "Copy loss is greater than in either unaffected parent"; + } + } + } + elsif($ploidy >= $parents_ploidy[0] and $ploidy <= $parents_ploidy[1] or + $ploidy >= $parents_ploidy[1] and $ploidy <= $parents_ploidy[0]){ + $caveats = "Aneuploidy likely inherited from an unaffected parent"; + } + elsif($ploidy > $parents_ploidy[0] and $ploidy > $parents_ploidy[1]){ + if($parents_ploidy[0] > 2){ + if($parents_ploidy[1] > 2){ + $caveats = "Lower polyploidy already exists in both unaffected parents"; + } + else{ + $caveats = "Lower polyploidy already exists in unaffected father"; + } + } + else{ + if($parents_ploidy[1] > 2){ + $caveats = "Lower polyploidy already exists in unaffected mother"; + } + # else no caveats, because both parents are "normal", yet we have polyploidy in the offspring + else{ + $caveats = "De novo polyploidy, unaffected parents are diploid"; + } + } + } + # else + else{ + die "Oops! Error in program logic...how did we get here (unaffected parents)? $_"; + } + } + elsif($parents_ploidy[2] == 1){ # father affected + if($ploidy == $parents_ploidy[1]){ # just like unaffected Mom + if($ploidy > 2){ + if($ploidy == $parents_ploidy[0]){ + $caveats = "Same polyploidy present in both affected and unaffected parents"; + } + else{ + $caveats = "Polyploidy inherited from unaffected mother"; + } + } + elsif($ploidy < 2){ + if($ploidy == $parents_ploidy[0]){ + $caveats = "Same copy loss in both affected and unaffected parents"; + } + else{ + $caveats = "Copy loss is shared with unaffected mother"; + } + } + else{ + if($ploidy == $parents_ploidy[0]){ + # Why was this even reported? parents and child have diploid status... + next; + } + $caveats = "Diploidy is shared with unaffected mother"; + } + } + elsif($ploidy > 2){ # polyploidy + if($parents_ploidy[0] == 2){ + if($parents_ploidy[1] > 2){ + $caveats = "Unaffected mother has polyploidy (".$parents_ploidy[1]."x), but affected father is diploid"; + } + elsif($parents_ploidy[1] == 2){ + $caveats = "Both unaffected mother and affected father are diploid"; + } + else{ + $caveats = "Affected father is diploid, unaffected mother has copy loss (".$parents_ploidy[1]."x)"; + } + } + elsif($parents_ploidy[0] < 2){ + $caveats = "Polyploidy found, but affected father had copy loss (".$parents_ploidy[0]."x)"; + } + elsif($ploidy < $parents_ploidy[1]){ + $caveats = "Polyploidy is less severe than in unaffected mother (".$parents_ploidy[1]."x), or affected father (".$parents_ploidy[0]."x)"; + } + # past here the ploidy is great than in the unaffected mother + elsif($parents_ploidy[1] < 2){ + $caveats = "Polyploidy is also severe in affected father (".$parents_ploidy[0]."x), but unaffected mother actually had copy loss (". $parents_ploidy[1]. "x)"; + } + elsif($parents_ploidy[1] == 2){ + $caveats = "Polyploidy is also severe in affected father (".$parents_ploidy[0]."x), and mother is diploid"; + } + elsif($ploidy < $parents_ploidy[0]){ + $caveats = "Polyploidy is less severe than in affected father (".$parents_ploidy[0]."x), but more severe than unaffected mother (". $parents_ploidy[1]. "x)"; + } + elsif($ploidy > $parents_ploidy[0]){ + $caveats = "Polyploidy is more severe than in affected father (".$parents_ploidy[0]."x)"; + } + else{ + $caveats = "Polyploidy is as severe as in affected father"; + } + } + elsif($ploidy == 2){ + # Don't report diploid status, any funny recombination should show up in large indel analysis + next; + } + else{ # copies < 2 + if($ploidy == $parents_ploidy[0]){ + if($ploidy > $parents_ploidy[1]){ + $caveats = "Copy loss is the same as affected father, but less than unaffected mother (". $parents_ploidy[1]. "x)"; + } + else{ + $caveats = "Copy loss is as severe as in affected father"; + } + } + elsif($ploidy > $parents_ploidy[0]){ + if($ploidy > $parents_ploidy[1]){ + if($parents_ploidy[1] == 0 and $parents_ploidy[0] == 0){ + $caveats = "Poor mapping, or Mendelian inheritence violation is severe: no copies of region in either parent, but present in offspring"; + } + elsif($ploidy == 2){ + next; # child got best of both parents, ignore from CNV standpoint (may still have SNPs of course, or translocation, etc.) + } + else{ + $caveats = "Copy loss is less severe than in unaffected mother (".$parents_ploidy[1]."x), or affected father (".$parents_ploidy[0]."x)"; + } + } + # else: child has less copies than unaffected mom, but more than affected Dad + else{ + if($parents_ploidy[1] > 2){ + $caveats = "Copy loss was more severe in affected father (".$parents_ploidy[0]."x), but unaffected mother had polyploidy (".$parents_ploidy[1]."x)"; + } + elsif($parents_ploidy[1] == 2){ + $caveats = "Copy loss was more severe in affected father (".$parents_ploidy[0]."x), but unaffected mother was diploid"; + } + else{ # unaffected has loss + $caveats = "Copy loss is more severe than unaffect mother (".$parents_ploidy[1]."x), but less severe than affected father (".$parents_ploidy[0]."x)"; + } + } + } + # past here, ploidy is less than affected father + elsif($parents_ploidy[1] > 2){ + $caveats = "Copy loss is more severe than affected father (".$parents_ploidy[0]."x), and unaffected mother had polyploidy (".$parents_ploidy[1]."x)"; + } + elsif($parents_ploidy[1] == 2){ + $caveats = "Copy loss is more severe than in affected father (".$parents_ploidy[0]."x)"; + } + else{ + $caveats = "Copy loss is more severe than in both unaffect mother (".$parents_ploidy[1]."x), and affected father (".$parents_ploidy[0]."x)"; + } + } + } + elsif($parents_ploidy[2] == 2){ # mother affected + if($ploidy == $parents_ploidy[0]){ # just like unaffected Dad + if($ploidy > 2){ + if($ploidy == $parents_ploidy[1]){ + $caveats = "Same polyploidy present in both affected and unaffected parents"; + } + else{ + $caveats = "Polyploidy inherited from unaffected father"; + } + } + elsif($ploidy < 2){ + if($ploidy == $parents_ploidy[1]){ + $caveats = "Same copy loss in both affected and unaffected parents"; + } + else{ + $caveats = "Copy loss is shared with unaffected father"; + } + } + else{ + if($ploidy == $parents_ploidy[1]){ + # Why was this even reported? parents and child have diploid status... + next; + } + $caveats = "Diploidy is shared with unaffected father"; + } + } + elsif($ploidy > 2){ # polyploidy + if($parents_ploidy[1] == 2){ + if($parents_ploidy[0] > 2){ + $caveats = "Unaffected father has polyploidy (".$parents_ploidy[0]."x), but affected mother is diploid"; + } + elsif($parents_ploidy[0] == 2){ + $caveats = "Both unaffected father and affected mother are diploid"; + } + else{ + $caveats = "Affected mother is diploid, unaffected father has copy loss (".$parents_ploidy[1]."x)"; + } + } + elsif($parents_ploidy[1] < 2){ + $caveats = "Polyploidy found, but affected mother had copy loss (".$parents_ploidy[1]."x)"; + } + elsif($ploidy < $parents_ploidy[0]){ + $caveats = "Polyploidy is less severe than in unaffected father (".$parents_ploidy[0]."x), or affected mother (".$parents_ploidy[1]."x)"; + } + # past here the ploidy is great than in the unaffected father + elsif($parents_ploidy[0] < 2){ + $caveats = "Polyploidy is also severe in affected mother (".$parents_ploidy[1]."x), but unaffected father actually had copy loss (". $parents_ploidy[0]. "x)"; + } + elsif($parents_ploidy[0] == 2){ + $caveats = "Polyploidy is also severe in affected mother (".$parents_ploidy[1]."x), and unaffected father is diploid"; + } + elsif($ploidy < $parents_ploidy[1]){ + $caveats = "Polyploidy is less severe than in affected mother (".$parents_ploidy[1]."x), but more severe than unaffected father (". $parents_ploidy[0]. "x)"; + } + elsif($ploidy > $parents_ploidy[1]){ + $caveats = "Polyploidy is more severe than in affected mother (".$parents_ploidy[1]."x)"; + } + else{ + $caveats = "Polyploidy is as severe as in affected mother"; + } + } + elsif($ploidy == 2){ + # Don't report diploid status, any funny recombination should show up in large indel analysis + next; + } + else{ # copies < 2 + if($ploidy == $parents_ploidy[1]){ + if($ploidy > $parents_ploidy[0]){ + $caveats = "Copy loss is the same as affected mother, but less than unaffected father (". $parents_ploidy[0]. "x)"; + } + else{ + $caveats = "Copy loss is as severe as in affected mother"; + } + } + elsif($ploidy > $parents_ploidy[1]){ + if($ploidy > $parents_ploidy[0]){ + if($parents_ploidy[1] == 0 and $parents_ploidy[0] == 0){ + $caveats = "Poor mapping, or Mendelian inheritence violation is severe: no copies of region in either parent, but present in offspring"; + } + elsif($ploidy == 2){ + next; # child got best of both parents, ignore from CNV standpoint (may still have SNPs of course, or translocation, etc.) + } + else{ + $caveats = "Copy loss is less severe than in unaffected father (".$parents_ploidy[0]."x), or affected mother (".$parents_ploidy[1]."x)"; + } + } + # else: child has less copies than unaffected Dad, but more than affected Mom + else{ + if($parents_ploidy[0] > 2){ + $caveats = "Copy loss was more severe in affected mother (".$parents_ploidy[1]."x), but unaffected father had polyploidy (".$parents_ploidy[0]."x)"; + } + elsif($parents_ploidy[0] == 2){ + $caveats = "Copy loss was more severe in affected mother (".$parents_ploidy[1]."x), but unaffected father was diploid"; + } + else{ # unaffected has loss + $caveats = "Copy loss is more severe than unaffect father (".$parents_ploidy[0]."x), but less severe than affected mother (".$parents_ploidy[1]."x)"; + } + } + } + # past here, ploidy is less than affected mother + elsif($parents_ploidy[0] > 2){ + $caveats = "Copy loss is more severe than affected mother (".$parents_ploidy[1]."x), and unaffected father had polyploidy (".$parents_ploidy[0]."x)"; + } + elsif($parents_ploidy[0] == 2){ + $caveats = "Copy loss is more severe than in affected mother (".$parents_ploidy[1]."x)"; + } + else{ + $caveats = "Copy loss is more severe than in both unaffect father (".$parents_ploidy[0]."x), and affected mother (".$parents_ploidy[1]."x)"; + } + } + } + + } + if($F[3] and $F[3] ne "-"){ # prexisting caveat from CNV caller + if(defined $caveats){ + $caveats .= "; $F[3]" unless $caveats =~ /\b$F[3]\b/; + } + else{ + $caveats = $F[3]; + } + } + + # Sort by start for consistency + my @target_parents = sort {$feature_range{$cnv_chr}->{$a}->[0]->[0] <=> $feature_range{$cnv_chr}->{$b}->[0]->[0]} @$target_parents; + + for my $target_parent (@target_parents){ + my $target_caveats = $caveats; + my $strand = $feature_strand{$target_parent}; + # report the gain/loss of each gene separately, for simplicity in downstream analysis + my $cnv_exon_start = 10000000000; # genomic coords + my $cnv_exon_end = 0; + my $cnv_cdna_start = 0; # cDNA coords + my $cnv_cdna_end = 0; + my $off5 = 0; # border outside the exon? + my $off3 = 0; + my @feature_ranges = @{$feature_range{$cnv_chr}->{$target_parent}}; + # find the first and last exons in the gene that are inside the CNV + for my $subregion (@feature_ranges){ + # exon overlaps CNV? + if($subregion->[0] <= $cnv_end and $subregion->[1] >= $cnv_start){ + if($cnv_exon_start > $subregion->[0]){ + if($cnv_start < $subregion->[0]){ + $cnv_exon_start = $subregion->[0]; $off5 = 1; + $cnv_cdna_start = $subregion->[2]; + } + else{ + $cnv_exon_start = $cnv_start; $off5 = 0; + $cnv_cdna_start = $subregion->[2]+($strand eq "-" ? $subregion->[0]-$cnv_start: $cnv_start-$subregion->[0]); + } + } + if($cnv_exon_end < $subregion->[1]){ + if($cnv_end > $subregion->[1]){ + $cnv_exon_end = $subregion->[1]; $off3 = 1; + $cnv_cdna_end = $subregion->[2]+($strand eq "-" ? $subregion->[0]-$subregion->[1] : $subregion->[1]-$subregion->[0]); + } + else{ + $cnv_exon_end = $cnv_end; $off3 = 0; + $cnv_cdna_end = $subregion->[2]+($strand eq "-" ? $subregion->[0]-$cnv_end : $cnv_end-$subregion->[0]); + } + } + } + } + + my $ends_internally = 0; + if($cnv_exon_end == 0){ # ends inside the exon + $cnv_exon_end = $cnv_end; + $ends_internally = 1; + } + # See if it's in the structural variant database + my @gain_coverage; $#gain_coverage = $cnv_exon_end-$cnv_exon_start; # preallocate blanks + my @loss_coverage; $#loss_coverage = $cnv_exon_end-$cnv_exon_start; # preallocate blanks + my $dgv_loss_id; # report the DGV entry that covers most of the observed structural variant + my $dgv_loss_length = 0; # report the DGV entry that covers most of the observed structural variant + my $dgv_gain_id; # report the DGV entry that covers most of the observed structural variant + my $dgv_gain_length = 0; # report the DGV entry that covers most of the observed structural variant + my $gains; + my $losses; + my $dgv_chr = $cnv_chr; + $dgv_chr =~ s/^chr//; # no prefix in DGV + #open(DGV, "tabix $dgv_file $dgv_chr:$cnv_exon_start-$cnv_exon_end |") # check out CNV in this gene model region + # or die "Cannot run tabix: $!\n"; + open(DGV, "/dev/null"); + while(<DGV>){ + my @C = split /\t/, $_; + next if $C[4] ne "CNV"; # todo: handle indels? + my $dgv_start = $C[2]; + my $dgv_end = $C[3]; + my $dgv_direction = $C[5]; + my $gain_cov_count = 0; + my $loss_cov_count = 0; + if($dgv_direction eq "Gain"){ + for(my $i = ($dgv_start < $cnv_exon_start ? $cnv_exon_start : $dgv_start); $i <= $dgv_end and $i <= $cnv_exon_end; $i++){ + $gain_coverage[$i-$cnv_exon_start] = 1 unless defined $gain_coverage[$i-$cnv_exon_start]; + $gain_cov_count++; + } + } + elsif($dgv_direction eq "Loss"){ + for(my $i = ($dgv_start < $cnv_exon_start ? $cnv_exon_start : $dgv_start); $i <= $dgv_end and $i <= $cnv_exon_end; $i++){ + $loss_coverage[$i-$cnv_exon_start] = 1 unless defined $loss_coverage[$i-$cnv_exon_start]; + $loss_cov_count++; + } + } + if($dgv_direction eq "Gain" and $gain_cov_count > $dgv_gain_length){ + $dgv_gain_id = $C[0]; + $dgv_gain_length = $gain_cov_count; + } + if($dgv_direction eq "Loss" and $loss_cov_count > $dgv_loss_length){ + $dgv_loss_id = $C[0]; + $dgv_loss_length = $loss_cov_count; + } + } + close(DGV); + + my $gain_coverage = 0; + for my $count (@gain_coverage){ + $gain_coverage++ if defined $count; + } + $gain_coverage = sprintf "%.3f", $gain_coverage/($cnv_exon_end-$cnv_exon_start+1); # make it a proportion + my $loss_coverage = 0; + for my $count (@loss_coverage){ + $loss_coverage++ if defined $count; + } + $loss_coverage = sprintf "%.3f", $loss_coverage/($cnv_exon_end-$cnv_exon_start+1); # make it a proportion + + my $src = "DGV"; + my $dgv_id = "NA"; + my $dgv_caveat; + my $dgv_coverage; + if($ploidy > 2){ + if(not defined $dgv_gain_id){ + if(defined $dgv_loss_id){ + $dgv_id = sprintf "%s/%.3f", $dgv_loss_id, $dgv_loss_length/($cnv_exon_end-$cnv_exon_start+1); + $dgv_caveat = "; No gains are known in healthy controls, the DGV2 ID reported is for a loss in the same area"; + $dgv_coverage = $loss_coverage; + } + else{ + $dgv_id = "novel"; + $dgv_coverage = "NA"; + $src = "NA"; + } + } + else{ + $dgv_id = sprintf "%s/%.3f", $dgv_gain_id, $dgv_gain_length/($cnv_exon_end-$cnv_exon_start+1); + $dgv_coverage = $gain_coverage; + } + } + elsif($ploidy < 2){ + if(not defined $dgv_loss_id){ + if(defined $dgv_gain_id){ + $dgv_id = sprintf "%s/%.3f", $dgv_gain_id, $dgv_gain_length/($cnv_exon_end-$cnv_exon_start+1); + $dgv_caveat = "; No losses are known in healthy controls, the DGV2 ID reported is for a gain in the same area"; + $dgv_coverage = $gain_coverage; + } + else{ + $dgv_id = "novel"; + $dgv_coverage = "NA"; + $src = "NA"; + } + } + else{ + $dgv_id = sprintf "%s/%.3f", $dgv_loss_id, $dgv_loss_length/($cnv_exon_end-$cnv_exon_start+1); + $dgv_coverage = $loss_coverage; + } + } + + my $non_coding = 0; + if(not exists $feature_cds_max{$target_parent} or not defined $feature_cds_max{$target_parent} or $feature_cds_max{$target_parent} eq ""){ + $non_coding = 1; + } + $target_caveats .= $dgv_caveat if defined $dgv_caveat and $dgv_id ne "novel" and $target_caveats !~ /\Q$dgv_caveat\E/; + #print "Recorded $cnv_chr:$cnv_start caveat $caveats\n"; + # if it doesn't overlap an exon, we need to find out which two exons it's between + if($ends_internally){ + my $intron_found = 0; + for(my $i = 0; $i < $#feature_ranges; $i++){ + if($feature_ranges[$i]->[1] < $cnv_start and $feature_ranges[$i+1]->[0] > $cnv_end){ + if($ploidy > 2){ # gain + if($strand eq "-"){ + record_snv("$target_parent\t", + ($non_coding ? "g.$cnv_start\_$cnv_end" : + "c.".($feature_ranges[$i+1]->[2])."+".($feature_ranges[$i+1]->[0]-$cnv_end)."_".($feature_ranges[$i+1]->[2]+1)."-".($cnv_start-$feature_ranges[$i]->[1])), + # pos Zygosity P-value Variant Reads Total Reads Ref Bases Var Bases Population Frequency Source Pop. freq. or DGV2 gain/loss coverage dbSNP or DGV2 ID + "[".($ploidy-1)."]\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\tNA\t$p_value\tNA\tNA\t", + "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n"); + } + else{ + record_snv("$target_parent\t", + ($non_coding ? "g.$cnv_start\_$cnv_end" : + "c.".($feature_ranges[$i+1]->[2]-1)."+".($cnv_start-$feature_ranges[$i]->[1])."_".$feature_ranges[$i+1]->[2]."-".($feature_ranges[$i+1]->[0]-$cnv_end)), + "[".($ploidy-1)."]\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\tNA\t$p_value\tNA\tNA\t", + "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n"); + } + } + else{ # loss + if($strand eq "-"){ + record_snv("$target_parent\t", + ($non_coding ? "g.$cnv_start\_$cnv_end" : + "c.".($feature_ranges[$i+1]->[2])."+".($feature_ranges[$i+1]->[0]-$cnv_end)."_".($feature_ranges[$i+1]->[2]+1)."-".($cnv_start-$feature_ranges[$i]->[1])), + "del\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\t", ($ploidy == 1 ? "heterozygote" : "homozygote"), "\t$p_value\tNA\tNA\t", + "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n"); + } + else{ + record_snv("$target_parent\t", + ($non_coding ? "g.$cnv_start\_$cnv_end" : + "c.".($feature_ranges[$i+1]->[2]-1)."+".($cnv_start-$feature_ranges[$i]->[1])."_".$feature_ranges[$i+1]->[2]."-".($feature_ranges[$i+1]->[0]-$cnv_end)), + "del\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\t", ($ploidy == 1 ? "heterozygote" : "homozygote"), "\t$p_value\tNA\tNA\t", + "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n"); + } + } + $intron_found = 1; last; + } + } + warn "Logic error: CNV overlaps a gene ($target_parent), but is neither intronic nor exonic. Offending CNV was $_\n" unless $intron_found; + next; + } + if($strand eq "-"){ + my $tmp = $cnv_cdna_start; + $cnv_cdna_start = $cnv_cdna_end; + $cnv_cdna_end = $tmp; + } + # Make the location label pretty descriptive + my $cnv_phase = ""; + if($cnv_exon_start > $cnv_start or $cnv_exon_end < $cnv_end){ + $cnv_phase = "CNV-$cnv_chr:$cnv_start-$cnv_end"; # Use phase to note that it's part of a bigger CNV than just the range of this feature + } + # if we get here, we're in a gained/deleted exon category + # CNVs are fuzzy-edged (as opposed to large indels), so produce HGVS syntax that reflect this + if($ploidy > 2){ # gain + # find the exons encompassed by the CNV. NOTE that we assume that polyploidies are proximal + record_snv("$target_parent\t", + ($non_coding ? "g.".($cnv_exon_start > $cnv_start ? "$cnv_exon_start-?" : $cnv_start)."_".($cnv_exon_end < $cnv_end ? "$cnv_exon_end+?" : $cnv_end) : + "c.$cnv_cdna_start".($off5?"-?":"")."_$cnv_cdna_end".($off3?"+?":"")), + "[".($ploidy-1)."]\t$strand\t$cnv_chr\t$cnv_exon_start\t$cnv_exon_end\tNA\t$p_value\tNA\tNA\t", + "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t$cnv_phase\n"); + } + else{ # loss + #translate genome coordinates into cDNA coordinates + record_snv("$target_parent\t", + ($non_coding ? "g.".($cnv_exon_start > $cnv_start ? "$cnv_exon_start-?" : $cnv_start)."_".($cnv_exon_end < $cnv_end ? "$cnv_exon_end+?" : $cnv_end) : + "c.$cnv_cdna_start".($off5?"-?":"")."_$cnv_cdna_end".($off3?"+?":"")), + "del\t$strand\t$cnv_chr\t$cnv_exon_start\t$cnv_exon_end\t", ($ploidy == 1 ? "heterozygote" : "homozygote"), "\t$p_value\tNA\tNA\t", + "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t$cnv_phase\n"); + } + } + } + close(CNV); + +} + + +#sort genes by start, then longest if tied +my %rc = qw(A T T A G C C G N N S W W S K M M K Y R R Y X X); +print STDERR "Processing variant calls..." unless $quiet; +%chr_read = (); +open(VCFIN, $input_file) + or die "Cannot open $input_file for reading: $!\n"; +while(<VCFIN>){ + if(/^\s*(?:#|$)/){ # blank or hash comment lines + next; + } + chomp; + my @fields = split /\t/, $_; + + next unless exists $feature_range{$fields[0]}; + if(not $quiet and not exists $chr_read{$fields[0]}){ + print STDERR " $fields[0]"; + $chr_read{$fields[0]} = 1; + #print STDERR "(not in reference file!)" unless exists $feature_range{$fields[0]}; + } + + next if $fields[4] eq "<NON_REF>"; # GVCF background stuff + next if $fields[9] eq "./." or $fields[9] eq "."; # no call + my $chr = $fields[0]; + next if defined $which_chr and $chr ne $which_chr and $chr ne "chr$which_chr" and "chr$chr" ne $which_chr; + print STDERR "passes chr and field # test" if grep /dataset_7684.dat/, @ARGV; + $chr = "chr$chr" if $chr !~ /^chr/; + $fields[8] =~ s/\s+$//; + my @values = split /:/, $fields[9]; + #print STDERR join(" / ", @values), "\n" if $. == 84; + my @keys = split /:/, $fields[8]; + my $zygosity = "n/a"; + my $quality = "n/a"; + my $read_depth = "n/a"; + my $variant_depths = "n/a"; + for(my $i = 0; $i <= $#keys and $i <= $#values; $i++){ # values max index check because some genotypers are nasty and don't provide as many fields as they say they will + if($keys[$i] eq "GT"){ + if($values[$i] =~ /\./ or $values[$i] =~ /0\/0/){ # one genotype not described + $zygosity = "none"; + last; + } + else{ # genotypes described + $zygosity = $values[$i] =~ /[02]/ ? "heterozygote" : "homozygote"; + } + } + elsif($keys[$i] eq "DNM_CONFIG" and $zygosity eq "n/a"){ # hack + $zygosity = $values[$i] =~ /^(.)\1/ ? "homozygote" : "heterozygote"; + } + elsif($keys[$i] eq "GQ" and $values[$i] ne "."){ + #print "Checking GQ (index $i) $values[$i] gq2p\n" if $. == 84; + $quality = gq2p($values[$i]); + } + elsif($keys[$i] eq "RD"){ # from some tools like denovo variant finders + $read_depth = $values[$i]; + } + elsif($keys[$i] eq "DP"){ + $read_depth = $values[$i]; + } + # the frequency of the variant can go by many names, here we have freebayes (A* are new and old versions) and atlas2_indel + elsif($keys[$i] eq "AA" or $keys[$i] eq "VR" or $keys[$i] eq "AO"){ + $variant_depths = $values[$i]; + } + # here we have GATK variant freq of form ref#,var# + elsif($keys[$i] eq "AD"){ + $variant_depths = $values[$i]; + $variant_depths =~ s/^\d+,//; + } + else{ + #print STDERR "Ignoring field $keys[$i]\n"; + } + } + next if $zygosity eq "none"; # GVCF non-ref for example or when multiple samples are reported simultaneously + $quality = z2p($1) if $fields[7] =~ /Z=(\d+\.\d+)/; + if($quality eq "n/a" and $fields[5] ne "."){ + $quality = gq2p($fields[5]); + } + if($fields[5] eq "0" and $fields[6] eq "PASS"){ # not qual and a PASS in the filter column + $quality = 1; + } + elsif($quality ne "n/a" and $quality > $min_pvalue){ # p-value cutoff + #print "Checking call quality $fields[5] gq2p\n" if $. == 84; + next unless gq2p($fields[5]) <= $min_pvalue; # in some cases, programs like FreeBayes give low genotype quality such as when a call is borderline homo/het + # in these cases it would be silly to reject the call if their are many supporting reads. + } + + # Some tools like GATK don't report number of variant reads...infer from other data if possible + if($variant_depths eq "n/a"){ + my @key_value_pairs = split /;/, $fields[7]; + for my $key_value_pair (@key_value_pairs){ + if($key_value_pair !~ /^(.*?)=(.*)$/){ + next; + #next if $key_value_pair eq "INDEL"; # samtools peculiarity + #die "Key-value pair field (column #8) does not have the format key=value for entry $key_value_pair (line #$. of ), please fix the VCF file\n"; + } + if($1 eq "AB"){ # GATK: for het calls, AB is ref/(ref+var), only one variant reported per line + $variant_depths = ""; + for my $ab (split /,/, $2){ + $variant_depths .= int((1-$ab)*$read_depth).","; + } + chop $variant_depths; + } + elsif($1 eq "MLEAC"){ + } + elsif($1 eq "DP4"){ # samtools: high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases + my @rds = split /,/, $2; + $variant_depths = $rds[2]+$rds[3]; + $read_depth = $rds[0]+$rds[1]+$variant_depths; + if($fields[4] =~ /,/){ # samtools doesn't break down compound het calls into individual frequencies + my $num_alt_genotypes = $fields[4] =~ tr/,/,/; + $num_alt_genotypes++; + my $even_prop = sprintf "%.2f", $variant_depths/$read_depth/$num_alt_genotypes; + $variant_depths = join(",", ($even_prop) x $num_alt_genotypes); + if(not exists $chr2caveats{"$chr:$fields[1]"}){ + $chr2caveats{"$chr:$fields[1]"} = "compound het var freq n/a in orig VCF file, auto set to $even_prop"; + } + else{ + $chr2caveats{"$chr:$fields[1]"} .= "; compound het var freq n/a in orig VCF file, auto set to $even_prop"; + } + } + } + } + } + if($variant_depths eq "n/a"){ # usually homo var calls, can only assume all reads are variant + if($zygosity eq "homozygote"){ + $variant_depths = $read_depth; + if($read_depth ne "n/a"){ + if(not exists $chr2caveats{"$chr:$fields[1]"}){ + $chr2caveats{"$chr:$fields[1]"} = "homo var freq n/a in orig VCF file, auto set to 1.0"; + } + else{ + $chr2caveats{"$chr:$fields[1]"} = "; homo var freq n/a in orig VCF file, auto set to 1.0"; + } + } + } + else{ + if($read_depth ne "n/a"){ + $variant_depths = int($read_depth/2); + if(not exists $chr2caveats{"$chr:$fields[1]"}){ + $chr2caveats{"$chr:$fields[1]"} = "het var freq n/a in orig VCF file, auto set to 0.5"; + } + else{ + $chr2caveats{"$chr:$fields[1]"} = "; het var freq n/a in orig VCF file, auto set to 0.5"; + } + } + else{ + $variant_depths = $read_depth; + } + } + } + + my $target_parents = $feature_intervaltree{$chr}->fetch($fields[1]-$flanking_bases, $fields[1]+length($fields[3])+$flanking_bases); + # Last ditch, if not in a gene model, is it at least in an enrichment region? + if(not @$target_parents){ + next if not exists $enrichment_regions{$chr}; + my $regions_ref = $enrichment_regions{$chr}; + my $location = $fields[1]; + my $strand = "+"; # for lack of a better choice + for(my $i = find_earliest_index($location-$flanking_bases, $regions_ref); + $i < $#$regions_ref and $location <= $regions_ref->[$i]->[1]+$flanking_bases; + $i++){ + next unless $regions_ref->[$i]->[0]-$flanking_bases <= $location and $regions_ref->[$i]->[1]+$flanking_bases >= $location; + + my $feature_name = "enrichment_target_$chr:".$regions_ref->[$i]->[0]."-".$regions_ref->[$i]->[1]; + $feature_type{$feature_name} = "misc_enrichment_kit_target"; + $feature_length{$feature_name} = $regions_ref->[$i]->[1]-$regions_ref->[$i]->[0]+1; + my @variants = split /,/, $fields[4]; + my @variant_depths = split /,/, $variant_depths; + my $ref = uc($fields[3]); + for(my $j = 0; $j <= $#variants; $j++){ + my $variant = $variants[$j]; + next if $variant eq "<NON_REF>" or $variant_depths[$j] eq "0"; # GVCF stuff + my $variant_depth = $variant_depths[$j]; + if($min_prop){ + next unless $variant_depth >= $min_depth and $read_depth ne "n/a" and $variant_depth/$read_depth >= $min_prop; + } + if(length($ref) == 1 and length($variant) == 1){ # SNP + record_snv("$feature_name\tg.$location", + "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n"); + } + elsif(length($ref) == 1 and length($variant) > 1){ # insertion + record_snv("$feature_name\tg.$location\_",($location+1), + "ins",substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n"); + } + elsif(length($variant) == 1 and length($ref) > 1){ # deletion + record_snv("$feature_name\tg.$location\_",($location+length($ref)-1), + "del",substr($ref, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n"); + } + else{ # indel + record_snv("$feature_name\tg.$location\_",($location+length($ref)-1), + "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n"); + } + } # end for variants + next; # process next record, we've done all we can with a non-coding-gene SNP + } + } + + for my $target_parent (@$target_parents){ + + print STDERR "Checking parent $target_parent for on $chr:$fields[1] $fields[3] -> $fields[4]\n" if grep /dataset_7684.dat/, @ARGV; + my @feature_ranges = @{$feature_range{$chr}->{$target_parent}}; + # Calculate the position of the change within the feature range position + my $strand = $feature_strand{$target_parent}; + my $trans_table = exists $feature_transl_table{$target_parent} ? $feature_transl_table{$target_parent} : $default_transl_table; + $fields[4]=~tr/"//d; # sometime strangely surroundsed by quotes + my @variants = split /,/, $fields[4]; + my @variant_depths = split /,/, $variant_depths; + my @refs = (uc($fields[3])) x scalar(@variants); + my @locations = ($fields[1]) x scalar(@variants); + + for(my $j = 0; $j <= $#variants; $j++){ + my $ref = $refs[$j]; + my $location = $locations[$j]; + my $feature_offset = 0; + my $feature_num = 0; + my $variant = uc($variants[$j]); + next if $variant eq "<NON_REF>" or $variant_depths[$j] eq "0"; # GVCF stuff + my $variant_depth = $variant_depths[$j] || "n/a"; + #print STDERR "Evaluating target parent $target_parent for $chr:$fields[1]-".($fields[1]+length($fields[3]))." -> ",join("/", @$target_parents) , "\n" if $fields[1] == 982994; + + # Break down MNPs into individual SNPs that are phased (TODO: skip if it's an inversion? would require amalgamating SNPs for tools that call them individually, phased :-P) + if(length($variant) > 1 and length($variant) == length($ref)){ + my @subvariants; + my @subrefs; + my @sublocations; + my $phase_range = $location."-".($location+length($ref)-1); + for(my $k = 0; $k < length($variant); $k++){ + my $r = substr($ref, $k, 1); + my $v = substr($variant, $k, 1); + if($r ne $v){ + push @subvariants, $v; + push @subrefs, $r; + push @sublocations, $location+$k; + } + elsif(@variants == 1){ + next; # homo ref call + } + if($zygosity eq "heterozygote"){ + # need to ignore case where a homozygous call (variant or ref) is included in a double non-ref het MNP + if(@variants > 1){ + my $homo = 1; + for(my $l = 0; $l <= $#variants; $l++){ # using loop in case we handle oligoploid genomes in the future + if(length($variants[$l]) <= $k or substr($variants[$l], $k, 1) ne $v){ + $homo = 0; + last; + } + } + next if $homo; + } + my $phase_key = "$chr:".($location+$k).":$v"; + my $phase_label = "M$j-$chr:$phase_range"; + if(exists $chr2phase{$phase_key}){ + if($chr2phase{$phase_key} !~ /$phase_label/){ + $chr2phase{$phase_key} .= ",$phase_label"; + } + } + else{ + $chr2phase{$phase_key} = $phase_label; + } + } + } + # recycle this MNP variant loop to start processing the individual SNPs + splice(@refs, $j, 1, @subrefs); + splice(@variants, $j, 1, @subvariants); + splice(@locations, $j, 1, @sublocations); + splice(@variant_depths, $j, 1, ($variant_depth) x scalar(@subvariants)); + $j--; + next; + } + + if($min_prop != 0 and $variant_depth eq "n/a" or $variant_depth eq "."){ + print STDERR "Could not parse variant depth from $_\n" unless $quiet; + next; + } + next unless $min_prop == 0 or $min_prop and $variant_depth >= $min_depth and $read_depth ne "n/a" and $variant_depth/$read_depth >= $min_prop; + if($zygosity eq "NA"){ + # make the call ourselves + $zygosity = $variant_depths/$read_depth > 1-$min_prop ? "homozygote" : "heterozygote"; + } + # e.g. chr22 47857671 . CAAAG AAGAT,AAAAG 29.04 . + if(length($variant) > 1 and length($variant) == length($ref)){ + for(my $k = 0; $k < length($variant); $k++){ + my $fixed_variant = $variant; + substr($fixed_variant, $k, 1) = substr($ref, $k, 1); + if($fixed_variant eq $ref){ # single base difference at base k between the two + $ref = substr($ref, $k, 1); + $variant = substr($variant, $k, 1); + $location += $k; + last; + } + } + } + + # samtools reports indels with long common tails, causing non-canonical HGVS reporting and a problem when looking up the variant in dbSNP etc. + # remove common tails to variant calls in order to try to rectify this + else{ + while(length($ref) > 1 and length($variant) > 1 and substr($ref, length($ref)-1) eq substr($variant, length($variant)-1)){ + chop $ref; chop $variant; + } + } + + # See if a caveat should be added because the indel was in a polyhomomer region + if(length($ref) > length($variant) and index($ref, $variant) == 0 and $fasta_index->fetch("$chr:".($location+1)."-".($location+length($ref)+1)) =~ /^([ACGT])\1+$/i){ + if(not exists $chr2caveats{"$chr:$location"}){ + $chr2caveats{"$chr:$location"} = "poly".uc($1)." region deletion"; + } + elsif($chr2caveats{"$chr:$location"} !~ /poly/){ + $chr2caveats{"$chr:$location"} .= "; poly".uc($1)." region deletion"; + } + } + elsif(length($ref) < length($variant) and index($variant, $ref) == 0 and substr($variant, 1) =~ /^([ACGT])\1+$/i){ + if(not exists $chr2caveats{"$chr:$location"}){ + $chr2caveats{"$chr:$location"} .= "poly".uc($1)." region insertion"; + } + elsif($chr2caveats{"$chr:$location"} !~ /poly/){ + $chr2caveats{"$chr:$location"} .= "; poly".uc($1)." region insertion"; + } + } + + # Not a protein-coding gene? Report genomic cooordinates for HGVS + if(not exists $feature_cds_max{$target_parent} or not defined $feature_cds_max{$target_parent} or $feature_cds_max{$target_parent} eq ""){ + if(length($ref) == 1 and length($variant) == 1){ # SNP + record_snv("$target_parent\tg.$location", + "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n"); + } + elsif(length($ref) == 1 and length($variant) > 1){ # insertion + record_snv("$target_parent\tg.$location\_",($location+1), + "ins",substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n"); + } + elsif(length($variant) == 1 and length($ref) > 1){ # deletion + record_snv("$target_parent\tg.$location\_",($location+length($ref)-1), + "del",substr($ref, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n"); + } + else{ # indel + record_snv("$target_parent\tg.$location\_",($location+length($ref)-1), + "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n"); + } + next; # process next record, we've done all we can with a non-coding-gene SNP + } + + if($strand eq "-"){ + # set up utr offset for correct CDS coordinates + for(my $i = $#feature_ranges; $i >= 0; $i--){ + # exon is completely 5' of the start + if($feature_ranges[$i]->[0] > $feature_cds_max{$target_parent}){ + #print STDERR "Whole 5' UTR exon $i: ",$feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1,"\n"; + $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1; + } + # exon with the cds start + elsif($feature_ranges[$i]->[1] >= $feature_cds_max{$target_parent} and + $feature_ranges[$i]->[0] <= $feature_cds_max{$target_parent}){ + #print STDERR "Start codon in exon $i: ", $feature_cds_max{$target_parent} - $feature_ranges[$i]->[1],"\n"; + $feature_offset += $feature_cds_max{$target_parent} - $feature_ranges[$i]->[1]; + last; + } + else{ + die "The CDS for $target_parent (on negative strand) ends downstream ", + "($feature_cds_max{$target_parent}) of the an exon", + " (", $feature_ranges[$i]->[0], "), which is illogical. Please revise the GFF file provided.\n"; + } + } + for(my $i = $#feature_ranges; $i >= 0; $i--){ + my $feature = $feature_ranges[$i]; + # in the 3' UTR region of the gene + if($location < $feature_cds_min{$target_parent}){ + my $feature_exon = 0; + $feature = $feature_ranges[$feature_exon]; + while($location > $feature->[1]+$flanking_bases and + $feature_exon < $#feature_ranges){ + $feature = $feature_ranges[++$feature_exon]; # find the 3' utr exon in which the variant is located + } + # utr exons passed entirely + my $stop_exon = $feature_exon; + while($feature_ranges[$stop_exon]->[1] < $feature_cds_min{$target_parent}){ + $stop_exon++; + } + my $post_offset = $feature_cds_min{$target_parent}-$feature_ranges[$stop_exon]->[0]; # offset from the stop codon in the final coding exon + while($stop_exon > $feature_exon){ + $post_offset += $feature_ranges[$stop_exon]->[1]-$feature_ranges[$stop_exon]->[0]+1; + $stop_exon--; + } + + my $pos = $feature->[1]-$location+1+$post_offset; + my $junction_dist; + if($location < $feature->[0]){ # after a UTR containing exon? set as .*DD+DD + $junction_dist = ($feature->[0]-$location); + $pos = ($post_offset+$feature->[1]-$feature->[0]+1)."+".$junction_dist; + } + elsif($location > $feature->[1]){ # before a total UTR exon? set as .*DD-DD + $junction_dist = -($location-$feature->[1]); + $pos = ($post_offset+1).$junction_dist; + } + else{ # in the UTR exon + if($location - $feature->[0] < $feature->[1] - $location){ # location is closer to exon donor site + $junction_dist = -($location - $feature->[0]+1); # +1 for HGVS syntax compatibility (there is no position 0, direct skip from -1 to +1) + } + else{ + $junction_dist = $feature->[1] - $location +1; + } + } + if(length($ref) == 1 and length($variant) == 1){ + my $rc = join("",map({$rc{$_}} split(//,reverse($variant)))); + # 3' UTR SNP + record_snv("$target_parent\tc.*$pos", + "$rc{$ref}>$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + #"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t",prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n"); + } + elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){ + my $rc = join("",map({$rc{$_}} split(//,reverse(substr($variant,1))))); + # 3' UTR insertion + record_snv("$target_parent\tc.*", + hgvs_plus($pos,-1),"_*",$pos, + "ins$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + #"ins",substr($variant,1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n"); + } + elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){ + my $rc = join("",map({$rc{$_}} split(//,reverse($ref)))); + my $delBases = substr($rc,0,length($rc)-1); + if(length($ref) == 2){ + # 3' UTR single base deletion + record_snv("$target_parent\tc.*",hgvs_plus($pos,-1), + "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n"); + } + else{ + # longer 3' UTR deletion + record_snv("$target_parent\tc.*", + hgvs_plus($pos,-length($ref)+1),"_*",hgvs_plus($pos, -1), + "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n"); + } + } + else{ + my $rc = join("",map({$rc{$_}} split(//,reverse($variant)))); + if($rc eq $ref and length($variant) > 3){ + # 3' UTR inversion + record_snv("$target_parent\tc.*", + hgvs_plus($pos,-length($ref)+1),"_*",$pos, + "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n"); + last; + } + + # complex substitution in 3' UTR + # Will break if stop codon is modified + record_snv("$target_parent\tc.*", + hgvs_plus($pos,-length($ref)+1),"_*", $pos, + "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n"); + } + last; + } + # in the feature + elsif($location >= $feature->[0] and $location <= $feature->[1]){ + my $pos = $feature_offset+$feature->[1]-$location+1; + if($location > $feature_cds_max{$target_parent}){ #since there is no position 0, the pos is in UTR, subtract one + $pos = hgvs_plus($pos, -1); + } + my $first_exon_base = $feature_offset+1; + my $exon_edge_dist = $feature->[1]-$location+1; # equiv of HGVS +X or -X intron syntax, but for exons + $exon_edge_dist = $feature->[0]-$location-1 if abs($feature->[0]-$location-1) < $exon_edge_dist; # pick closer of donor and acceptor sites + #print STDERR "Got ", $feature->[1]-$location+1, "vs. ", $feature->[0]-$location-1, ": chose $exon_edge_dist\n"; + if(length($ref) == 1 and length($variant) == 1){ + # SNP + record_snv("$target_parent\tc.", + $pos, + "$rc{$ref}>$rc{$variant}\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", + ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); + #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); + } + elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){ + my $rc = join("",map({$rc{$_}} split(//,reverse($variant)))); + my $insBases = substr($rc,1); + # insertion + record_snv("$target_parent\tc.", + hgvs_plus_exon($pos, -1, $first_exon_base),"_",$pos,"ins$insBases", + "\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", + ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); + #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); + } + elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){ + my $rc = join("",map({$rc{$_}} split(//,reverse($ref)))); + my $delBases = substr($rc,0,length($rc)-1); + # single nucleotide deletion + if(length($ref) == 2){ + record_snv("$target_parent\tc.", + hgvs_plus_exon($pos, -1, $first_exon_base), + "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", + ($pos-1 < 1 ? "NA" : $pos-1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); + #($pos-1 < 1 ? "NA" : $pos-1 < $first_exon_base ? "p.?" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); + } + # longer deletion + else{ + $exon_edge_dist = $feature->[1]-$location-length($ref)+1 if abs($feature->[1]-$location-length($ref)+1) < $exon_edge_dist; + record_snv("$target_parent\tc.", + hgvs_plus_exon($pos, -length($ref)+1, $first_exon_base),"_", + hgvs_plus_exon($pos, -1, $first_exon_base), + "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", + ($pos-1 < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); + #($pos-1 < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); + } + } + else{ + # complex substitution + $exon_edge_dist = $feature->[1]-$location-length($ref)+1 if abs($feature->[1]-$location-length($ref)+1) < $exon_edge_dist; + my $rc = join("",map({$rc{$_}} split(//,reverse($variant)))); + if($rc eq $variant and length($variant) > 3){ + # inversion + record_snv("$target_parent\tc.", + hgvs_plus_exon($pos,-length($ref)+1,$first_exon_base),"_", + $pos, + "inv", + "\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", + ($pos < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); + + last; + } + record_snv("$target_parent\tc.", + hgvs_plus_exon($pos,-length($ref)+1,$first_exon_base),"_", + $pos, + "delins$rc", + "\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", + ($pos < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); + #($pos < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); + } + last; + } + # 5' of feature (on negative strand) + elsif($location > $feature->[1]){ + if(length($ref) == 1 and length($variant) == 1){ + # intronic SNP + if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){ + # Closer to acceptor site + record_snv("$target_parent\tc.",$feature_offset+1, + ($feature->[1]-$location), + "$rc{$ref}>$rc{$variant}\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + #"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant, $feature->[1]-$location)),"\tNA\n"); + } + else{ + # Closer to donor site + record_snv("$target_parent\tc.",$feature_offset,"+", + ($feature_ranges[$i+1]->[0]-$location), + "$rc{$ref}>$rc{$variant}\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + #"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant, $feature_ranges[$i+1]->[0]-$location)),"\tNA\n"); + } + } + elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){ + my $rc = join("",map({$rc{$_}} split(//,reverse(substr($variant,1))))); + if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){ + # intronic insertion near acceptor + my $pos = ($feature_offset+1).($feature->[1]-$location); + record_snv("$target_parent\tc.", + hgvs_plus($pos,-1),"_",$pos, + "ins", + $rc,"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + #substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$feature->[1]-$location-1)),"\tNA\n"); + } + else{ + # intronic insertion near donor + my $pos = $feature_offset."+".($feature_ranges[$i+1]->[0]-$location); + record_snv("$target_parent\tc.", + hgvs_plus($pos,-1),"_",$pos, + "ins", + $rc,"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + #substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$feature_ranges[$i+1]->[0]-$location+1)),"\tNA\n"); + } + } + elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){ + # intronic deletion + # single nucleotide deletion + my $rc = reverse($ref); + $rc=~tr/ACGT/TGCA/; + my $delBases = substr($rc, 0, length($rc)-1); + if(length($ref) == 2){ + # single intronic deletion near acceptor + if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){ + my $off = $feature->[1]-$location-1; + record_snv("$target_parent\tc.", + ($feature_offset+1),$off, + "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off >= -2 ? "p.?" : "NA"),"\n"); + } + # single intronic deletion near donor + else{ + my $pos = $feature_offset; + my $off = $feature_ranges[$i+1]->[0]-$location+1; + record_snv("$target_parent\tc.", + hgvs_plus_exon($pos, $off, $pos), + "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off <= 2 ? "p.?" : "NA"),"\n"); + } + } + # longer deletion + else{ + if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){ + # long intronic deletion near acceptor + my $off = $feature->[1]-$location-1; + my $pos = ($feature_offset+1).$off; + record_snv("$target_parent\tc.", + hgvs_plus($pos,-length($ref)+2),"_",$pos, + "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off >= -2 ? "p.?" : "NA"),"\n"); + } + else{ + # long intronic deletion near donor + my $off = $feature_ranges[$i+1]->[0]-$location+1; + my $pos = ($feature_offset)."+".$off; + record_snv("$target_parent\tc.", + $pos,"_",hgvs_plus($pos,-length($ref)-1), + "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off-length($ref)+1 <= 2 ? "p.?" : "NA"),"\n"); + } + last; + } + } + else{ + my $rc = reverse($ref); + $rc=~tr/ACGT/TGCA/; + if($rc eq $variant and length($variant) > 3){ + # intronic inversion near acceptor site + if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){ + my $pos = ($feature_offset+1).($feature->[1]-$location); + record_snv("$target_parent\tc.", + hgvs_plus($pos,-length($ref)+1),"_",$pos, + "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$feature->[1]-$location)),"\tNA\n"); + } + else{ + my $pos = ($feature_offset)."+".($feature_ranges[$i+1]->[0]-$location); + record_snv("$target_parent\tc.", + $pos,"_",hgvs_plus($pos, length($ref)-1), + "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$feature_ranges[$i+1]->[0]-$location)),"\tNA\n"); + } + last; + } + $rc = reverse($variant); + $rc=~tr/ACGT/TGCA/; + # Intronic complex substitution + if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){ + # complex intronic substitution near acceptor site + my $pos = ($feature_offset+1).($feature->[1]-$location); + record_snv("$target_parent\tc.", + hgvs_plus($pos, -length($ref)+1),"_",$pos, + "delins$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + #"delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$feature->[1]-$location)),"\tNA\n"); + } + else{ + # complex intronic substitution near donor site + my $pos = $feature_offset."+".($feature_ranges[$i+1]->[0]-$location); + record_snv("$target_parent\tc.", + $pos,"_",hgvs_plus($pos, length($ref)-1), + "delins$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + #"delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$feature_ranges[$i+1]->[0]-$location)),"\tNA\n"); + } + } + last; + } + else{ + #print STDERR "Offset going from ", $feature_offset, " to ", $feature_offset+$feature->[1]-$feature->[0]+1,"\n"; + $feature_offset += $feature->[1]-$feature->[0]+1; + #print STDERR "Set feature offset to $feature_offset by adding ",$feature->[1],"-", $feature->[0],"+1\n"; + } + } + } + else{ + # forward strand + + # set up utr offset for correct CDS coordinates + for(my $i = 0; $i <= $#feature_ranges; $i++){ + # All 5' utr exon + if($feature_ranges[$i]->[1] < $feature_cds_min{$target_parent}){ + $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1; + } + # exon with the cds start + elsif($feature_ranges[$i]->[1] >= $feature_cds_min{$target_parent} and + $feature_ranges[$i]->[0] <= $feature_cds_min{$target_parent}){ + $feature_offset -= $feature_cds_min{$target_parent} - $feature_ranges[$i]->[0]; + last; + } + else{ + die "The CDS for $target_parent starts upstream ($feature_cds_max{$target_parent}) of the first exon", + " (", $feature_ranges[$i]->[0], "), which is illogical. Please revise the GFF file provided.\n"; + } + } + for(my $i = 0; $i <= $#feature_ranges; $i++){ + my $feature = $feature_ranges[$i]; + # 3' of last coding position + if($location > $feature_cds_max{$target_parent}){ + # find the exon with the stop codon + while($feature->[1] < $feature_cds_max{$target_parent}){ + $feature = $feature_ranges[++$i]; + } + my $post_offset = $feature->[0]-$feature_cds_max{$target_parent}; + while($location > $feature->[1]+$flanking_bases and + $i < $#feature_ranges){ + $post_offset += $feature->[1]-$feature->[0]+1; + $feature = $feature_ranges[++$i]; # find the 3' utr exon in which the variant is located + } + my $pos = $location-$feature->[0]+$post_offset; + #print STDERR "Positive strand: got $pos for $location, exon #$i of $#feature_ranges, post_offset is $post_offset\n" if $location-$feature->[1] > $flanking_bases; + my $off; + if($location > $feature->[1]){ # after a UTR containing exon? set as .*DD+DD + $off = $location-$feature->[1]; + $pos = ($post_offset+$feature->[1]-$feature->[0]+1)."+".$off; + } + elsif($location < $feature->[0]){ # before a total UTR exon? set as .*DD-DD + $off = -($feature->[0]-$location); + $pos = ($post_offset+1).$off; + } + else{ + if($location-$feature->[0] < $feature->[1]-$location){ + $off = $location-$feature->[0]+1; # +1 since HGVS skips right from -1 to +1 at exon boundary + } + else{ + $off = $location-$feature->[1]-1; # will be negative + } + } + if(length($ref) == 1 and length($variant) == 1){ + # 3' UTR SNP + record_snv("$target_parent\tc.*$pos", + "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant, $off)),"\tNA\n"); + } + elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){ + # 3' UTR insertion + record_snv("$target_parent\tc.*", + hgvs_plus($pos,1),"_*",hgvs_plus($pos,2), + "ins",substr($variant,1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant, $off)),"\tNA\n"); + } + elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){ + my $delBases = substr($ref, 1); + if(length($ref) == 2){ + # 3' UTR single base deletion + record_snv("$target_parent\tc.*",hgvs_plus($pos,1), + "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\tNA\n"); + } + else{ + # longer 3' UTR deletion + record_snv("$target_parent\tc.*", + hgvs_plus($pos,1),"_*",hgvs_plus($pos,length($ref)-1), + "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant, $off)),"\tNA\n"); + } + } + else{ + my $rc = reverse($ref); + $rc=~tr/ACGT/TGCA/; + if($rc eq $variant and length($variant) > 3){ + # 3' UTR inversion + record_snv("$target_parent\tc.*$pos", + "_*",hgvs_plus($pos,length($ref)-1), + "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\tNA\n"); + last; + } + # complex substitution in 3' UTR + record_snv("$target_parent\tc.*$pos", + "_*",hgvs_plus($pos,length($ref)-1), + "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\tNA\n"); + } + last; + } + # in the exon + elsif($location >= $feature->[0] and $location <= $feature->[1]){ + my $pos = $feature_offset+$location-$feature->[0]+1; + my $last_exon_base = $feature_offset+$feature->[1]-$feature->[0]+1; + my $exon_edge_dist = $location-$feature->[0]+1; # equiv of HGVS +X or -X intron syntax, but for exons + $exon_edge_dist = $location-$feature->[1]-1 if abs($location-$feature->[1]-1) < $exon_edge_dist; # pick closer of donor and acceptor sites + #print STDERR "Got ", $location-$feature->[0]+1, "vs. ", $location-$feature->[1]-1, ": chose $exon_edge_dist\n"; + if($location < $feature_cds_min{$target_parent}){ #since there is no position 0, the pos is in UTR, subtract one + $pos = hgvs_plus($pos, -1); + } + if(length($ref) == 1 and length($variant) == 1){ + # SNP + record_snv("$target_parent\tc.$pos", + "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", + ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); + #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); + } + elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){ + # insertion + record_snv("$target_parent\tc.$pos", + "_",hgvs_plus_exon($pos,1,$last_exon_base),"ins", + substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", + ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); + #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); + } + elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){ + my $delBases = substr($ref, 1); + # single nucleotide deletion + if(length($delBases) == 1){ + record_snv("$target_parent\tc.", + hgvs_plus_exon($pos,1,$last_exon_base), + "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", + ($pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); + #($pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); + } + # longer deletion + else{ + $exon_edge_dist = $feature->[1]-$location-length($ref)-1 if abs($feature->[1]-$location-length($ref)-1) < $exon_edge_dist; + record_snv("$target_parent\tc.", + hgvs_plus_exon($pos,1,$last_exon_base),"_", + hgvs_plus_exon($pos,length($ref)-1,$last_exon_base), + "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", + ($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); + #($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); + } + } + else{ + $exon_edge_dist = $feature->[1]-$location-length($ref)-1 if abs($feature->[1]-$location-length($ref)-1) < $exon_edge_dist; + my $rc = reverse($ref); + $rc=~tr/ACGT/TGCA/; + if($rc eq $variant and length($variant) > 3){ + # inversion + record_snv("$target_parent\tc.$pos", + "_",hgvs_plus_exon($pos,length($ref)-1, $last_exon_base), + "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", + ($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); + #($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); + last; + } + # complex substitution + record_snv("$target_parent\tc.$pos", + "_",hgvs_plus_exon($pos, length($ref)-1, $last_exon_base), + "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", + ($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); + #($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); + } + last; + } + # 5' of feature + elsif($location < $feature->[0]){ + if(length($ref) == 1 and length($variant) == 1){ + # intronic SNP + if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){ + # Closer to donor site + record_snv("$target_parent\tc.",$feature_offset,"+", + ($location-$feature_ranges[$i-1]->[1]), + "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n"); + } + else{ + # Closer to acceptor site + record_snv("$target_parent\tc.",$feature_offset+1, + ($location-$feature->[0]), + "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n"); + } + } + elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){ + if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){ + # intronic insertion near donor + my $pos = $feature_offset."+".($location-$feature_ranges[$i-1]->[1]); + record_snv("$target_parent\tc.", + $pos,"_",hgvs_plus($pos,1), + "ins", + substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n"); + } + else{ + # intronic insertion near acceptor + my $pos = ($feature_offset+1).($location-$feature->[0]); + record_snv("$target_parent\tc.", + $pos,"_",hgvs_plus($pos,1), + "ins", + substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n"); + } + } + elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){ + # intronic deletion + # single nucleotide deletion + my $delBases = substr($ref, 1); + if(length($ref) == 2){ + # single intronic deletion near donor + if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){ + my $off = $location-$feature_ranges[$i-1]->[1]+1; + record_snv("$target_parent\tc.", + $feature_offset,"+",$off, + "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off <= 2 ? "p.?" : "NA"),"\n"); + } + # single intronic deletion near acceptor + else{ + my $pos = ($feature_offset+1); + my $off = $location-$feature->[0]; + record_snv("$target_parent\tc.", + hgvs_plus_exon($pos, $off, $pos), + "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off >= -2 ? "p.?" : "NA"),"\n"); + } + } + # longer deletion + else{ + if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){ + # long intronic deletion near donor + my $off = $location-$feature_ranges[$i-1]->[1]+1; + my $pos = $feature_offset."+".$off; + record_snv("$target_parent\tc.", + $pos,"_",hgvs_plus($pos,length($ref)-2), + "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off <= 2 ? "p.?" : "NA"),"\n"); + } + else{ + # long intronic deletion near acceptor + my $off = $location-$feature->[0]+1; + my $pos = ($feature_offset+1).$off; + record_snv("$target_parent\tc.", + $pos,"_",hgvs_plus($pos,length($ref)-2), + "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off+length($ref)-2 >= -2 ? "p.?" : "NA"),"\n"); + } + } + } + else{ + my $rc = reverse($ref); + $rc=~tr/ACGT/TGCA/; + if($rc eq $variant and length($variant) > 3){ + # intronic inversion near donor site + if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){ + my $pos = $feature_offset."+".($location-$feature_ranges[$i-1]->[1]); + record_snv("$target_parent\tc.", + $pos,"_",hgvs_plus($pos,length($ref)-1), + "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n"); + } + else{ + my $pos = ($feature_offset+1).($location-$feature->[0]); + record_snv("$target_parent\tc.", + $pos,"_",hgvs_plus($pos, length($ref)-1), + "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n"); + } + last; + } + # Intronic complex substitution + # Note: sub maybe have comma in it to denote two possible variants + if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){ + # complex intronic substitution near donor site + my $pos = $feature_offset."+".($location-$feature_ranges[$i-1]->[1]); + record_snv("$target_parent\tc.", + $pos,"_",hgvs_plus($pos, length($ref)-1), + "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n"); + } + else{ + # complex intronic substitution near acceptor site + my $pos = ($feature_offset+1).($location-$feature->[0]); + record_snv("$target_parent\tc.", + $pos,"_",hgvs_plus($pos, length($ref)-1), + "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", + join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n"); + } + } + last; + } + else{ + # feature is past this exon + $feature_offset += $feature->[1]-$feature->[0]+1; + } + } + } + } # for each variant on the line + } # for each gene overlapping the site the VCF describes +} # for each VCF line +print STDERR "\n" unless $quiet; +close(VCFIN); + +# Before we can start printing the variants, we need to look at the phase information and calculate the real haplotype HGVS changes +#if(keys %chr2phase){ + # Note that we could have samtools read-backed haplotype info, MNPs in the VCF, and pre-existing haplotypes in the input VCF (e.g. imputed or based on Mendelian inheritance where trios exist) + # We need to create new disjoint sets of phased blocks from the (consistent) union of these data. +# my $chr2phase2variants = combine_phase_data(\%chr2phase); + + # TODO: Calculate protein HGVS syntax for each variant, now that all phase data has been incorporated + #for my $chr (keys %$chr2phase2variants){ + # for my $phase (keys %{$chr2phase2variants{$chr}){ + # # apply all phased changes to the reference chromosomal seq + # my $phased_seq = $seq{$chr}; #reference + # # sort the variants from 3' to 5' so that edits after indels don't need adjustment in their ref coordinate + # my @sorted_variants = sort {my($a_pos) = $a =~ /:(\d+):/; my($b_pos) = $b =~ /:(\d+):/; $b_pos <=> $a_pos} @{$chr2phase2variants{$chr}->{$phase}}; + # for my $variant (@sorted_variants){ + # } + # } + #} +#} + +# retrieve the MAF info en masse for each chromosome, as this is much more efficient +my @out_lines; +for my $snv (@snvs){ + chomp $snv; + my @fields = split /\t/, $snv; + # For CNVs, all the fields are already filled out + if(@fields > 13){ + push @out_lines, join("\t", $feature_type{$fields[0]}, ($fields[0] =~ /\S/ ? $feature_length{$fields[0]} : "NA"), @fields); + next; + } + my $variant_key = $fields[9]; + $fields[9] = join("\t", prop_info($dbsnp,$internal_snp,$variant_key)); + my $from = $fields[4]; + my $chr_pos_key = $fields[3].":".$from; + my $to = $fields[4]; # true for SNPs and insertions + my @variant_key = split /:/, $variant_key; + # For deletions and complex variants, calculate the affected reference genome range and set the 'to' + if(length($variant_key[2]) > 1){ + $to += length($variant_key[2])-1; + } + splice(@fields, 5, 0, $to); + + # Otherwise expand the key into the relevant MAF values + $fields[0] =~ s/\/chr.*$//; # some transcript ids are repeated... we expect "id/chr#" in the GTF file to distinguish these, but should get rid of them at reporting time + # the offset from the nearest exon border if coding + push @fields, ($#variant_key > 3 ? $variant_key[4] : ""); + # add gene name(s) + push @fields, range2genes($fields[3], $from, $to+1); + # add caveats + my $c = $fields[3]; + if(not exists $chr2mappability{$c}){ + if($c =~ s/^chr//){ + # nothing more + } + else{ + $c = "chr$c"; + } + } + my $mappability_caveats = exists $chr2mappability{$c} ? $chr2mappability{$c}->fetch($fields[4], $fields[4]+1) : []; + if(ref $mappability_caveats eq "ARRAY" and @$mappability_caveats){ + my %h; + if(exists $chr2caveats{$chr_pos_key}){ + if($chr2caveats{$chr_pos_key} !~ /non-unique/){ + $chr2caveats{$chr_pos_key} = join("; ", grep {not $h{$_}++} @$mappability_caveats)."; ".$chr2caveats{$chr_pos_key}; + } + } + else{ + $chr2caveats{$chr_pos_key} = join("; ", grep {not $h{$_}++} @$mappability_caveats); + } + } + push @fields, (exists $chr2caveats{$chr_pos_key} ? $chr2caveats{$chr_pos_key} : ""); + # add phase + push @fields, find_phase($variant_key); + push @out_lines, join("\t", $feature_type{$fields[0]}, $feature_length{$fields[0]}, @fields); +} + +# Now tabulate the rare variant numbers +my %gene2rares; +my %gene2aa_rares; +for my $line (@out_lines){ + my @F = split /\t/, $line, -1; + if($F[15] eq "NA" or $F[15] < $rare_variant_prop and (!$internal_snp or $F[17] < $rare_variant_prop)){ + my $gene_list = $internal_snp ? $F[20] : $F[19]; + next unless defined $gene_list; + for my $g (split /; /, $gene_list){ + $gene2rares{$g}++; + # Check the cDNA HGVS syntax for relevance + if($F[3] =~ /c.\d+/ or # coding + $F[3] =~ /c.\d+.*-[12]/ or # splicing acceptor + $F[3] =~ /c.\d+\+[12345]/){ # splicing donor + $gene2aa_rares{$g}++; + } + } + } +} + +# Print the results +for my $line (@out_lines){ + my @F = split /\t/, $line, -1; + my $gene_list = $internal_snp ? $F[20] : $F[19]; + if(not defined $gene_list){ + print OUT join("\t", @F, "", ""), "\n"; next; + } + + my $max_gene_rare = 0; + my $max_gene_aa_rare = 0; + for my $g (split /; /, $gene_list){ + next unless exists $gene2rares{$g}; + if($gene2rares{$g} > $max_gene_rare){ + $max_gene_rare = $gene2rares{$g}; + } + next unless exists $gene2aa_rares{$g}; + if($gene2aa_rares{$g} > $max_gene_aa_rare){ + $max_gene_aa_rare = $gene2aa_rares{$g}; + } + } + print OUT join("\t", @F, $max_gene_rare, $max_gene_aa_rare), "\n"; +} +close(OUT); + +sub range2genes{ + my ($c, $from, $to) = @_; + if(not exists $gene_ids{$c}){ + if($c =~ s/^chr//){ + # nothing more + } + else{ + $c = "chr$c"; + } + } + if(exists $gene_ids{$c}){ + my %have; + return join("; ", grep {not $have{$_}++} @{$gene_ids{$c}->fetch($from, $to+1)}); + } + else{ + return ""; + } +} +sub combine_phase_data{ + my ($phases) = @_; # map from variant to its phase data + + # Create a reverse mapping from phase regions to their variants + my %chr2phase_region2variants; + my @variants = keys %$phases; + for my $variant (@variants){ + my ($chr) = $variant =~ /^\S+?-(\S+):/; + $chr2phase_region2variants{$chr} = {} if not exists $chr2phase_region2variants{$chr}; + for my $phase_region (split /,/, $phases->{$variant}){ + $chr2phase_region2variants{$chr}->{$phase_region} = [] if not exists $chr2phase_region2variants{$chr}->{$phase_region}; + push @{$chr2phase_region2variants{$phase_region}}, $variant; + } + } + + # Now for each phased block known so far, see if any variant in it is also part of another block + # If so, do a union since phasing is both transitive and commutative. + # The quickest way to do this is to check for overlapping intervals, then check for common members amongst those that do overlap + for my $chr (keys %chr2phase_region2variants){ + my @ordered_phase_regions = sort {my($a_pos) = $a =~ /:(\d+)/; my($b_pos) = $b =~ /:(\d+)/; $a_pos <=> $b_pos} keys %{$chr2phase_region2variants{$chr}}; + my $sets = new DisjointSets(scalar(@ordered_phase_regions)); + + for (my $i = 0; $i < $#ordered_phase_regions; $i++){ + my ($start, $stop, $variant) = $ordered_phase_regions[$i]; + for (my $j = $i+1; $j <= $#ordered_phase_regions; $j++){ + my ($start2, $stop2, $variant2) = $ordered_phase_regions[$j]; + if($start2 > $stop){ # won't overlap any regions after this in the sorted list + last; + } + # If we get here, it is implicit that $stop >= $start2 and $start < $stop2, i.e. there is overlap + # Now check if there is a shared variant (otherwise we might erroneously join blocks from different physical chromosomal arms) + my $have_shared_variant = 0; + my $overlapping_phase_region = $ordered_phase_regions[$j]; + for my $variant (@{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$i]}}){ + if($phases->{$variant} =~ /\b$overlapping_phase_region\b/){ + $have_shared_variant = 1; last; + } + } + # sanity check that there aren't conflicting variants in the new block (i.e. two different variants in the same position) + my %pos2base; + my $have_conflicting_variant = 0; + for my $variant (@{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$i]}}, @{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$j]}}){ + my ($pos, $base) = $variant =~ /(\d+):(.+?)$/; + if(exists $pos2base{$pos} and $pos2base{$pos} ne $base){ + # conflict, note with a caveat + if(exists $chr2caveats{"$chr:$pos"}){ + $chr2caveats{"$chr:$pos"} .= "; inconsistent haplotype phasing" unless $chr2caveats{"$chr:$pos"} =~ /inconsistent haplotype phasing/; + } + else{ + $chr2caveats{"$chr:$pos"} = "inconsistent haplotype phasing"; + } + $have_conflicting_variant ||= 1; + } + elsif(not exists $pos2base{$pos}){ + $pos2base{$pos} = $base; + } + } + + $sets->union($i+1, $j+1) if $have_shared_variant and not $have_conflicting_variant; # indexes are one-based for sets rather than 0-based + } + } + my $phase_sets = $sets->sets; #disjoint haplotype sets + my %region_counts; + for my $phase_set (@$phase_sets){ + next if scalar(@$phase_set) == 1; # No change to existing phase region (is disjoint from all others) + # Generate a new haploblock to replace the old ones that are being merged + my $merged_start = 10000000000; + my $merged_end = 0; + for my $ordered_phase_region_index (@$phase_set){ + my ($start, $end) = $ordered_phase_regions[$ordered_phase_region_index-1] =~ /(\d+)-(\d+)$/; + $merged_start = $start if $start < $merged_start; + $merged_end = $end if $end > $merged_end; + } + # At the start of the region is a unique prefix so we can tell the arms apart if two haploblocks have the exact same boundary + my $region_count = $region_counts{"$merged_start-$merged_end"}++; + my $merged_haploblock_name = "Y$region_count-$chr:$merged_start-$merged_end"; + # Assign this new name to overwrite the premerge values for each variant contained within + for my $ordered_phase_region_index (@$phase_set){ + for my $variant (@{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$ordered_phase_region_index-1]}}){ # incl. one-based set correction in 0-based array index + print STDERR "Merging $variant from ", $phases->{$variant}, " into new block $merged_haploblock_name\n"; + $phases->{$variant} = $merged_haploblock_name; + } + } + } + # TODO: if there are overlapping phase blocks still, but with different variants in the same position, we can infer that they are on the opposite strands... + } +} + +# Sees if the positions of the variant are in the range of a phased haplotype, returning which allele it belongs to +sub find_phase{ + my ($chr,$pos,$ref,$variant) = split /:/, $_[0]; + return "" if length($ref) != length($variant); # Can only deal with SNPs (and broken down MNPs) for now + for(my $i = 0; $i < length($ref); $i++){ + my $key = "$chr:".($pos+$i).":".substr($variant, $i, 1); + #print STDERR "Checking phase for $key\n" if $pos == 12907379; + if(exists $chr2phase{$key}){ + #print STDERR "returning phase data $chr2phase{$key}\n"; + return $chr2phase{$key}; + } + elsif(exists $chr2phase{"chr".$key}){ + #print STDERR "returning phase data ", $chr2phase{"chr".$key}, "\n"; + return $chr2phase{"chr".$key}; + } + } + return ""; +} + +sub find_earliest_index{ + # employs a binary search to find the smallest index that must be the starting point of a search of [start,end] elements sorted in an array by start + my ($query, $array) = @_; + + return 0 if $query < $array->[0]->[0]; + + my ($left, $right, $prevCenter) = (0, $#$array, -1); + + while(1){ + my $center = int (($left + $right)/2); + + my $cmp = $query <=> $array->[$center]->[0] || ($center == 0 || $query != $array->[$center-1]->[0] ? 0 : -1); + + return $center if $cmp == 0; + if ($center == $prevCenter) { + return $left; + } + $right = $center if $cmp < 0; + $left = $center if $cmp > 0; + $prevCenter = $center; + } +} +