Changeset 0:9977d1935a07 (2015-03-25)

Commit message:
initial commit

added:
AnnotateVariants.xml
hgvs_table_annotate
tool-data/annotate_hgvs.loc

diff -r 000000000000 -r 9977d1935a07 AnnotateVariants.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/AnnotateVariants.xml Wed Mar 25 13:10:47 2015 -0600

@@ -0,0 +1,29 @@
+<?xml version="1.0"?>
+
+<tool id="hgvs_annotation_2" name="Annotate an HGVS table">
+  <requirements>
+    <requirement type="package">genesplicer</requirement>
+     <requirement type="package">human</requirement>
+     <requirement type="package">score3.pl</requirement>
+     <requirement type="package">score5.pl</requirement>
+  </requirements>
+
+  <description>with deleterious AA and splicing change predictions, protein domain disruptions, etc.</description>
+  <version_string>hgvs_table_annotate -v</version_string>
+  <command interpreter="perl">hgvs_table_annotate $__tool_data_path__ -q /sift/hg19 /polyphen2/hg19.txt.gz /dbs/gerp/hg19 /dbs/TissueDistributionDBs/human.v2009-07-30.tab /dbs/pathways/KEGG.human.2012-09-25.txt /dbs/interpro_supermatch_hg19.bed /dbs/OMIM/morbidmap $input_hgvs_table $out_hgvs_annotated_table /hg19.fa</command>
+  <inputs>
+    <param format="achri_snp_table" name="input_hgvs_table" type="data" label="Variant calls table with HGVS syntax"/>
+  </inputs>
+  <outputs>
+    <data format="achri_annotated_snp_table" name="out_hgvs_annotated_table" type="data" label="Functionally annotated HGVS variant table"/>
+  </outputs>
+
+  <tests>
+  </tests>
+
+  <help>
+This tool reports several functional attributes, and potential for functional disturbance, based on genes that have declared sequence variants.
+These results can be used to help find the genetic cause of a clinical phenotype.
+  </help>
+
+</tool>

diff -r 000000000000 -r 9977d1935a07 hgvs_table_annotate
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hgvs_table_annotate Wed Mar 25 13:10:47 2015 -0600

[

b'@@ -0,0 +1,1029 @@\n+#!/usr/bin/env perl\n+\n+use Bio::DB::Sam;\n+use DBI;\n+use IPC::Open2;\n+use Set::IntervalTree;\n+use strict;\n+use warnings;\n+use vars qw($quiet %chr2variant_locs %polyphen_info %chr2polyphen_vcf_lines %chr2gerp_vcf_lines %range2genes $fasta_index $UNAFFECTED $tmpfile);\n+use File::Basename;\n+$UNAFFECTED = -299999; # sentinel for splice site predictions\n+my $current_directory = dirname(__FILE__);\n+if(@ARGV == 1 and $ARGV[0] eq "-v"){\n+ print "Version 1.0\\n";\n+ exit;\n+}\n+\n+#configuration file stuff\n+my %config;\n+my $tool_dir = shift @ARGV;\n+if(not -e "$tool_dir/annotate_hgvs.loc"){\n+ system("cp $current_directory/tool-data/annotate_hgvs.loc $tool_dir/annotate_hgvs.loc");\n+}\n+open CONFIG, \'<\', "$tool_dir/annotate_hgvs.loc";\n+while(<CONFIG>){\n+ next if $_ =~ /^#/;\n+ (my $key, my $value) = split(/\\s+/,$_);\n+ $config{$key} = $value;\n+}\n+close CONFIG;\n+my $dbs_dir = $config{"dbs_dir"};\n+\n+$quiet = 0;\n+if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){\n+ $quiet = 1;\n+ shift @ARGV;\n+}\n+\n+@ARGV == 9 or @ARGV == 10 or die "Usage: $0 -q <SIFT dir> <POLYPHEN2 file> <GERP dir> <tissuedb file> <pathway file> <interpro domains.bed> <omim morbidmap> <input hgvs table.txt> <output hgvs table.annotated.txt> [reference.fasta]\\n Where if the reference fasta is provided, we predict cryptic splicing effects (using MaxEntScan and GeneSplicer)\\n";\n+\n+my $sift_dir = $dbs_dir . shift @ARGV;\n+my $polyphen_file = $dbs_dir . shift @ARGV;\n+my $gerp_dir = $dbs_dir . shift @ARGV;\n+my $tissue_file = $dbs_dir . shift @ARGV;\n+my $pathway_file = $dbs_dir. shift @ARGV;\n+my $interpro_bed_file = $dbs_dir . shift @ARGV;\n+my $morbidmap_file = $dbs_dir . shift @ARGV;\n+my $hgvs_table_file = shift @ARGV;\n+my $outfile = shift @ARGV;\n+my $predict_cryptic_splicings = @ARGV ? ($dbs_dir . shift @ARGV) : undef;\n+my $flanking_bases = 500; # assume this is ROI around a gene\n+my @lines;\n+ \n+sub polyphen_gerp_info($$$){\n+ my($gerp_dir,$polyphen_file,$info_key) = @_;\n+\n+ my($chr,$pos,$ref,$variant) = split /:/, $info_key;\n+\n+ # is this the first call for this chromosome? If so, retrieve the VCF lines for it en masse\n+ if(not exists $chr2polyphen_vcf_lines{$chr}){\n+ ($chr2polyphen_vcf_lines{$chr}, $chr2gerp_vcf_lines{$chr}) = retrieve_vcf_lines($gerp_dir,$polyphen_file,$chr);\n+ }\n+\n+ my @results;\n+ if(not $ref or not $variant){ # can only get data for SNPs\n+ @results = qw(NA NA NA);\n+ }\n+ #print STDERR "ref = $ref and variant = $variant for $info_key\\n";\n+ elsif(length($ref) == 1 and length($variant) == 1){\n+ #print STDERR "Grabbing poly/gerp for $chr,$pos,$ref,$variant\\n";\n+ @results = polyphen_info($chr,$pos,$variant);\n+ $results[2] = gerp_info($chr,$pos);\n+ }\n+\n+ # It may be that a novel MNP variant is a bunch of known SNPs in a row. In this case, \n+ # report the list of variants\n+ elsif(length($ref) == length($variant) and $ref ne "NA"){\n+ my (@poly_scores, @poly_calls, @gerp_scores);\n+ for(my $i = 0; $i < length($ref); $i++){\n+ my $refbase = substr($ref, $i, 1);\n+ my $newbase = substr($variant, $i, 1);\n+ if($newbase eq $refbase){\n+ push @poly_scores, "-";\n+ push @poly_calls, "-";\n+ push @gerp_scores, "-";\n+ next;\n+ }\n+ my @base_results = polyphen_info($chr,$pos+$i,$refbase,$newbase);\n+ push @poly_scores, $base_results[0];\n+ push @poly_calls, $base_results[1];\n+ $base_results[2] = gerp_info($chr,$pos+$i);\n+ push @gerp_scores, $base_results[2];\n+ }\n+ $results[0] = join(",", @poly_scores); \n+ $results[1] = join(",", @poly_calls); \n+ $results[2] = join(",", @gerp_scores); \n+ }\n+ else{\n+ @results = qw(NA NA NA);\n+ }\n+\n+ return @results;\n+}\n+\n+sub get_variant_key($$$$){\n+ # params chr pos ref variant\n+ my $c = $_[0];\n+ $c =~ s/^chr//;\n+ my $prop_info_key = join(":", $c, @_[1..3]);\n+ $'..b'rsID = $fields[$dbid_column];\n+ my $popFreq = $fields[$maf_column];\n+ my $internalFreq;\n+ $internalFreq = $fields[$internal_freq_column] if defined $internal_freq_column;\n+ my $popFreqSrc = $fields[$maf_src_column];\n+\n+ # Get the gene name and omim info\n+ my %seen;\n+ my @gene_names = split /; /, $fields[$genename_column]; \n+ my $morbid = join("; ", grep {/\\S/ and not $seen{$_}++} map({$gene2morbids{$_} or ""} @gene_names));\n+ my $tissue = join("; ", grep {/\\S/ and not $seen{$_}++} map({$tissues{$_} or ""} @gene_names));\n+ my $function = join("; ", grep {/\\S/ and not $seen{$_}++} map({$tissue_desc{$_} or ""} @gene_names));\n+ my $pathways = join("; ", grep {/\\S/ and not $seen{$_}++} map({$pathways{$_} or ""} @gene_names));\n+ my $pathway_ids = join("; ", grep {/\\S/ and not $seen{$_}++} map({$pathway_ids{$_} or ""} @gene_names));\n+\n+ # It may be that the calls and freqs are multiple concatenated values, e.g. "X; Y"\n+ my @tot_bases = split /; /,$fields[$tot_cnt_column];\n+ my @var_bases = split /; /,$fields[$alt_cnt_column];\n+ my (@pct_nonvar, @pct_var);\n+ for (my $i = 0; $i <= $#tot_bases; $i++){\n+ push @pct_nonvar, $tot_bases[$i] =~ /^(NA|0)$/ ? $tot_bases[$i] : int(($tot_bases[$i]-$var_bases[$i])/$tot_bases[$i]*100+0.5);\n+ push @pct_var, $tot_bases[$i] =~ /^(NA|0)$/ ? $tot_bases[$i] : int($var_bases[$i]/$tot_bases[$i]*100+0.5);\n+ }\n+\n+ push @lines, [\n+\t\t $fields[$chr_column], # chr\n+\t\t $fields[$pos_column], # pos\n+\t\t $fields[$to_column], \n+\t\t $fields[$ref_column],\n+\t\t $fields[$alt_column],\n+\t\t join("; ", @pct_nonvar), # pct ref\n+\t\t join("; ", @pct_var), # pct var\n+\t\t $fields[$alt_cnt_column], # num var\n+\t\t $fields[$tot_cnt_column], # num reads\n+\t\t $transcript_type, # protein_coding, pseudogene, etc.\n+\t\t $type,\n+\t\t $location,\t\t \n+\t\t $rsID,\n+\t\t $popFreqSrc,\n+\t\t (defined $internal_freq_column ? ($popFreq,$internalFreq) : ($popFreq)),\n+\t\t $sift_score,\n+\t\t $variant_key, # 16th field\n+\t\t join("; ", @gene_names),\n+\t\t $transcript_hgvs,\n+\t\t $protein_hgvs,\n+\t\t $zygosity,\n+ $fields[$splice_dist_column], # distance to exon edge\n+ $splicing_potential,\n+ $splicing_details,\n+ $morbid,\n+\t\t $tissue,\n+\t\t $pathways,\n+\t\t $pathway_ids,\n+\t\t $function,\n+ $domains,\n+ $fields[$pvalue_column],\n+ $fields[$caveats_column], # caveats\n+ (defined $rares_column ? ($fields[$rares_column],$transcript_id) : ($transcript_id)),\n+\t\t $transcript_length,\n+ $transcript_strand,\n+\t\t (defined $srcs_column ? ($fields[$phase_column],$fields[$srcs_column]) : ($fields[$phase_column]))];\n+}\n+\n+print STDERR "Adding Polyphen2 and GERP info...\\n" unless $quiet;\n+# retrieve the polyphen and gerp info en masse for each chromosome, as this is much more efficient\n+my $vkey_col = 16+(defined $internal_freq_column ? 1 : 0);\n+for my $line_fields (@lines){\n+ # expand the key into the relevant MAF values\n+ $line_fields->[$vkey_col] = join("\\t", polyphen_gerp_info($gerp_dir,$polyphen_file,$line_fields->[$vkey_col])); # fields[$vkey_col] is variant key\n+ print OUT join("\\t", @{$line_fields}), "\\n";\n+}\n+close(OUT); \n+unlink $tmpfile if defined $tmpfile;\n+\n+sub load_header_columns{\n+ my ($reqs_hash_ref, $headers_array_ref) = @_;\n+ my %unfulfilled;\n+ for my $field_name (keys %$reqs_hash_ref){\n+ $unfulfilled{$field_name} = 1;\n+ }\n+ for(my $i = 0; $i <= $#{$headers_array_ref}; $i++){\n+ for my $req_header_name (keys %$reqs_hash_ref){\n+ if($req_header_name eq $headers_array_ref->[$i]){\n+ ${$reqs_hash_ref->{$req_header_name}} = $i;\n+ delete $unfulfilled{$req_header_name};\n+ last;\n+ }\n+ }\n+ }\n+ if(keys %unfulfilled){\n+ die "Aborting. Could not find headers in the input file for the following required fields: ", join(", ", sort keys %unfulfilled), "\\n";\n+ }\n+}\n'

diff -r 000000000000 -r 9977d1935a07 tool-data/annotate_hgvs.loc
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/annotate_hgvs.loc Wed Mar 25 13:10:47 2015 -0600

@@ -0,0 +1,1 @@
+dbs_dir /export/achri_galaxy/dbs