changeset 0:138d81f259c8 default tip

intial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:38:09 -0600
parents
children
files HGVS2VCF.xml hgvs_to_vcf
diffstat 2 files changed, 134 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/HGVS2VCF.xml	Wed Mar 25 13:38:09 2015 -0600
@@ -0,0 +1,19 @@
+<?xml version="1.0"?>
+
+<tool id="hgvs2vcf_1" name="Convert a variant table with HGVS syntax into a VCF">
+  <version_string>echo 1.0.0</version_string>
+  <command interpreter="perl">hgvs_to_vcf $in_hgvs_table $output_vcf $sample_name hg19
+  </command>
+  <inputs>
+    <param name="in_hgvs_table" format="achri_snp_table" type="data" label="SNP table with HGVS calls"/>
+    <param type="text" name="sample_name" value="ANON" label="Name for the sample in the output VCF file's header"/>
+  </inputs>
+  <outputs>
+    <data format="vcf" name="output_vcf" type="data" label="Sequence variant calls"/>
+  </outputs>
+
+  <tests>
+  </tests>
+
+  <help>This tools converts the tabular format used to describe variants and associate annotation at ACHRI into a standard Variant Calls Format (VCF) version 4.1 file, which is accepted by many genetic analysis programs.</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hgvs_to_vcf	Wed Mar 25 13:38:09 2015 -0600
@@ -0,0 +1,115 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+# Converts a HGVS tab delimited table into a VCF file for processing by tools that need that format.
+
+@ARGV == 4 or die "Usage: $0 <input hgvs.txt> <output.vcf> <sample id> <reference genome>\n";
+
+open(HGVS, $ARGV[0])
+  or die "Cannot open $ARGV[0] for reading: $!\n";
+my ($chr_column, $pos_column, $ref_column, $alt_column, $alt_cnt_column, $tot_cnt_column, $dbid_column, $cdna_hgvs_column,
+    $aa_hgvs_column, $transcript_column, $zygosity_column, $caveats_column, $phase_column, $pvalue_column, $genename_column);
+my $headers = <HGVS>;
+chomp $headers;
+my @headers = split /\t/, $headers;
+my %req_columns = (
+        "Chr" => \$chr_column,
+        "DNA From" => \$pos_column,
+        "Gene Name" => \$genename_column,
+        "Ref base" => \$ref_column,
+        "Obs base" => \$alt_column,
+        "Variant Reads" => \$alt_cnt_column,
+        "Total Reads" => \$tot_cnt_column,
+        "Variant DB ID" => \$dbid_column,
+        "Transcript HGVS" => \$cdna_hgvs_column,
+        "Protein HGVS" => \$aa_hgvs_column,
+        "Selected transcript" => \$transcript_column,
+        "Zygosity" => \$zygosity_column,
+        "Caveats" => \$caveats_column,
+        "Phase" => \$phase_column,
+        "P-value" => \$pvalue_column);
+&load_header_columns(\%req_columns, \@headers);
+my $phasing = "none";
+while(<HGVS>){
+  my @F = split /\t/, $_;
+  if($F[$phase_column] ne ""){
+    $phasing = "partial";
+    last;
+  }
+}
+close(HGVS);
+open(HGVS, $ARGV[0])
+  or die "Cannot open $ARGV[0] for reading: $!\n";
+
+open(VCFOUT, ">$ARGV[1]")
+  or die "Cannot open $ARGV[1] for writing: $!\n";
+my ($sec, $min, $hr, $day, $month, $year) = localtime();
+$year += 1900;
+$month = "0$month" if $month < 10;
+$day = "0$day" if $day < 10;
+$, = " ";
+print VCFOUT <<END;
+##fileformat=VCFv4.1
+##fileDate=$year$month$day
+##source=hgvs_to_vcf
+##reference=$ARGV[3]
+##phasing=$phasing
+##commandline="@ARGV"
+##FILTER=<ID=CAVEATS,Description="The variant call is possibly a false positive due to non-unique mapping, homopolymer stretch, etc.">
+##INFO=<ID=GENE,Number=A,Type=String,Description="List of genes that have transcripts overlapping the variant region">
+##INFO=<ID=HGVS,Number=A,Type=String,Description="Effect in HGVS syntax (protein version if available, otherwise cDNA or genomic)">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype Quality, the Phred-scaled marginal (or unconditional) probability of the called genotype">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=RA,Number=1,Type=Integer,Description="Reference allele observation count">
+##FORMAT=<ID=AA,Number=1,Type=Integer,Description="Alternate allele observation count">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	$ARGV[2]
+END
+my $log10 = log(10);
+<HGVS>; # get rid of header line
+while(<HGVS>){
+  chomp;
+  my @F = split /\t/, $_;
+  my $filter = $F[$caveats_column] ne "" ? "CAVEATS" : "PASS";
+  my $ref = $F[$ref_column]; # always on + strand
+  my $alt = $F[$alt_column];
+  my $qual = $F[$pvalue_column]; # to be coverted to MAQ
+  if($qual == 0){
+    $qual = 100;
+  }
+  else{
+    $qual = -10*log($qual)/$log10; # log10 because log is actually ln in Perl
+  }
+  my $genotype = $F[$zygosity_column] =~ /homo/ ? "1/1" : "0/1"; # todo: handle compound het SNP
+  my $transcript = $F[$transcript_column];
+  $transcript =~ s/;.*//;
+  my $genenames = $F[$genename_column];
+  $genenames =~ s/;\s*/,/;
+  
+  print VCFOUT join("\t", $F[$chr_column], $F[$pos_column], ($F[$dbid_column] =~ /rs/ ? $F[$dbid_column] : "."), $ref, $alt, $qual, $filter, 
+                          ($F[$genename_column] ne "" ? "GENE:".$F[$genename_column].";" : "")."HGVS:$transcript,".($F[$aa_hgvs_column] ne "NA" ? $F[$aa_hgvs_column]:$F[$cdna_hgvs_column]), 
+                          "GT:GQ:RA:AA", "$genotype:$qual:".($F[$tot_cnt_column]-$F[$alt_cnt_column]).":".$F[$alt_cnt_column]),"\n";
+}
+
+sub load_header_columns{
+  my ($reqs_hash_ref, $headers_array_ref) = @_;
+  my %unfulfilled;
+  for my $field_name (keys %$reqs_hash_ref){
+    $unfulfilled{$field_name} = 1;
+  }
+  for(my $i = 0; $i <= $#{$headers_array_ref}; $i++){
+    for my $req_header_name (keys %$reqs_hash_ref){
+      if($req_header_name eq $headers_array_ref->[$i]){
+        ${$reqs_hash_ref->{$req_header_name}} = $i;
+        delete $unfulfilled{$req_header_name};
+        last;
+      }
+    }
+  }
+  if(keys %unfulfilled){
+    die "Aborting. Could not find headers in the input file for the following required fields: ", join(", ", sort keys %unfulfilled), "\n";
+  }
+}
+