changeset 0:18d965813efc default tip

intial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:37:21 -0600
parents
children
files GVF2VCF.xml dibayes_gff2vcf
diffstat 2 files changed, 137 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/GVF2VCF.xml	Wed Mar 25 13:37:21 2015 -0600
@@ -0,0 +1,30 @@
+<?xml version="1.0"?>
+
+<tool id="gvf_to_vcf_1" name="Convert diBayes GVF file to VCF format">
+  <version_string>dibayes_gff2vcf -v</version_string>
+  <command interpreter="perl">dibayes_gff2vcf $input_gff $output_vcf</command>
+  <inputs>
+    <param format="gff3" name="input_gff" type="data" label="GVF file generated by diBayes (may work for other, but untested)"/>
+  </inputs>
+  <outputs>
+    <data format="vcf" name="output_vcf" label="diBayes calls in VCF format"/>
+  </outputs>
+
+  <tests>
+    <test>
+     <param name="input_gff" value="depth_test.bam" ftype="gff"/>
+     <output name="output_vcf">
+       <assert_contents>
+         <has_text text="targeted nucleotide bases: 155091"/>
+         <has_text text="bases mapped to targeted regions: 11473773"/>
+         <has_text text="bases with less than 20-fold coverage: 19046"/>
+       </assert_contents>
+     </output>
+    </test>
+  </tests>
+
+  <help>
+This tool an ABI colorspace sequencing run's variant calls (using LifeScope's diBayes) into a format amenable to many variant processing tools. 
+  </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dibayes_gff2vcf	Wed Mar 25 13:37:21 2015 -0600
@@ -0,0 +1,107 @@
+#!/usr/bin/env perl
+
+use Statistics::Zed;
+use strict;
+use warnings;
+
+if($ARGV[0] eq "-v"){
+  print "Version 1.0\n";
+  exit;
+}
+
+@ARGV == 2 or die "Usage: $0 <diBayes_snp_calls.gff3> <output.vcf>\n";
+
+my $zed = new Statistics::Zed;
+my $log_10_inv = 1/log(10);
+open(GFF3IN, $ARGV[0]) 
+  or die "Cannot open input GFF file $ARGV[0] for reading: $!\n";
+open(VCFOUT, ">$ARGV[1]")
+  or die "Cannot open output VCF file $ARGV[1] for writing: $!\n";
+
+my @date = localtime();
+my $date = ($date[5]+1900).($date[4] < 10 ? "0" : "").$date[4].($date[3] < 10 ? "0" : "").$date[3];
+
+my $ref_file;
+my $source;
+while(<GFF3IN>){
+  last if /^#[^#]/; # end of file headers marked by lack of double hash
+  if(/source-version\s+(.*)$/){
+    $source = $1;
+  }
+  elsif(/genome-build\s+(.*)$/){
+    $ref_file = $1;
+  } 
+}
+
+print VCFOUT <<END;
+##fileformat=VCFv4.0
+##fileDate=$date
+##source=$source
+##reference=$ref_file
+##INFO=<ID=Z,Number=1,Type=Float,Description="SNP z-score">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=RR,Number=1,Type=Integer,Description="Reference Read Depth">
+##FORMAT=<ID=VR,Number=1,Type=Integer,Description="Major Variant Read Depth">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	Unknown
+END
+
+while(<GFF3IN>){
+  next if /^#/;
+  chomp;
+  my @F = split /\t/, $_;
+  my $seq = $F[0];
+  my $pos = $F[3];
+  my $z_score = sprintf "%.4f", ($F[5] < 0.0000000001 ? "200" : $zed->p2z(value => $F[5]));
+  my $qual = $F[5] == 0 ? 100 : int(-10*log($F[5])*$log_10_inv);
+  my @info = split /;/, $F[8];
+  my $depth_reads = ".";
+  my $ref_base;
+  my @alleles;
+  my $ref_reads;
+  my $var_reads;
+  for my $tag (@info){
+    my ($key, $value) = split /=/, $tag;
+    if($key eq "coverage"){
+      $depth_reads = $value;
+    }
+    elsif($key eq "reference"){
+      $ref_base = $value;
+    }
+    elsif($key eq "allele-call"){
+      push @alleles, split /\//, $value;
+    }
+    elsif($key eq "refAlleleCounts"){
+      $ref_reads = $value;
+    }
+    elsif($key eq "novelAlleleCounts"){
+      $var_reads = $value; # note: when het for two non-ref alleles,  this is a wierd value to be ignored
+    }
+  }
+  
+  my $genotype = "0/1"; # default het for one non-ref allele
+  if(@alleles == 1){ # homo for one var allele
+    $genotype = "1/1";
+  }
+  elsif($ref_base eq $alleles[0]){
+    # het for one non-reference
+    shift @alleles;
+  }
+  elsif($ref_base eq $alleles[1]){
+    # het for one non-reference
+    pop @alleles;
+  }
+  else{
+    # het for two non-ref alleles
+    $genotype = "1/2";
+    $var_reads = int(($depth_reads-$ref_reads)/2);  #diBayes needs guesstimate since GFF doesn't encode this properly
+    $var_reads = "$var_reads,$var_reads";
+  }
+
+  # Required VCF format is CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	SAMPLE_NAME...
+  print VCFOUT join("\t", $seq, $pos, ".", $ref_base, join(",", @alleles), $qual, ".", "Z=$z_score", "GT:VR:RR:DP:GQ",
+                          "$genotype:$var_reads:$ref_reads:$depth_reads:."), "\n";
+}
+close(VCFOUT);
+close(GFF3IN);