Mercurial > repos > yusuf > gvf_to_vcf
changeset 0:18d965813efc default tip
intial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:37:21 -0600 |
parents | |
children | |
files | GVF2VCF.xml dibayes_gff2vcf |
diffstat | 2 files changed, 137 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/GVF2VCF.xml Wed Mar 25 13:37:21 2015 -0600 @@ -0,0 +1,30 @@ +<?xml version="1.0"?> + +<tool id="gvf_to_vcf_1" name="Convert diBayes GVF file to VCF format"> + <version_string>dibayes_gff2vcf -v</version_string> + <command interpreter="perl">dibayes_gff2vcf $input_gff $output_vcf</command> + <inputs> + <param format="gff3" name="input_gff" type="data" label="GVF file generated by diBayes (may work for other, but untested)"/> + </inputs> + <outputs> + <data format="vcf" name="output_vcf" label="diBayes calls in VCF format"/> + </outputs> + + <tests> + <test> + <param name="input_gff" value="depth_test.bam" ftype="gff"/> + <output name="output_vcf"> + <assert_contents> + <has_text text="targeted nucleotide bases: 155091"/> + <has_text text="bases mapped to targeted regions: 11473773"/> + <has_text text="bases with less than 20-fold coverage: 19046"/> + </assert_contents> + </output> + </test> + </tests> + + <help> +This tool an ABI colorspace sequencing run's variant calls (using LifeScope's diBayes) into a format amenable to many variant processing tools. + </help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dibayes_gff2vcf Wed Mar 25 13:37:21 2015 -0600 @@ -0,0 +1,107 @@ +#!/usr/bin/env perl + +use Statistics::Zed; +use strict; +use warnings; + +if($ARGV[0] eq "-v"){ + print "Version 1.0\n"; + exit; +} + +@ARGV == 2 or die "Usage: $0 <diBayes_snp_calls.gff3> <output.vcf>\n"; + +my $zed = new Statistics::Zed; +my $log_10_inv = 1/log(10); +open(GFF3IN, $ARGV[0]) + or die "Cannot open input GFF file $ARGV[0] for reading: $!\n"; +open(VCFOUT, ">$ARGV[1]") + or die "Cannot open output VCF file $ARGV[1] for writing: $!\n"; + +my @date = localtime(); +my $date = ($date[5]+1900).($date[4] < 10 ? "0" : "").$date[4].($date[3] < 10 ? "0" : "").$date[3]; + +my $ref_file; +my $source; +while(<GFF3IN>){ + last if /^#[^#]/; # end of file headers marked by lack of double hash + if(/source-version\s+(.*)$/){ + $source = $1; + } + elsif(/genome-build\s+(.*)$/){ + $ref_file = $1; + } +} + +print VCFOUT <<END; +##fileformat=VCFv4.0 +##fileDate=$date +##source=$source +##reference=$ref_file +##INFO=<ID=Z,Number=1,Type=Float,Description="SNP z-score"> +##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> +##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> +##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth"> +##FORMAT=<ID=RR,Number=1,Type=Integer,Description="Reference Read Depth"> +##FORMAT=<ID=VR,Number=1,Type=Integer,Description="Major Variant Read Depth"> +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Unknown +END + +while(<GFF3IN>){ + next if /^#/; + chomp; + my @F = split /\t/, $_; + my $seq = $F[0]; + my $pos = $F[3]; + my $z_score = sprintf "%.4f", ($F[5] < 0.0000000001 ? "200" : $zed->p2z(value => $F[5])); + my $qual = $F[5] == 0 ? 100 : int(-10*log($F[5])*$log_10_inv); + my @info = split /;/, $F[8]; + my $depth_reads = "."; + my $ref_base; + my @alleles; + my $ref_reads; + my $var_reads; + for my $tag (@info){ + my ($key, $value) = split /=/, $tag; + if($key eq "coverage"){ + $depth_reads = $value; + } + elsif($key eq "reference"){ + $ref_base = $value; + } + elsif($key eq "allele-call"){ + push @alleles, split /\//, $value; + } + elsif($key eq "refAlleleCounts"){ + $ref_reads = $value; + } + elsif($key eq "novelAlleleCounts"){ + $var_reads = $value; # note: when het for two non-ref alleles, this is a wierd value to be ignored + } + } + + my $genotype = "0/1"; # default het for one non-ref allele + if(@alleles == 1){ # homo for one var allele + $genotype = "1/1"; + } + elsif($ref_base eq $alleles[0]){ + # het for one non-reference + shift @alleles; + } + elsif($ref_base eq $alleles[1]){ + # het for one non-reference + pop @alleles; + } + else{ + # het for two non-ref alleles + $genotype = "1/2"; + $var_reads = int(($depth_reads-$ref_reads)/2); #diBayes needs guesstimate since GFF doesn't encode this properly + $var_reads = "$var_reads,$var_reads"; + } + + # Required VCF format is CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE_NAME... + print VCFOUT join("\t", $seq, $pos, ".", $ref_base, join(",", @alleles), $qual, ".", "Z=$z_score", "GT:VR:RR:DP:GQ", + "$genotype:$var_reads:$ref_reads:$depth_reads:."), "\n"; +} +close(VCFOUT); +close(GFF3IN);