# HG changeset patch # User Yusuf Ali # Date 1427312241 21600 # Node ID 18d965813efc6a2c3ff3d26befc5fe0453027ab4 intial commit diff -r 000000000000 -r 18d965813efc GVF2VCF.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/GVF2VCF.xml Wed Mar 25 13:37:21 2015 -0600 @@ -0,0 +1,30 @@ + + + + dibayes_gff2vcf -v + dibayes_gff2vcf $input_gff $output_vcf + + + + + + + + + + + + + + + + + + + + + +This tool an ABI colorspace sequencing run's variant calls (using LifeScope's diBayes) into a format amenable to many variant processing tools. + + + diff -r 000000000000 -r 18d965813efc dibayes_gff2vcf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/dibayes_gff2vcf Wed Mar 25 13:37:21 2015 -0600 @@ -0,0 +1,107 @@ +#!/usr/bin/env perl + +use Statistics::Zed; +use strict; +use warnings; + +if($ARGV[0] eq "-v"){ + print "Version 1.0\n"; + exit; +} + +@ARGV == 2 or die "Usage: $0 \n"; + +my $zed = new Statistics::Zed; +my $log_10_inv = 1/log(10); +open(GFF3IN, $ARGV[0]) + or die "Cannot open input GFF file $ARGV[0] for reading: $!\n"; +open(VCFOUT, ">$ARGV[1]") + or die "Cannot open output VCF file $ARGV[1] for writing: $!\n"; + +my @date = localtime(); +my $date = ($date[5]+1900).($date[4] < 10 ? "0" : "").$date[4].($date[3] < 10 ? "0" : "").$date[3]; + +my $ref_file; +my $source; +while(){ + last if /^#[^#]/; # end of file headers marked by lack of double hash + if(/source-version\s+(.*)$/){ + $source = $1; + } + elsif(/genome-build\s+(.*)$/){ + $ref_file = $1; + } +} + +print VCFOUT < +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Unknown +END + +while(){ + next if /^#/; + chomp; + my @F = split /\t/, $_; + my $seq = $F[0]; + my $pos = $F[3]; + my $z_score = sprintf "%.4f", ($F[5] < 0.0000000001 ? "200" : $zed->p2z(value => $F[5])); + my $qual = $F[5] == 0 ? 100 : int(-10*log($F[5])*$log_10_inv); + my @info = split /;/, $F[8]; + my $depth_reads = "."; + my $ref_base; + my @alleles; + my $ref_reads; + my $var_reads; + for my $tag (@info){ + my ($key, $value) = split /=/, $tag; + if($key eq "coverage"){ + $depth_reads = $value; + } + elsif($key eq "reference"){ + $ref_base = $value; + } + elsif($key eq "allele-call"){ + push @alleles, split /\//, $value; + } + elsif($key eq "refAlleleCounts"){ + $ref_reads = $value; + } + elsif($key eq "novelAlleleCounts"){ + $var_reads = $value; # note: when het for two non-ref alleles, this is a wierd value to be ignored + } + } + + my $genotype = "0/1"; # default het for one non-ref allele + if(@alleles == 1){ # homo for one var allele + $genotype = "1/1"; + } + elsif($ref_base eq $alleles[0]){ + # het for one non-reference + shift @alleles; + } + elsif($ref_base eq $alleles[1]){ + # het for one non-reference + pop @alleles; + } + else{ + # het for two non-ref alleles + $genotype = "1/2"; + $var_reads = int(($depth_reads-$ref_reads)/2); #diBayes needs guesstimate since GFF doesn't encode this properly + $var_reads = "$var_reads,$var_reads"; + } + + # Required VCF format is CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE_NAME... + print VCFOUT join("\t", $seq, $pos, ".", $ref_base, join(",", @alleles), $qual, ".", "Z=$z_score", "GT:VR:RR:DP:GQ", + "$genotype:$var_reads:$ref_reads:$depth_reads:."), "\n"; +} +close(VCFOUT); +close(GFF3IN);