view dibayes_gff2vcf @ 0:18d965813efc default tip

intial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:37:21 -0600
parents
children
line wrap: on
line source

#!/usr/bin/env perl

use Statistics::Zed;
use strict;
use warnings;

if($ARGV[0] eq "-v"){
  print "Version 1.0\n";
  exit;
}

@ARGV == 2 or die "Usage: $0 <diBayes_snp_calls.gff3> <output.vcf>\n";

my $zed = new Statistics::Zed;
my $log_10_inv = 1/log(10);
open(GFF3IN, $ARGV[0]) 
  or die "Cannot open input GFF file $ARGV[0] for reading: $!\n";
open(VCFOUT, ">$ARGV[1]")
  or die "Cannot open output VCF file $ARGV[1] for writing: $!\n";

my @date = localtime();
my $date = ($date[5]+1900).($date[4] < 10 ? "0" : "").$date[4].($date[3] < 10 ? "0" : "").$date[3];

my $ref_file;
my $source;
while(<GFF3IN>){
  last if /^#[^#]/; # end of file headers marked by lack of double hash
  if(/source-version\s+(.*)$/){
    $source = $1;
  }
  elsif(/genome-build\s+(.*)$/){
    $ref_file = $1;
  } 
}

print VCFOUT <<END;
##fileformat=VCFv4.0
##fileDate=$date
##source=$source
##reference=$ref_file
##INFO=<ID=Z,Number=1,Type=Float,Description="SNP z-score">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
##FORMAT=<ID=RR,Number=1,Type=Integer,Description="Reference Read Depth">
##FORMAT=<ID=VR,Number=1,Type=Integer,Description="Major Variant Read Depth">
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	Unknown
END

while(<GFF3IN>){
  next if /^#/;
  chomp;
  my @F = split /\t/, $_;
  my $seq = $F[0];
  my $pos = $F[3];
  my $z_score = sprintf "%.4f", ($F[5] < 0.0000000001 ? "200" : $zed->p2z(value => $F[5]));
  my $qual = $F[5] == 0 ? 100 : int(-10*log($F[5])*$log_10_inv);
  my @info = split /;/, $F[8];
  my $depth_reads = ".";
  my $ref_base;
  my @alleles;
  my $ref_reads;
  my $var_reads;
  for my $tag (@info){
    my ($key, $value) = split /=/, $tag;
    if($key eq "coverage"){
      $depth_reads = $value;
    }
    elsif($key eq "reference"){
      $ref_base = $value;
    }
    elsif($key eq "allele-call"){
      push @alleles, split /\//, $value;
    }
    elsif($key eq "refAlleleCounts"){
      $ref_reads = $value;
    }
    elsif($key eq "novelAlleleCounts"){
      $var_reads = $value; # note: when het for two non-ref alleles,  this is a wierd value to be ignored
    }
  }
  
  my $genotype = "0/1"; # default het for one non-ref allele
  if(@alleles == 1){ # homo for one var allele
    $genotype = "1/1";
  }
  elsif($ref_base eq $alleles[0]){
    # het for one non-reference
    shift @alleles;
  }
  elsif($ref_base eq $alleles[1]){
    # het for one non-reference
    pop @alleles;
  }
  else{
    # het for two non-ref alleles
    $genotype = "1/2";
    $var_reads = int(($depth_reads-$ref_reads)/2);  #diBayes needs guesstimate since GFF doesn't encode this properly
    $var_reads = "$var_reads,$var_reads";
  }

  # Required VCF format is CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	SAMPLE_NAME...
  print VCFOUT join("\t", $seq, $pos, ".", $ref_base, join(",", @alleles), $qual, ".", "Z=$z_score", "GT:VR:RR:DP:GQ",
                          "$genotype:$var_reads:$ref_reads:$depth_reads:."), "\n";
}
close(VCFOUT);
close(GFF3IN);