Mercurial > repos > yusuf > gvf_to_vcf
view dibayes_gff2vcf @ 0:18d965813efc default tip
intial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:37:21 -0600 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env perl use Statistics::Zed; use strict; use warnings; if($ARGV[0] eq "-v"){ print "Version 1.0\n"; exit; } @ARGV == 2 or die "Usage: $0 <diBayes_snp_calls.gff3> <output.vcf>\n"; my $zed = new Statistics::Zed; my $log_10_inv = 1/log(10); open(GFF3IN, $ARGV[0]) or die "Cannot open input GFF file $ARGV[0] for reading: $!\n"; open(VCFOUT, ">$ARGV[1]") or die "Cannot open output VCF file $ARGV[1] for writing: $!\n"; my @date = localtime(); my $date = ($date[5]+1900).($date[4] < 10 ? "0" : "").$date[4].($date[3] < 10 ? "0" : "").$date[3]; my $ref_file; my $source; while(<GFF3IN>){ last if /^#[^#]/; # end of file headers marked by lack of double hash if(/source-version\s+(.*)$/){ $source = $1; } elsif(/genome-build\s+(.*)$/){ $ref_file = $1; } } print VCFOUT <<END; ##fileformat=VCFv4.0 ##fileDate=$date ##source=$source ##reference=$ref_file ##INFO=<ID=Z,Number=1,Type=Float,Description="SNP z-score"> ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth"> ##FORMAT=<ID=RR,Number=1,Type=Integer,Description="Reference Read Depth"> ##FORMAT=<ID=VR,Number=1,Type=Integer,Description="Major Variant Read Depth"> #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Unknown END while(<GFF3IN>){ next if /^#/; chomp; my @F = split /\t/, $_; my $seq = $F[0]; my $pos = $F[3]; my $z_score = sprintf "%.4f", ($F[5] < 0.0000000001 ? "200" : $zed->p2z(value => $F[5])); my $qual = $F[5] == 0 ? 100 : int(-10*log($F[5])*$log_10_inv); my @info = split /;/, $F[8]; my $depth_reads = "."; my $ref_base; my @alleles; my $ref_reads; my $var_reads; for my $tag (@info){ my ($key, $value) = split /=/, $tag; if($key eq "coverage"){ $depth_reads = $value; } elsif($key eq "reference"){ $ref_base = $value; } elsif($key eq "allele-call"){ push @alleles, split /\//, $value; } elsif($key eq "refAlleleCounts"){ $ref_reads = $value; } elsif($key eq "novelAlleleCounts"){ $var_reads = $value; # note: when het for two non-ref alleles, this is a wierd value to be ignored } } my $genotype = "0/1"; # default het for one non-ref allele if(@alleles == 1){ # homo for one var allele $genotype = "1/1"; } elsif($ref_base eq $alleles[0]){ # het for one non-reference shift @alleles; } elsif($ref_base eq $alleles[1]){ # het for one non-reference pop @alleles; } else{ # het for two non-ref alleles $genotype = "1/2"; $var_reads = int(($depth_reads-$ref_reads)/2); #diBayes needs guesstimate since GFF doesn't encode this properly $var_reads = "$var_reads,$var_reads"; } # Required VCF format is CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE_NAME... print VCFOUT join("\t", $seq, $pos, ".", $ref_base, join(",", @alleles), $qual, ".", "Z=$z_score", "GT:VR:RR:DP:GQ", "$genotype:$var_reads:$ref_reads:$depth_reads:."), "\n"; } close(VCFOUT); close(GFF3IN);