Mercurial > repos > yusuf > gvf_to_vcf
comparison dibayes_gff2vcf @ 0:18d965813efc default tip
intial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:37:21 -0600 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:18d965813efc |
---|---|
1 #!/usr/bin/env perl | |
2 | |
3 use Statistics::Zed; | |
4 use strict; | |
5 use warnings; | |
6 | |
7 if($ARGV[0] eq "-v"){ | |
8 print "Version 1.0\n"; | |
9 exit; | |
10 } | |
11 | |
12 @ARGV == 2 or die "Usage: $0 <diBayes_snp_calls.gff3> <output.vcf>\n"; | |
13 | |
14 my $zed = new Statistics::Zed; | |
15 my $log_10_inv = 1/log(10); | |
16 open(GFF3IN, $ARGV[0]) | |
17 or die "Cannot open input GFF file $ARGV[0] for reading: $!\n"; | |
18 open(VCFOUT, ">$ARGV[1]") | |
19 or die "Cannot open output VCF file $ARGV[1] for writing: $!\n"; | |
20 | |
21 my @date = localtime(); | |
22 my $date = ($date[5]+1900).($date[4] < 10 ? "0" : "").$date[4].($date[3] < 10 ? "0" : "").$date[3]; | |
23 | |
24 my $ref_file; | |
25 my $source; | |
26 while(<GFF3IN>){ | |
27 last if /^#[^#]/; # end of file headers marked by lack of double hash | |
28 if(/source-version\s+(.*)$/){ | |
29 $source = $1; | |
30 } | |
31 elsif(/genome-build\s+(.*)$/){ | |
32 $ref_file = $1; | |
33 } | |
34 } | |
35 | |
36 print VCFOUT <<END; | |
37 ##fileformat=VCFv4.0 | |
38 ##fileDate=$date | |
39 ##source=$source | |
40 ##reference=$ref_file | |
41 ##INFO=<ID=Z,Number=1,Type=Float,Description="SNP z-score"> | |
42 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | |
43 ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> | |
44 ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth"> | |
45 ##FORMAT=<ID=RR,Number=1,Type=Integer,Description="Reference Read Depth"> | |
46 ##FORMAT=<ID=VR,Number=1,Type=Integer,Description="Major Variant Read Depth"> | |
47 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Unknown | |
48 END | |
49 | |
50 while(<GFF3IN>){ | |
51 next if /^#/; | |
52 chomp; | |
53 my @F = split /\t/, $_; | |
54 my $seq = $F[0]; | |
55 my $pos = $F[3]; | |
56 my $z_score = sprintf "%.4f", ($F[5] < 0.0000000001 ? "200" : $zed->p2z(value => $F[5])); | |
57 my $qual = $F[5] == 0 ? 100 : int(-10*log($F[5])*$log_10_inv); | |
58 my @info = split /;/, $F[8]; | |
59 my $depth_reads = "."; | |
60 my $ref_base; | |
61 my @alleles; | |
62 my $ref_reads; | |
63 my $var_reads; | |
64 for my $tag (@info){ | |
65 my ($key, $value) = split /=/, $tag; | |
66 if($key eq "coverage"){ | |
67 $depth_reads = $value; | |
68 } | |
69 elsif($key eq "reference"){ | |
70 $ref_base = $value; | |
71 } | |
72 elsif($key eq "allele-call"){ | |
73 push @alleles, split /\//, $value; | |
74 } | |
75 elsif($key eq "refAlleleCounts"){ | |
76 $ref_reads = $value; | |
77 } | |
78 elsif($key eq "novelAlleleCounts"){ | |
79 $var_reads = $value; # note: when het for two non-ref alleles, this is a wierd value to be ignored | |
80 } | |
81 } | |
82 | |
83 my $genotype = "0/1"; # default het for one non-ref allele | |
84 if(@alleles == 1){ # homo for one var allele | |
85 $genotype = "1/1"; | |
86 } | |
87 elsif($ref_base eq $alleles[0]){ | |
88 # het for one non-reference | |
89 shift @alleles; | |
90 } | |
91 elsif($ref_base eq $alleles[1]){ | |
92 # het for one non-reference | |
93 pop @alleles; | |
94 } | |
95 else{ | |
96 # het for two non-ref alleles | |
97 $genotype = "1/2"; | |
98 $var_reads = int(($depth_reads-$ref_reads)/2); #diBayes needs guesstimate since GFF doesn't encode this properly | |
99 $var_reads = "$var_reads,$var_reads"; | |
100 } | |
101 | |
102 # Required VCF format is CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE_NAME... | |
103 print VCFOUT join("\t", $seq, $pos, ".", $ref_base, join(",", @alleles), $qual, ".", "Z=$z_score", "GT:VR:RR:DP:GQ", | |
104 "$genotype:$var_reads:$ref_reads:$depth_reads:."), "\n"; | |
105 } | |
106 close(VCFOUT); | |
107 close(GFF3IN); |