0
|
1 #!/usr/bin/env perl
|
|
2
|
|
3 use Statistics::Zed;
|
|
4 use strict;
|
|
5 use warnings;
|
|
6
|
|
7 if($ARGV[0] eq "-v"){
|
|
8 print "Version 1.0\n";
|
|
9 exit;
|
|
10 }
|
|
11
|
|
12 @ARGV == 2 or die "Usage: $0 <diBayes_snp_calls.gff3> <output.vcf>\n";
|
|
13
|
|
14 my $zed = new Statistics::Zed;
|
|
15 my $log_10_inv = 1/log(10);
|
|
16 open(GFF3IN, $ARGV[0])
|
|
17 or die "Cannot open input GFF file $ARGV[0] for reading: $!\n";
|
|
18 open(VCFOUT, ">$ARGV[1]")
|
|
19 or die "Cannot open output VCF file $ARGV[1] for writing: $!\n";
|
|
20
|
|
21 my @date = localtime();
|
|
22 my $date = ($date[5]+1900).($date[4] < 10 ? "0" : "").$date[4].($date[3] < 10 ? "0" : "").$date[3];
|
|
23
|
|
24 my $ref_file;
|
|
25 my $source;
|
|
26 while(<GFF3IN>){
|
|
27 last if /^#[^#]/; # end of file headers marked by lack of double hash
|
|
28 if(/source-version\s+(.*)$/){
|
|
29 $source = $1;
|
|
30 }
|
|
31 elsif(/genome-build\s+(.*)$/){
|
|
32 $ref_file = $1;
|
|
33 }
|
|
34 }
|
|
35
|
|
36 print VCFOUT <<END;
|
|
37 ##fileformat=VCFv4.0
|
|
38 ##fileDate=$date
|
|
39 ##source=$source
|
|
40 ##reference=$ref_file
|
|
41 ##INFO=<ID=Z,Number=1,Type=Float,Description="SNP z-score">
|
|
42 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
|
|
43 ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
|
|
44 ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
|
|
45 ##FORMAT=<ID=RR,Number=1,Type=Integer,Description="Reference Read Depth">
|
|
46 ##FORMAT=<ID=VR,Number=1,Type=Integer,Description="Major Variant Read Depth">
|
|
47 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Unknown
|
|
48 END
|
|
49
|
|
50 while(<GFF3IN>){
|
|
51 next if /^#/;
|
|
52 chomp;
|
|
53 my @F = split /\t/, $_;
|
|
54 my $seq = $F[0];
|
|
55 my $pos = $F[3];
|
|
56 my $z_score = sprintf "%.4f", ($F[5] < 0.0000000001 ? "200" : $zed->p2z(value => $F[5]));
|
|
57 my $qual = $F[5] == 0 ? 100 : int(-10*log($F[5])*$log_10_inv);
|
|
58 my @info = split /;/, $F[8];
|
|
59 my $depth_reads = ".";
|
|
60 my $ref_base;
|
|
61 my @alleles;
|
|
62 my $ref_reads;
|
|
63 my $var_reads;
|
|
64 for my $tag (@info){
|
|
65 my ($key, $value) = split /=/, $tag;
|
|
66 if($key eq "coverage"){
|
|
67 $depth_reads = $value;
|
|
68 }
|
|
69 elsif($key eq "reference"){
|
|
70 $ref_base = $value;
|
|
71 }
|
|
72 elsif($key eq "allele-call"){
|
|
73 push @alleles, split /\//, $value;
|
|
74 }
|
|
75 elsif($key eq "refAlleleCounts"){
|
|
76 $ref_reads = $value;
|
|
77 }
|
|
78 elsif($key eq "novelAlleleCounts"){
|
|
79 $var_reads = $value; # note: when het for two non-ref alleles, this is a wierd value to be ignored
|
|
80 }
|
|
81 }
|
|
82
|
|
83 my $genotype = "0/1"; # default het for one non-ref allele
|
|
84 if(@alleles == 1){ # homo for one var allele
|
|
85 $genotype = "1/1";
|
|
86 }
|
|
87 elsif($ref_base eq $alleles[0]){
|
|
88 # het for one non-reference
|
|
89 shift @alleles;
|
|
90 }
|
|
91 elsif($ref_base eq $alleles[1]){
|
|
92 # het for one non-reference
|
|
93 pop @alleles;
|
|
94 }
|
|
95 else{
|
|
96 # het for two non-ref alleles
|
|
97 $genotype = "1/2";
|
|
98 $var_reads = int(($depth_reads-$ref_reads)/2); #diBayes needs guesstimate since GFF doesn't encode this properly
|
|
99 $var_reads = "$var_reads,$var_reads";
|
|
100 }
|
|
101
|
|
102 # Required VCF format is CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE_NAME...
|
|
103 print VCFOUT join("\t", $seq, $pos, ".", $ref_base, join(",", @alleles), $qual, ".", "Z=$z_score", "GT:VR:RR:DP:GQ",
|
|
104 "$genotype:$var_reads:$ref_reads:$depth_reads:."), "\n";
|
|
105 }
|
|
106 close(VCFOUT);
|
|
107 close(GFF3IN);
|