Mercurial > repos > yusuf > gvf_to_vcf
comparison dibayes_gff2vcf @ 0:18d965813efc default tip
intial commit
| author | Yusuf Ali <ali@yusuf.email> |
|---|---|
| date | Wed, 25 Mar 2015 13:37:21 -0600 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:18d965813efc |
|---|---|
| 1 #!/usr/bin/env perl | |
| 2 | |
| 3 use Statistics::Zed; | |
| 4 use strict; | |
| 5 use warnings; | |
| 6 | |
| 7 if($ARGV[0] eq "-v"){ | |
| 8 print "Version 1.0\n"; | |
| 9 exit; | |
| 10 } | |
| 11 | |
| 12 @ARGV == 2 or die "Usage: $0 <diBayes_snp_calls.gff3> <output.vcf>\n"; | |
| 13 | |
| 14 my $zed = new Statistics::Zed; | |
| 15 my $log_10_inv = 1/log(10); | |
| 16 open(GFF3IN, $ARGV[0]) | |
| 17 or die "Cannot open input GFF file $ARGV[0] for reading: $!\n"; | |
| 18 open(VCFOUT, ">$ARGV[1]") | |
| 19 or die "Cannot open output VCF file $ARGV[1] for writing: $!\n"; | |
| 20 | |
| 21 my @date = localtime(); | |
| 22 my $date = ($date[5]+1900).($date[4] < 10 ? "0" : "").$date[4].($date[3] < 10 ? "0" : "").$date[3]; | |
| 23 | |
| 24 my $ref_file; | |
| 25 my $source; | |
| 26 while(<GFF3IN>){ | |
| 27 last if /^#[^#]/; # end of file headers marked by lack of double hash | |
| 28 if(/source-version\s+(.*)$/){ | |
| 29 $source = $1; | |
| 30 } | |
| 31 elsif(/genome-build\s+(.*)$/){ | |
| 32 $ref_file = $1; | |
| 33 } | |
| 34 } | |
| 35 | |
| 36 print VCFOUT <<END; | |
| 37 ##fileformat=VCFv4.0 | |
| 38 ##fileDate=$date | |
| 39 ##source=$source | |
| 40 ##reference=$ref_file | |
| 41 ##INFO=<ID=Z,Number=1,Type=Float,Description="SNP z-score"> | |
| 42 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | |
| 43 ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> | |
| 44 ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth"> | |
| 45 ##FORMAT=<ID=RR,Number=1,Type=Integer,Description="Reference Read Depth"> | |
| 46 ##FORMAT=<ID=VR,Number=1,Type=Integer,Description="Major Variant Read Depth"> | |
| 47 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Unknown | |
| 48 END | |
| 49 | |
| 50 while(<GFF3IN>){ | |
| 51 next if /^#/; | |
| 52 chomp; | |
| 53 my @F = split /\t/, $_; | |
| 54 my $seq = $F[0]; | |
| 55 my $pos = $F[3]; | |
| 56 my $z_score = sprintf "%.4f", ($F[5] < 0.0000000001 ? "200" : $zed->p2z(value => $F[5])); | |
| 57 my $qual = $F[5] == 0 ? 100 : int(-10*log($F[5])*$log_10_inv); | |
| 58 my @info = split /;/, $F[8]; | |
| 59 my $depth_reads = "."; | |
| 60 my $ref_base; | |
| 61 my @alleles; | |
| 62 my $ref_reads; | |
| 63 my $var_reads; | |
| 64 for my $tag (@info){ | |
| 65 my ($key, $value) = split /=/, $tag; | |
| 66 if($key eq "coverage"){ | |
| 67 $depth_reads = $value; | |
| 68 } | |
| 69 elsif($key eq "reference"){ | |
| 70 $ref_base = $value; | |
| 71 } | |
| 72 elsif($key eq "allele-call"){ | |
| 73 push @alleles, split /\//, $value; | |
| 74 } | |
| 75 elsif($key eq "refAlleleCounts"){ | |
| 76 $ref_reads = $value; | |
| 77 } | |
| 78 elsif($key eq "novelAlleleCounts"){ | |
| 79 $var_reads = $value; # note: when het for two non-ref alleles, this is a wierd value to be ignored | |
| 80 } | |
| 81 } | |
| 82 | |
| 83 my $genotype = "0/1"; # default het for one non-ref allele | |
| 84 if(@alleles == 1){ # homo for one var allele | |
| 85 $genotype = "1/1"; | |
| 86 } | |
| 87 elsif($ref_base eq $alleles[0]){ | |
| 88 # het for one non-reference | |
| 89 shift @alleles; | |
| 90 } | |
| 91 elsif($ref_base eq $alleles[1]){ | |
| 92 # het for one non-reference | |
| 93 pop @alleles; | |
| 94 } | |
| 95 else{ | |
| 96 # het for two non-ref alleles | |
| 97 $genotype = "1/2"; | |
| 98 $var_reads = int(($depth_reads-$ref_reads)/2); #diBayes needs guesstimate since GFF doesn't encode this properly | |
| 99 $var_reads = "$var_reads,$var_reads"; | |
| 100 } | |
| 101 | |
| 102 # Required VCF format is CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE_NAME... | |
| 103 print VCFOUT join("\t", $seq, $pos, ".", $ref_base, join(",", @alleles), $qual, ".", "Z=$z_score", "GT:VR:RR:DP:GQ", | |
| 104 "$genotype:$var_reads:$ref_reads:$depth_reads:."), "\n"; | |
| 105 } | |
| 106 close(VCFOUT); | |
| 107 close(GFF3IN); |
