comparison vcf_gff.py @ 1:a0689dc29b7f draft

Updated vcf to gff conversion tool
author john-mccallum
date Tue, 31 Jul 2012 00:33:11 -0400
parents
children 402c3f0fe807
comparison
equal deleted inserted replaced
0:21053f7f9ed1 1:a0689dc29b7f
1 #!/usr/local/bin/python2.6
2 """
3 Usage vcf_gff.py <vcf_file input> <gff_file output>
4
5 Input vcf file format:
6 CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
7
8 Note: Generating vcf from a single merged bam file, using multiple bam files results in multiple FORMAT columns!!!
9
10 Output gff format:
11 SEQID SOURCE TYPE START END SCORE STRAND PHASE ATTRIBUTES
12
13 """
14 #Copyright 2012 Susan Thomson
15 #New Zealand Institute for Plant and Food Research
16 #This program is free software: you can redistribute it and/or modify
17 # it under the terms of the GNU General Public License as published by
18 # the Free Software Foundation, either version 3 of the License, or
19 # (at your option) any later version.
20 #
21 # This program is distributed in the hope that it will be useful,
22 # but WITHOUT ANY WARRANTY; without even the implied warranty of
23 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 # GNU General Public License for more details.
25 #
26 # You should have received a copy of the GNU General Public License
27 # along with this program. If not, see <http://www.gnu.org/licenses/>.
28
29
30 import sys
31
32 in_vcf_file = open(sys.argv[1], 'r')
33 out_gff_file = open(sys.argv[2], 'w')
34
35 def get_info(attribute_input):
36 """
37 Get the record_type by determining if INDEL is stated in column INFO; get
38 raw read count
39 """
40 INFO = {}
41 rec = attribute_input.split(";")
42 if "INDEL" in rec:
43 record_type = "INDEL"
44 rec = rec[1:]
45 else:
46 record_type = "SNP"
47 for entry in rec:
48 detail = entry.split("=")
49 INFO[detail[0]] = detail[1]
50 if INFO.has_key("DP"):
51 reads = INFO.get("DP")
52 else:
53 reads = "NA"
54 data = (record_type, reads)
55 return data
56
57 def get_gen(formatcols, ref):
58 """
59 Get info on heterozyosity or homozygosity (could be useful later),
60 by estimating genotype(GT) calling ref allele=0, variant allele=1
61
62 """
63 formats = []
64 sample_dict = {}
65 genos = ""
66 format_types = formatcols[0].split(":")
67 samples = formatcols[1:]
68 for entry in format_types:
69 formats.append(entry)
70 for sample in samples:
71 var = ""
72 data = sample.split(":")
73 sample_dict = dict(zip(formats, data))
74 if sample_dict.has_key("DP"):
75 reads = sample_dict.get("DP")
76 else:
77 reads = "NA"
78 if sample_dict.has_key("NF"):
79 """
80 polysnp tool output
81 """
82 freq = sample_dict.get("NF")
83 variants = freq.split("|")
84 if int(variants[0]) >= 1:
85 var += "A"
86 if int(variants[1]) >= 1:
87 var += "C"
88 if int(variants[2]) >= 1:
89 var += "G"
90 if int(variants[3]) >= 1:
91 var += "T"
92 if var == "":
93 gen = "NA"
94 elif var == ref:
95 gen = "HOM_ref"
96 elif len(var) >= 2:
97 gen = "HET"
98 else:
99 gen = "HOM_mut"
100 elif sample_dict.has_key("GT"):
101 """
102 mpileup output, recommend ALWAYS have GT, note this only good scoring for diploids too!
103 """
104 genotypes = sample_dict.get("GT")
105 if genotypes == "1/1":
106 gen = "HOM_mut"
107 if genotypes == "0/1":
108 gen = "HET"
109 if genotypes == "0/0":
110 gen = "HOM_ref"
111 else:
112 gen = "NA"
113 geno = ("%s:%s;" % (reads, gen))
114 genos += geno
115 sample_dict = {}
116 return genos
117
118 attributes = {}
119 """
120 Get relevant info from vcf file and put to proper gff columns
121 """
122
123 for line in in_vcf_file:
124 if line.startswith("#") == False:
125 info = line.split()
126 seqid = info[0].strip()
127 source = "SAMTOOLS"
128 start = int(info[1])
129 score = info[5]
130 strand = "."
131 phase = "."
132 reference = info[3].strip()
133 variant = info[4].strip()
134 attr = get_info(info[7])
135 record_type = attr[0]
136 reads = attr[1]
137 if record_type == "INDEL":
138 end = start + len(reference)
139 else:
140 end = start
141 gen = get_gen(info[8:], reference)
142 out_gff_file.write(
143 ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\tID=%s:%s:%d;Variant" +
144 "_seq=%s;Reference_seq=%s;Total_reads=%s:Zygosity=%s\n") %
145 ( seqid, source,record_type, start, end, score, strand, phase,seqid,
146 record_type, start, variant, reference, reads, gen))
147
148 out_gff_file.close()
149
150