Mercurial > repos > iuc > medaka_variant_pipeline
comparison annotateVCF.py @ 3:148f32860445 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit e80b649094384fc6d7a8f917300db3550cc99a44"
author | iuc |
---|---|
date | Tue, 01 Sep 2020 03:06:38 -0400 |
parents | |
children | efca385ce679 |
comparison
equal
deleted
inserted
replaced
2:ce2bf68a260d | 3:148f32860445 |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 # Takes in VCF file and a samtools mpileup output file | |
4 # Fills in annotation for the VCF file including AF, DP | |
5 # SB, and DP4 | |
6 # | |
7 # Usage statement: | |
8 # python annotateVCF.py in_vcf.vcf in_mpileup.txt out_vcf.vcf | |
9 # | |
10 # Can generate in_mileup.txt with samtools mpileup (and can restrict which sites to generate pileups for with in_vcf.vcf) | |
11 | |
12 # 08/24/2020 - Nathan P. Roach, natproach@gmail.com | |
13 | |
14 import sys | |
15 from math import isnan, log10 | |
16 | |
17 from scipy.stats import fisher_exact | |
18 | |
19 | |
20 def pval_to_phredqual(pval): | |
21 return int(round(-10. * log10(pval))) | |
22 | |
23 | |
24 def parseSimpleSNPpileup(fields, ref_base, alt_base): | |
25 base_to_idx = { | |
26 'A': 0, | |
27 'a': 0, | |
28 'T': 1, | |
29 't': 1, | |
30 'C': 2, | |
31 'c': 2, | |
32 'G': 3, | |
33 'g': 3 | |
34 } | |
35 | |
36 base_to_idx_stranded = { | |
37 'A': 0, | |
38 'T': 1, | |
39 'C': 2, | |
40 'G': 3, | |
41 'a': 4, | |
42 't': 5, | |
43 'c': 6, | |
44 'g': 7 | |
45 } | |
46 ref_base2 = fields[2] | |
47 counts = [0, 0, 0, 0] | |
48 stranded_counts = [0, 0, 0, 0, 0, 0, 0, 0] | |
49 ref_idx = base_to_idx[fields[2]] | |
50 dp = int(fields[3]) | |
51 carrot_flag = False | |
52 ins_flag = False | |
53 ins_str = "" | |
54 ins_len = 0 | |
55 insertion = "" | |
56 del_flag = False | |
57 del_str = "" | |
58 del_len = 0 | |
59 deletion = "" | |
60 # dollar_flag = False | |
61 for base in fields[4]: | |
62 if carrot_flag: | |
63 carrot_flag = False | |
64 continue | |
65 if ins_len > 0: | |
66 insertion += base | |
67 ins_len -= 1 | |
68 continue | |
69 if del_len > 0: | |
70 deletion += base | |
71 del_len -= 1 | |
72 continue | |
73 if ins_flag: | |
74 if base.isdigit(): | |
75 ins_str += base | |
76 else: | |
77 ins_len = int(ins_str) - 1 | |
78 insertion = base | |
79 ins_flag = False | |
80 elif del_flag: | |
81 if base.isdigit(): | |
82 del_str += base | |
83 else: | |
84 del_len = int(del_str) - 1 | |
85 deletion = base | |
86 del_flag = False | |
87 else: | |
88 if base == '^': | |
89 carrot_flag = True | |
90 continue | |
91 elif base == '$': | |
92 continue | |
93 elif base == '+': | |
94 ins_flag = True | |
95 elif base == '-': | |
96 del_flag = True | |
97 elif base == '.': | |
98 counts[ref_idx] += 1 | |
99 stranded_counts[base_to_idx_stranded[ref_base2]] += 1 | |
100 elif base == ',': | |
101 counts[ref_idx] += 1 | |
102 stranded_counts[base_to_idx_stranded[ref_base2.lower()]] += 1 | |
103 elif base == 'N' or base == 'n': | |
104 continue | |
105 elif base == '*': | |
106 continue | |
107 else: | |
108 counts[base_to_idx[base]] += 1 | |
109 stranded_counts[base_to_idx_stranded[base]] += 1 | |
110 af = float(counts[base_to_idx[alt_base]]) / float(sum(counts)) | |
111 if float(sum(stranded_counts[0:4])) == 0: | |
112 faf = float("nan") | |
113 else: | |
114 faf = float(stranded_counts[base_to_idx_stranded[alt_base]]) / float(sum(stranded_counts[0:4])) | |
115 if float(sum(stranded_counts[4:])) == 0: | |
116 raf = float("nan") | |
117 else: | |
118 raf = float(stranded_counts[base_to_idx_stranded[alt_base.lower()]]) / float(sum(stranded_counts[4:])) | |
119 dp4 = [stranded_counts[base_to_idx_stranded[ref_base]], | |
120 stranded_counts[base_to_idx_stranded[ref_base.lower()]], | |
121 stranded_counts[base_to_idx_stranded[alt_base]], | |
122 stranded_counts[base_to_idx_stranded[alt_base.lower()]]] | |
123 return (dp, af, faf, raf, dp4) | |
124 | |
125 | |
126 def parseIndelPileup(fields, ref_base, alt_base): | |
127 counts = [0, 0, 0, 0, 0, 0, 0, 0, 0] # indel ref match, indel fwd ref match, indel rev ref match, indel alt match, indel fwd alt match, indel rev alt match, other, other fwd, other rev | |
128 ref_base2 = fields[2] | |
129 | |
130 carrot_flag = False | |
131 ins_flag = False | |
132 ins_str = "" | |
133 ins_len = 0 | |
134 del_flag = False | |
135 del_str = "" | |
136 del_len = 0 | |
137 first_iter = True | |
138 forward_flag = False | |
139 last_seq = "" | |
140 last_seq_code = 'b' | |
141 for base in fields[4]: | |
142 if ins_flag: | |
143 if base.isdigit(): | |
144 ins_str += base | |
145 else: | |
146 ins_len = int(ins_str) | |
147 ins_flag = False | |
148 if del_flag: | |
149 if base.isdigit(): | |
150 del_str += base | |
151 else: | |
152 del_len = int(del_str) | |
153 del_flag = False | |
154 if ins_len > 0: | |
155 last_seq += base | |
156 last_seq_code = 'i' | |
157 ins_len -= 1 | |
158 continue | |
159 if del_len > 0: | |
160 last_seq += base | |
161 last_seq_code = 'd' | |
162 del_len -= 1 | |
163 continue | |
164 if carrot_flag: | |
165 carrot_flag = False | |
166 continue | |
167 if base == '.' or base == ','\ | |
168 or base == 'A' or base == 'a'\ | |
169 or base == 'C' or base == 'c'\ | |
170 or base == 'G' or base == 'g'\ | |
171 or base == 'T' or base == 't'\ | |
172 or base == 'N' or base == 'n'\ | |
173 or base == '>' or base == '<'\ | |
174 or base == '*' or base == '#': | |
175 if first_iter: | |
176 first_iter = False | |
177 else: | |
178 if last_seq_code == 'i': | |
179 if last_seq.upper() == alt_base.upper(): | |
180 counts[3] += 1 | |
181 if forward_flag: | |
182 counts[4] += 1 | |
183 else: | |
184 counts[5] += 1 | |
185 else: | |
186 counts[6] += 1 | |
187 if forward_flag: | |
188 counts[7] += 1 | |
189 else: | |
190 counts[8] += 1 | |
191 elif last_seq_code == 'd': | |
192 if last_seq.upper() == ref_base.upper(): | |
193 counts[3] += 1 | |
194 if forward_flag: | |
195 counts[4] += 1 | |
196 else: | |
197 counts[5] += 1 | |
198 else: | |
199 counts[6] += 1 | |
200 if forward_flag: | |
201 counts[7] += 1 | |
202 else: | |
203 counts[8] += 1 | |
204 elif last_seq_code == 'b': | |
205 if last_seq.upper() == ref_base.upper(): | |
206 counts[0] += 1 | |
207 if forward_flag: | |
208 counts[1] += 1 | |
209 else: | |
210 counts[2] += 1 | |
211 elif last_seq.upper() == alt_base.upper(): | |
212 counts[3] += 1 | |
213 if forward_flag: | |
214 counts[4] += 1 | |
215 else: | |
216 counts[5] += 1 | |
217 else: | |
218 counts[6] += 1 | |
219 if forward_flag: | |
220 counts[7] += 1 | |
221 else: | |
222 counts[8] += 1 | |
223 if base == '.': | |
224 last_seq = ref_base2 | |
225 forward_flag = True | |
226 last_seq_code = 'b' | |
227 elif base == ',': | |
228 last_seq = ref_base2 | |
229 forward_flag = False | |
230 last_seq_code = 'b' | |
231 elif base == '>' or base == '<' or base == '*' or base == '#': | |
232 continue | |
233 else: | |
234 forward_flag = base.isupper() | |
235 last_seq = base.upper() | |
236 last_seq_code = 'b' | |
237 elif base == '+': | |
238 ins_flag = True | |
239 ins_str = "" | |
240 elif base == '-': | |
241 del_flag = True | |
242 del_str = "" | |
243 elif base == '^': | |
244 carrot_flag = True | |
245 elif base == '$': | |
246 continue | |
247 if first_iter: | |
248 first_iter = False | |
249 | |
250 if last_seq_code == 'i': | |
251 if last_seq.upper() == alt_base.upper(): | |
252 counts[3] += 1 | |
253 if forward_flag: | |
254 counts[4] += 1 | |
255 else: | |
256 counts[5] += 1 | |
257 else: | |
258 counts[6] += 1 | |
259 if forward_flag: | |
260 counts[7] += 1 | |
261 else: | |
262 counts[8] += 1 | |
263 elif last_seq_code == 'd': | |
264 if last_seq.upper() == ref_base.upper(): | |
265 counts[3] += 1 | |
266 if forward_flag: | |
267 counts[4] += 1 | |
268 else: | |
269 counts[5] += 1 | |
270 else: | |
271 counts[6] += 1 | |
272 if forward_flag: | |
273 counts[7] += 1 | |
274 else: | |
275 counts[8] += 1 | |
276 elif last_seq_code == 'b': | |
277 if last_seq.upper() == ref_base.upper(): | |
278 counts[0] += 1 | |
279 if forward_flag: | |
280 counts[1] += 1 | |
281 else: | |
282 counts[2] += 1 | |
283 elif last_seq.upper() == alt_base.upper(): | |
284 counts[3] += 1 | |
285 if forward_flag: | |
286 counts[4] += 1 | |
287 else: | |
288 counts[5] += 1 | |
289 else: | |
290 counts[6] += 1 | |
291 if forward_flag: | |
292 counts[7] += 1 | |
293 else: | |
294 counts[8] += 1 | |
295 dp = int(fields[3]) | |
296 af = float(counts[3]) / float(sum([counts[0], counts[3], counts[6]])) | |
297 if sum([counts[1], counts[4], counts[7]]) == 0: | |
298 faf = float("nan") | |
299 else: | |
300 faf = float(counts[4]) / float(sum([counts[1], counts[4], counts[7]])) | |
301 if sum([counts[2], counts[5], counts[8]]) == 0: | |
302 raf = float("nan") | |
303 else: | |
304 raf = float(counts[5]) / float(sum([counts[2], counts[5], counts[8]])) | |
305 dp4 = [counts[1], counts[2], counts[4], counts[5]] | |
306 return (dp, af, faf, raf, dp4) | |
307 | |
308 | |
309 def annotateVCF(in_vcf_filepath, in_mpileup_filepath, out_vcf_filepath): | |
310 in_vcf = open(in_vcf_filepath, 'r') | |
311 in_mpileup = open(in_mpileup_filepath, 'r') | |
312 out_vcf = open(out_vcf_filepath, 'w') | |
313 | |
314 # First pass parsing of input vcf, output headerlines + new headerlines, add VCF sites we care about to to_examine (limits memory usage for sites that don't need annotation) | |
315 to_examine = {} | |
316 for line in in_vcf: | |
317 if line[0:2] == "##": | |
318 out_vcf.write(line) | |
319 elif line[0] == "#": | |
320 out_vcf.write("##annotateVCFVersion=0.1\n") | |
321 out_vcf.write("##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw Depth\">\n") | |
322 out_vcf.write("##INFO=<ID=AF,Number=1,Type=Float,Description=\"Allele Frequency\">\n") | |
323 out_vcf.write("##INFO=<ID=FAF,Number=1,Type=Float,Description=\"Forward Allele Frequency\">\n") | |
324 out_vcf.write("##INFO=<ID=RAF,Number=1,Type=Float,Description=\"Reverse Allele Frequency\">\n") | |
325 out_vcf.write("##INFO=<ID=SB,Number=1,Type=Integer,Description=\"Phred-scaled strand bias at this position\">\n") | |
326 out_vcf.write("##INFO=<ID=DP4,Number=4,Type=Integer,Description=\"Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases\">\n") | |
327 out_vcf.write(line) | |
328 else: | |
329 fields = line.strip().split() | |
330 if fields[0] in to_examine: | |
331 to_examine[fields[0]][int(fields[1])] = (fields[3], fields[4]) | |
332 else: | |
333 to_examine[fields[0]] = {int(fields[1]): (fields[3], fields[4])} | |
334 in_vcf.close() | |
335 data = {} | |
336 | |
337 # Populate data dictionary, which relates chromosome and position to the following: | |
338 # depth of coverage | |
339 # allele frequency | |
340 # forward strand allele frequency | |
341 # reverse strand allele frequency | |
342 # dp4 - depth of coverage of ref allele fwd strand, DOC of ref allele rev strand, DOC of alt allele fwd strand, DOC of alt allele rev strand | |
343 for line in in_mpileup: | |
344 fields = line.strip().split() | |
345 if fields[0] not in to_examine: | |
346 continue | |
347 if int(fields[1]) not in to_examine[fields[0]]: | |
348 continue | |
349 (ref_base, alt_base) = to_examine[fields[0]][int(fields[1])] | |
350 if len(ref_base.split(',')) > 1: # Can't handle multiple ref alleles | |
351 continue | |
352 if len(alt_base.split(',')) > 1: # Can't handle multiple alt alleles | |
353 continue | |
354 if len(ref_base) > 1 or len(alt_base) > 1: | |
355 if len(ref_base) > 1 and len(alt_base) > 1: # Can't handle complex indels | |
356 continue | |
357 data[(fields[0], int(fields[1]))] = parseIndelPileup(fields, ref_base, alt_base) | |
358 if len(ref_base) == 1 and len(alt_base) == 1: | |
359 data[(fields[0], int(fields[1]))] = parseSimpleSNPpileup(fields, ref_base, alt_base) | |
360 in_mpileup.close() | |
361 # Reopen vcf, this time, skip header, annotate all the sites for which there is an entry in data dictionary | |
362 # (Sites without entries have either multiple ref or alt bases, or have complex indels. Not supported (for now), and not reported as a result) | |
363 in_vcf = open(in_vcf_filepath, 'r') | |
364 for line in in_vcf: | |
365 if line[0] == '#': | |
366 continue | |
367 fields = line.strip().split('\t') | |
368 if (fields[0], int(fields[1])) not in data: | |
369 continue | |
370 (dp, af, faf, raf, dp4) = data[(fields[0], int(fields[1]))] | |
371 dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]] | |
372 _, p_val = fisher_exact(dp2x2) | |
373 sb = pval_to_phredqual(p_val) | |
374 if fields[7] == "": | |
375 info = [] | |
376 else: | |
377 info = fields[7].split(';') | |
378 info.append("DP=%d" % (dp)) | |
379 info.append("AF=%.6f" % (af)) | |
380 if isnan(faf): | |
381 info.append("FAF=NaN") | |
382 else: | |
383 info.append("FAF=%.6f" % (faf)) | |
384 if isnan(raf): | |
385 info.append("RAF=NaN") | |
386 else: | |
387 info.append("RAF=%.6f" % (raf)) | |
388 info.append("SB=%d" % (sb)) | |
389 info.append("DP4=%s" % (','.join([str(x) for x in dp4]))) | |
390 new_info = ';'.join(info) | |
391 fields[7] = new_info | |
392 out_vcf.write("%s\n" % ("\t".join(fields))) | |
393 in_vcf.close() | |
394 out_vcf.close() | |
395 | |
396 | |
397 if __name__ == "__main__": | |
398 annotateVCF(sys.argv[1], sys.argv[2], sys.argv[3]) |