comparison annotateVCF.py @ 3:35666d44fe7a draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit e80b649094384fc6d7a8f917300db3550cc99a44"
author iuc
date Tue, 01 Sep 2020 03:08:04 -0400
parents
children b35de691c42c
comparison
equal deleted inserted replaced
2:6a87478ed985 3:35666d44fe7a
1 #!/usr/bin/env python3
2
3 # Takes in VCF file and a samtools mpileup output file
4 # Fills in annotation for the VCF file including AF, DP
5 # SB, and DP4
6 #
7 # Usage statement:
8 # python annotateVCF.py in_vcf.vcf in_mpileup.txt out_vcf.vcf
9 #
10 # Can generate in_mileup.txt with samtools mpileup (and can restrict which sites to generate pileups for with in_vcf.vcf)
11
12 # 08/24/2020 - Nathan P. Roach, natproach@gmail.com
13
14 import sys
15 from math import isnan, log10
16
17 from scipy.stats import fisher_exact
18
19
20 def pval_to_phredqual(pval):
21 return int(round(-10. * log10(pval)))
22
23
24 def parseSimpleSNPpileup(fields, ref_base, alt_base):
25 base_to_idx = {
26 'A': 0,
27 'a': 0,
28 'T': 1,
29 't': 1,
30 'C': 2,
31 'c': 2,
32 'G': 3,
33 'g': 3
34 }
35
36 base_to_idx_stranded = {
37 'A': 0,
38 'T': 1,
39 'C': 2,
40 'G': 3,
41 'a': 4,
42 't': 5,
43 'c': 6,
44 'g': 7
45 }
46 ref_base2 = fields[2]
47 counts = [0, 0, 0, 0]
48 stranded_counts = [0, 0, 0, 0, 0, 0, 0, 0]
49 ref_idx = base_to_idx[fields[2]]
50 dp = int(fields[3])
51 carrot_flag = False
52 ins_flag = False
53 ins_str = ""
54 ins_len = 0
55 insertion = ""
56 del_flag = False
57 del_str = ""
58 del_len = 0
59 deletion = ""
60 # dollar_flag = False
61 for base in fields[4]:
62 if carrot_flag:
63 carrot_flag = False
64 continue
65 if ins_len > 0:
66 insertion += base
67 ins_len -= 1
68 continue
69 if del_len > 0:
70 deletion += base
71 del_len -= 1
72 continue
73 if ins_flag:
74 if base.isdigit():
75 ins_str += base
76 else:
77 ins_len = int(ins_str) - 1
78 insertion = base
79 ins_flag = False
80 elif del_flag:
81 if base.isdigit():
82 del_str += base
83 else:
84 del_len = int(del_str) - 1
85 deletion = base
86 del_flag = False
87 else:
88 if base == '^':
89 carrot_flag = True
90 continue
91 elif base == '$':
92 continue
93 elif base == '+':
94 ins_flag = True
95 elif base == '-':
96 del_flag = True
97 elif base == '.':
98 counts[ref_idx] += 1
99 stranded_counts[base_to_idx_stranded[ref_base2]] += 1
100 elif base == ',':
101 counts[ref_idx] += 1
102 stranded_counts[base_to_idx_stranded[ref_base2.lower()]] += 1
103 elif base == 'N' or base == 'n':
104 continue
105 elif base == '*':
106 continue
107 else:
108 counts[base_to_idx[base]] += 1
109 stranded_counts[base_to_idx_stranded[base]] += 1
110 af = float(counts[base_to_idx[alt_base]]) / float(sum(counts))
111 if float(sum(stranded_counts[0:4])) == 0:
112 faf = float("nan")
113 else:
114 faf = float(stranded_counts[base_to_idx_stranded[alt_base]]) / float(sum(stranded_counts[0:4]))
115 if float(sum(stranded_counts[4:])) == 0:
116 raf = float("nan")
117 else:
118 raf = float(stranded_counts[base_to_idx_stranded[alt_base.lower()]]) / float(sum(stranded_counts[4:]))
119 dp4 = [stranded_counts[base_to_idx_stranded[ref_base]],
120 stranded_counts[base_to_idx_stranded[ref_base.lower()]],
121 stranded_counts[base_to_idx_stranded[alt_base]],
122 stranded_counts[base_to_idx_stranded[alt_base.lower()]]]
123 return (dp, af, faf, raf, dp4)
124
125
126 def parseIndelPileup(fields, ref_base, alt_base):
127 counts = [0, 0, 0, 0, 0, 0, 0, 0, 0] # indel ref match, indel fwd ref match, indel rev ref match, indel alt match, indel fwd alt match, indel rev alt match, other, other fwd, other rev
128 ref_base2 = fields[2]
129
130 carrot_flag = False
131 ins_flag = False
132 ins_str = ""
133 ins_len = 0
134 del_flag = False
135 del_str = ""
136 del_len = 0
137 first_iter = True
138 forward_flag = False
139 last_seq = ""
140 last_seq_code = 'b'
141 for base in fields[4]:
142 if ins_flag:
143 if base.isdigit():
144 ins_str += base
145 else:
146 ins_len = int(ins_str)
147 ins_flag = False
148 if del_flag:
149 if base.isdigit():
150 del_str += base
151 else:
152 del_len = int(del_str)
153 del_flag = False
154 if ins_len > 0:
155 last_seq += base
156 last_seq_code = 'i'
157 ins_len -= 1
158 continue
159 if del_len > 0:
160 last_seq += base
161 last_seq_code = 'd'
162 del_len -= 1
163 continue
164 if carrot_flag:
165 carrot_flag = False
166 continue
167 if base == '.' or base == ','\
168 or base == 'A' or base == 'a'\
169 or base == 'C' or base == 'c'\
170 or base == 'G' or base == 'g'\
171 or base == 'T' or base == 't'\
172 or base == 'N' or base == 'n'\
173 or base == '>' or base == '<'\
174 or base == '*' or base == '#':
175 if first_iter:
176 first_iter = False
177 else:
178 if last_seq_code == 'i':
179 if last_seq.upper() == alt_base.upper():
180 counts[3] += 1
181 if forward_flag:
182 counts[4] += 1
183 else:
184 counts[5] += 1
185 else:
186 counts[6] += 1
187 if forward_flag:
188 counts[7] += 1
189 else:
190 counts[8] += 1
191 elif last_seq_code == 'd':
192 if last_seq.upper() == ref_base.upper():
193 counts[3] += 1
194 if forward_flag:
195 counts[4] += 1
196 else:
197 counts[5] += 1
198 else:
199 counts[6] += 1
200 if forward_flag:
201 counts[7] += 1
202 else:
203 counts[8] += 1
204 elif last_seq_code == 'b':
205 if last_seq.upper() == ref_base.upper():
206 counts[0] += 1
207 if forward_flag:
208 counts[1] += 1
209 else:
210 counts[2] += 1
211 elif last_seq.upper() == alt_base.upper():
212 counts[3] += 1
213 if forward_flag:
214 counts[4] += 1
215 else:
216 counts[5] += 1
217 else:
218 counts[6] += 1
219 if forward_flag:
220 counts[7] += 1
221 else:
222 counts[8] += 1
223 if base == '.':
224 last_seq = ref_base2
225 forward_flag = True
226 last_seq_code = 'b'
227 elif base == ',':
228 last_seq = ref_base2
229 forward_flag = False
230 last_seq_code = 'b'
231 elif base == '>' or base == '<' or base == '*' or base == '#':
232 continue
233 else:
234 forward_flag = base.isupper()
235 last_seq = base.upper()
236 last_seq_code = 'b'
237 elif base == '+':
238 ins_flag = True
239 ins_str = ""
240 elif base == '-':
241 del_flag = True
242 del_str = ""
243 elif base == '^':
244 carrot_flag = True
245 elif base == '$':
246 continue
247 if first_iter:
248 first_iter = False
249
250 if last_seq_code == 'i':
251 if last_seq.upper() == alt_base.upper():
252 counts[3] += 1
253 if forward_flag:
254 counts[4] += 1
255 else:
256 counts[5] += 1
257 else:
258 counts[6] += 1
259 if forward_flag:
260 counts[7] += 1
261 else:
262 counts[8] += 1
263 elif last_seq_code == 'd':
264 if last_seq.upper() == ref_base.upper():
265 counts[3] += 1
266 if forward_flag:
267 counts[4] += 1
268 else:
269 counts[5] += 1
270 else:
271 counts[6] += 1
272 if forward_flag:
273 counts[7] += 1
274 else:
275 counts[8] += 1
276 elif last_seq_code == 'b':
277 if last_seq.upper() == ref_base.upper():
278 counts[0] += 1
279 if forward_flag:
280 counts[1] += 1
281 else:
282 counts[2] += 1
283 elif last_seq.upper() == alt_base.upper():
284 counts[3] += 1
285 if forward_flag:
286 counts[4] += 1
287 else:
288 counts[5] += 1
289 else:
290 counts[6] += 1
291 if forward_flag:
292 counts[7] += 1
293 else:
294 counts[8] += 1
295 dp = int(fields[3])
296 af = float(counts[3]) / float(sum([counts[0], counts[3], counts[6]]))
297 if sum([counts[1], counts[4], counts[7]]) == 0:
298 faf = float("nan")
299 else:
300 faf = float(counts[4]) / float(sum([counts[1], counts[4], counts[7]]))
301 if sum([counts[2], counts[5], counts[8]]) == 0:
302 raf = float("nan")
303 else:
304 raf = float(counts[5]) / float(sum([counts[2], counts[5], counts[8]]))
305 dp4 = [counts[1], counts[2], counts[4], counts[5]]
306 return (dp, af, faf, raf, dp4)
307
308
309 def annotateVCF(in_vcf_filepath, in_mpileup_filepath, out_vcf_filepath):
310 in_vcf = open(in_vcf_filepath, 'r')
311 in_mpileup = open(in_mpileup_filepath, 'r')
312 out_vcf = open(out_vcf_filepath, 'w')
313
314 # First pass parsing of input vcf, output headerlines + new headerlines, add VCF sites we care about to to_examine (limits memory usage for sites that don't need annotation)
315 to_examine = {}
316 for line in in_vcf:
317 if line[0:2] == "##":
318 out_vcf.write(line)
319 elif line[0] == "#":
320 out_vcf.write("##annotateVCFVersion=0.1\n")
321 out_vcf.write("##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw Depth\">\n")
322 out_vcf.write("##INFO=<ID=AF,Number=1,Type=Float,Description=\"Allele Frequency\">\n")
323 out_vcf.write("##INFO=<ID=FAF,Number=1,Type=Float,Description=\"Forward Allele Frequency\">\n")
324 out_vcf.write("##INFO=<ID=RAF,Number=1,Type=Float,Description=\"Reverse Allele Frequency\">\n")
325 out_vcf.write("##INFO=<ID=SB,Number=1,Type=Integer,Description=\"Phred-scaled strand bias at this position\">\n")
326 out_vcf.write("##INFO=<ID=DP4,Number=4,Type=Integer,Description=\"Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases\">\n")
327 out_vcf.write(line)
328 else:
329 fields = line.strip().split()
330 if fields[0] in to_examine:
331 to_examine[fields[0]][int(fields[1])] = (fields[3], fields[4])
332 else:
333 to_examine[fields[0]] = {int(fields[1]): (fields[3], fields[4])}
334 in_vcf.close()
335 data = {}
336
337 # Populate data dictionary, which relates chromosome and position to the following:
338 # depth of coverage
339 # allele frequency
340 # forward strand allele frequency
341 # reverse strand allele frequency
342 # dp4 - depth of coverage of ref allele fwd strand, DOC of ref allele rev strand, DOC of alt allele fwd strand, DOC of alt allele rev strand
343 for line in in_mpileup:
344 fields = line.strip().split()
345 if fields[0] not in to_examine:
346 continue
347 if int(fields[1]) not in to_examine[fields[0]]:
348 continue
349 (ref_base, alt_base) = to_examine[fields[0]][int(fields[1])]
350 if len(ref_base.split(',')) > 1: # Can't handle multiple ref alleles
351 continue
352 if len(alt_base.split(',')) > 1: # Can't handle multiple alt alleles
353 continue
354 if len(ref_base) > 1 or len(alt_base) > 1:
355 if len(ref_base) > 1 and len(alt_base) > 1: # Can't handle complex indels
356 continue
357 data[(fields[0], int(fields[1]))] = parseIndelPileup(fields, ref_base, alt_base)
358 if len(ref_base) == 1 and len(alt_base) == 1:
359 data[(fields[0], int(fields[1]))] = parseSimpleSNPpileup(fields, ref_base, alt_base)
360 in_mpileup.close()
361 # Reopen vcf, this time, skip header, annotate all the sites for which there is an entry in data dictionary
362 # (Sites without entries have either multiple ref or alt bases, or have complex indels. Not supported (for now), and not reported as a result)
363 in_vcf = open(in_vcf_filepath, 'r')
364 for line in in_vcf:
365 if line[0] == '#':
366 continue
367 fields = line.strip().split('\t')
368 if (fields[0], int(fields[1])) not in data:
369 continue
370 (dp, af, faf, raf, dp4) = data[(fields[0], int(fields[1]))]
371 dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]]
372 _, p_val = fisher_exact(dp2x2)
373 sb = pval_to_phredqual(p_val)
374 if fields[7] == "":
375 info = []
376 else:
377 info = fields[7].split(';')
378 info.append("DP=%d" % (dp))
379 info.append("AF=%.6f" % (af))
380 if isnan(faf):
381 info.append("FAF=NaN")
382 else:
383 info.append("FAF=%.6f" % (faf))
384 if isnan(raf):
385 info.append("RAF=NaN")
386 else:
387 info.append("RAF=%.6f" % (raf))
388 info.append("SB=%d" % (sb))
389 info.append("DP4=%s" % (','.join([str(x) for x in dp4])))
390 new_info = ';'.join(info)
391 fields[7] = new_info
392 out_vcf.write("%s\n" % ("\t".join(fields)))
393 in_vcf.close()
394 out_vcf.close()
395
396
397 if __name__ == "__main__":
398 annotateVCF(sys.argv[1], sys.argv[2], sys.argv[3])