comparison convert_VCF_info_fields.py @ 0:179342c7b86c draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 4d3dfd4bcb567178107dcfd808ff03f9fec0bdbd
author iuc
date Wed, 12 Oct 2022 07:43:59 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:179342c7b86c
1 #!/usr/bin/env python3
2
3 # Takes in VCF file annotated with medaka tools annotate and converts
4 #
5 # Usage statement:
6 # python convert_VCF_info_fields.py in_vcf.vcf out_vcf.vcf
7
8 # 10/21/2020 - Nathan P. Roach, natproach@gmail.com
9
10 import re
11 import sys
12 from collections import OrderedDict
13 from math import log10
14
15 import scipy.stats
16
17
18 def pval_to_phredqual(pval):
19 try:
20 ret = round(-10 * log10(pval))
21 except ValueError:
22 ret = 2147483647 # transform pval of 0.0 to max signed 32 bit int
23 return ret
24
25
26 def parseInfoField(info):
27 info_fields = info.split(";")
28 info_dict = OrderedDict()
29 for info_field in info_fields:
30 code, val = info_field.split("=")
31 info_dict[code] = val
32 return info_dict
33
34
35 def annotateVCF(in_vcf_filepath, out_vcf_filepath):
36 """Postprocess output of medaka tools annotate.
37
38 Splits multiallelic sites into separate records.
39 Replaces medaka INFO fields that might represent information of the ref
40 and multiple alternate alleles with simple ref, alt allele counterparts.
41 """
42
43 in_vcf = open(in_vcf_filepath, "r")
44 # medaka INFO fields that do not make sense after splitting of
45 # multi-allelic records
46 # DP will be overwritten with the value of DPSP because medaka tools
47 # annotate currently only calculates the latter correctly
48 # (https://github.com/nanoporetech/medaka/issues/192).
49 # DPS, which is as unreliable as DP, gets skipped and the code
50 # calculates the spanning reads equivalent DPSPS instead.
51 to_skip = {"SC", "SR", "AR", "DP", "DPSP", "DPS"}
52 struct_meta_pat = re.compile("##(.+)=<ID=([^,]+)(,.+)?>")
53 header_lines = []
54 contig_ids = set()
55 contig_ids_simple = set()
56 # parse the metadata lines of the input VCF and drop:
57 # - duplicate lines
58 # - INFO lines declaring keys we are not going to write
59 # - redundant contig information
60 while True:
61 line = in_vcf.readline()
62 if line[:2] != "##":
63 assert line.startswith("#CHROM")
64 break
65 if line in header_lines:
66 # the annotate tool may generate lines already written by
67 # medaka variant again (example: medaka version line)
68 continue
69 match = struct_meta_pat.match(line)
70 if match:
71 match_type, match_id, match_misc = match.groups()
72 if match_type == "INFO":
73 if match_id == "DPSP":
74 line = line.replace("DPSP", "DP")
75 elif match_id in to_skip:
76 continue
77 elif match_type == "contig":
78 contig_ids.add(match_id)
79 if not match_misc:
80 # the annotate tools writes its own contig info,
81 # which is redundant with contig info generated by
82 # medaka variant, but lacks a length value.
83 # We don't need the incomplete line.
84 contig_ids_simple.add(match_id)
85 continue
86 header_lines.append(line)
87 # Lets check the above assumption about each ID-only contig line
88 # having a more complete counterpart.
89 assert not (contig_ids_simple - contig_ids)
90 header_lines.insert(1, "##convert_VCF_info_fields=0.2\n")
91 header_lines += [
92 '##INFO=<ID=DPSPS,Number=2,Type=Integer,Description="Depth of spanning reads by strand">\n',
93 '##INFO=<ID=AF,Number=1,Type=Float,Description="Spanning Reads Allele Frequency">\n',
94 '##INFO=<ID=FAF,Number=1,Type=Float,Description="Forward Spanning Reads Allele Frequency">\n',
95 '##INFO=<ID=RAF,Number=1,Type=Float,Description="Reverse Spanning Reads Allele Frequency">\n',
96 '##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias of spanning reads at this position">\n',
97 '##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases in spanning reads">\n',
98 '##INFO=<ID=AS,Number=4,Type=Integer,Description="Total alignment score to ref and alt allele of spanning reads by strand (ref fwd, ref rev, alt fwd, alt rev) aligned with parasail match 5, mismatch -4, open 5, extend 3">\n',
99 line,
100 ]
101
102 with open(out_vcf_filepath, "w") as out_vcf:
103 out_vcf.writelines(header_lines)
104 for line in in_vcf:
105 fields = line.split("\t")
106 info_dict = parseInfoField(fields[7])
107 sr_list = [int(x) for x in info_dict["SR"].split(",")]
108 sc_list = [int(x) for x in info_dict["SC"].split(",")]
109 if len(sr_list) != len(sc_list):
110 print("WARNING - SR and SC are different lengths, " "skipping variant")
111 print(line.strip()) # Print the line for debugging purposes
112 continue
113 variant_list = fields[4].split(",")
114 dpsp = int(info_dict["DPSP"])
115 ref_fwd, ref_rev = 0, 1
116 dpspf, dpspr = (int(x) for x in info_dict["AR"].split(","))
117 for i in range(0, len(sr_list), 2):
118 dpspf += sr_list[i]
119 dpspr += sr_list[i + 1]
120 for j, i in enumerate(range(2, len(sr_list), 2)):
121 dp4 = (sr_list[ref_fwd], sr_list[ref_rev], sr_list[i], sr_list[i + 1])
122 dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]]
123 _, p_val = scipy.stats.fisher_exact(dp2x2)
124 sb = pval_to_phredqual(p_val)
125
126 as_ = (sc_list[ref_fwd], sc_list[ref_rev], sc_list[i], sc_list[i + 1])
127
128 info = []
129 for code in info_dict:
130 if code in to_skip:
131 continue
132 val = info_dict[code]
133 info.append("%s=%s" % (code, val))
134
135 info.append("DP=%d" % dpsp)
136 info.append("DPSPS=%d,%d" % (dpspf, dpspr))
137
138 if dpsp == 0:
139 info.append("AF=NaN")
140 else:
141 af = (dp4[2] + dp4[3]) / dpsp
142 info.append("AF=%.6f" % af)
143 if dpspf == 0:
144 info.append("FAF=NaN")
145 else:
146 faf = dp4[2] / dpspf
147 info.append("FAF=%.6f" % faf)
148 if dpspr == 0:
149 info.append("RAF=NaN")
150 else:
151 raf = dp4[3] / dpspr
152 info.append("RAF=%.6f" % raf)
153 info.append("SB=%d" % sb)
154 info.append("DP4=%d,%d,%d,%d" % dp4)
155 info.append("AS=%d,%d,%d,%d" % as_)
156 new_info = ";".join(info)
157 fields[4] = variant_list[j]
158 fields[7] = new_info
159 out_vcf.write("\t".join(fields))
160 in_vcf.close()
161
162
163 if __name__ == "__main__":
164 annotateVCF(sys.argv[1], sys.argv[2])