Mercurial > repos > iuc > medaka_variant
comparison convert_VCF_info_fields.py @ 12:0f5f4a208660 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
| author | iuc |
|---|---|
| date | Fri, 17 Sep 2021 20:22:49 +0000 |
| parents | 9fb055604648 |
| children | 222669c4afb6 |
comparison
equal
deleted
inserted
replaced
| 11:2bf63b38ee9b | 12:0f5f4a208660 |
|---|---|
| 5 # Usage statement: | 5 # Usage statement: |
| 6 # python convert_VCF_info_fields.py in_vcf.vcf out_vcf.vcf | 6 # python convert_VCF_info_fields.py in_vcf.vcf out_vcf.vcf |
| 7 | 7 |
| 8 # 10/21/2020 - Nathan P. Roach, natproach@gmail.com | 8 # 10/21/2020 - Nathan P. Roach, natproach@gmail.com |
| 9 | 9 |
| 10 import re | |
| 10 import sys | 11 import sys |
| 11 from collections import OrderedDict | 12 from collections import OrderedDict |
| 12 from math import log10 | 13 from math import log10 |
| 13 | 14 |
| 14 import scipy | |
| 15 import scipy.stats | 15 import scipy.stats |
| 16 | 16 |
| 17 | 17 |
| 18 def pval_to_phredqual(pval): | 18 def pval_to_phredqual(pval): |
| 19 try: | 19 try: |
| 31 info_dict[code] = val | 31 info_dict[code] = val |
| 32 return info_dict | 32 return info_dict |
| 33 | 33 |
| 34 | 34 |
| 35 def annotateVCF(in_vcf_filepath, out_vcf_filepath): | 35 def annotateVCF(in_vcf_filepath, out_vcf_filepath): |
| 36 """Postprocess output of medaka tools annotate. | |
| 37 | |
| 38 Splits multiallelic sites into separate records. | |
| 39 Replaces medaka INFO fields that might represent information of the ref | |
| 40 and multiple alternate alleles with simple ref, alt allele counterparts. | |
| 41 """ | |
| 42 | |
| 36 in_vcf = open(in_vcf_filepath, 'r') | 43 in_vcf = open(in_vcf_filepath, 'r') |
| 37 out_vcf = open(out_vcf_filepath, 'w') | 44 # medaka INFO fields that do not make sense after splitting of |
| 38 to_skip = set(['SC', 'SR']) | 45 # multi-allelic records |
| 39 for i, line in enumerate(in_vcf): | 46 # DP will be overwritten with the value of DPSP because medaka tools |
| 40 if i == 1: | 47 # annotate currently only calculates the latter correctly |
| 41 out_vcf.write("##convert_VCF_info_fields=0.2\n") | 48 # (https://github.com/nanoporetech/medaka/issues/192). |
| 42 if line[0:2] == "##": | 49 # DPS, which is as unreliable as DP, gets skipped and the code |
| 43 if line[0:11] == "##INFO=<ID=": | 50 # calculates the spanning reads equivalent DPSPS instead. |
| 44 id_ = line[11:].split(',')[0] | 51 to_skip = {'SC', 'SR', 'AR', 'DP', 'DPSP', 'DPS'} |
| 45 if id_ in to_skip: | 52 struct_meta_pat = re.compile('##(.+)=<ID=([^,]+)(,.+)?>') |
| 53 header_lines = [] | |
| 54 contig_ids = set() | |
| 55 contig_ids_simple = set() | |
| 56 # parse the metadata lines of the input VCF and drop: | |
| 57 # - duplicate lines | |
| 58 # - INFO lines declaring keys we are not going to write | |
| 59 # - redundant contig information | |
| 60 while True: | |
| 61 line = in_vcf.readline() | |
| 62 if line[:2] != '##': | |
| 63 assert line.startswith('#CHROM') | |
| 64 break | |
| 65 if line in header_lines: | |
| 66 # the annotate tool may generate lines already written by | |
| 67 # medaka variant again (example: medaka version line) | |
| 68 continue | |
| 69 match = struct_meta_pat.match(line) | |
| 70 if match: | |
| 71 match_type, match_id, match_misc = match.groups() | |
| 72 if match_type == 'INFO': | |
| 73 if match_id == 'DPSP': | |
| 74 line = line.replace('DPSP', 'DP') | |
| 75 elif match_id in to_skip: | |
| 46 continue | 76 continue |
| 47 out_vcf.write(line) | 77 elif match_type == 'contig': |
| 48 elif line[0] == "#": | 78 contig_ids.add(match_id) |
| 49 out_vcf.write('##INFO=<ID=DPSPS,Number=2,Type=Integer,Description="Spanning Reads Allele Frequency By Strand">\n') | 79 if not match_misc: |
| 50 out_vcf.write('##INFO=<ID=AF,Number=1,Type=Float,Description="Spanning Reads Allele Frequency">\n') | 80 # the annotate tools writes its own contig info, |
| 51 out_vcf.write('##INFO=<ID=FAF,Number=1,Type=Float,Description="Forward Spanning Reads Allele Frequency">\n') | 81 # which is redundant with contig info generated by |
| 52 out_vcf.write('##INFO=<ID=RAF,Number=1,Type=Float,Description="Reverse Spanning Reads Allele Frequency">\n') | 82 # medaka variant, but lacks a length value. |
| 53 out_vcf.write('##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias of spanning reads at this position">\n') | 83 # We don't need the incomplete line. |
| 54 out_vcf.write('##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases in spanning reads">\n') | 84 contig_ids_simple.add(match_id) |
| 55 out_vcf.write('##INFO=<ID=AS,Number=4,Type=Integer,Description="Total alignment score to ref and alt allele of spanning reads by strand (ref fwd, ref rev, alt fwd, alt rev) aligned with parasail match 5, mismatch -4, open 5, extend 3">\n') | 85 continue |
| 56 out_vcf.write(line) | 86 header_lines.append(line) |
| 57 else: | 87 # Lets check the above assumption about each ID-only contig line |
| 88 # having a more complete counterpart. | |
| 89 assert not (contig_ids_simple - contig_ids) | |
| 90 header_lines.insert(1, '##convert_VCF_info_fields=0.2\n') | |
| 91 header_lines += [ | |
| 92 '##INFO=<ID=DPSPS,Number=2,Type=Integer,Description="Depth of spanning reads by strand">\n', | |
| 93 '##INFO=<ID=AF,Number=1,Type=Float,Description="Spanning Reads Allele Frequency">\n', | |
| 94 '##INFO=<ID=FAF,Number=1,Type=Float,Description="Forward Spanning Reads Allele Frequency">\n', | |
| 95 '##INFO=<ID=RAF,Number=1,Type=Float,Description="Reverse Spanning Reads Allele Frequency">\n', | |
| 96 '##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias of spanning reads at this position">\n', | |
| 97 '##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases in spanning reads">\n', | |
| 98 '##INFO=<ID=AS,Number=4,Type=Integer,Description="Total alignment score to ref and alt allele of spanning reads by strand (ref fwd, ref rev, alt fwd, alt rev) aligned with parasail match 5, mismatch -4, open 5, extend 3">\n', | |
| 99 line | |
| 100 ] | |
| 101 | |
| 102 with open(out_vcf_filepath, 'w') as out_vcf: | |
| 103 out_vcf.writelines(header_lines) | |
| 104 for line in in_vcf: | |
| 58 fields = line.split('\t') | 105 fields = line.split('\t') |
| 59 info_dict = parseInfoField(fields[7]) | 106 info_dict = parseInfoField(fields[7]) |
| 60 sr_list = [int(x) for x in info_dict["SR"].split(',')] | 107 sr_list = [int(x) for x in info_dict["SR"].split(',')] |
| 61 sc_list = [int(x) for x in info_dict["SC"].split(',')] | 108 sc_list = [int(x) for x in info_dict["SC"].split(',')] |
| 62 if len(sr_list) == len(sc_list): | 109 if len(sr_list) != len(sc_list): |
| 63 variant_list = fields[4].split(',') | 110 print( |
| 64 dpsp = int(info_dict["DPSP"]) | 111 'WARNING - SR and SC are different lengths, ' |
| 65 ref_fwd, ref_rev = 0, 1 | 112 'skipping variant' |
| 66 dpspf, dpspr = (int(x) for x in info_dict["AR"].split(',')) | 113 ) |
| 67 for i in range(0, len(sr_list), 2): | 114 print(line.strip()) # Print the line for debugging purposes |
| 68 dpspf += sr_list[i] | 115 continue |
| 69 dpspr += sr_list[i + 1] | 116 variant_list = fields[4].split(',') |
| 70 for j, i in enumerate(range(2, len(sr_list), 2)): | 117 dpsp = int(info_dict['DPSP']) |
| 71 dp4 = (sr_list[ref_fwd], sr_list[ref_rev], sr_list[i], sr_list[i + 1]) | 118 ref_fwd, ref_rev = 0, 1 |
| 72 dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]] | 119 dpspf, dpspr = (int(x) for x in info_dict['AR'].split(',')) |
| 73 _, p_val = scipy.stats.fisher_exact(dp2x2) | 120 for i in range(0, len(sr_list), 2): |
| 74 sb = pval_to_phredqual(p_val) | 121 dpspf += sr_list[i] |
| 122 dpspr += sr_list[i + 1] | |
| 123 for j, i in enumerate(range(2, len(sr_list), 2)): | |
| 124 dp4 = ( | |
| 125 sr_list[ref_fwd], | |
| 126 sr_list[ref_rev], | |
| 127 sr_list[i], | |
| 128 sr_list[i + 1] | |
| 129 ) | |
| 130 dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]] | |
| 131 _, p_val = scipy.stats.fisher_exact(dp2x2) | |
| 132 sb = pval_to_phredqual(p_val) | |
| 75 | 133 |
| 76 as_ = (sc_list[ref_fwd], sc_list[ref_rev], sc_list[i], sc_list[i + 1]) | 134 as_ = ( |
| 135 sc_list[ref_fwd], | |
| 136 sc_list[ref_rev], | |
| 137 sc_list[i], | |
| 138 sc_list[i + 1] | |
| 139 ) | |
| 77 | 140 |
| 78 info = [] | 141 info = [] |
| 79 for code in info_dict: | 142 for code in info_dict: |
| 80 if code in to_skip: | 143 if code in to_skip: |
| 81 continue | 144 continue |
| 82 val = info_dict[code] | 145 val = info_dict[code] |
| 83 info.append("%s=%s" % (code, val)) | 146 info.append("%s=%s" % (code, val)) |
| 84 | 147 |
| 85 info.append("DPSPS=%d,%d" % (dpspf, dpspr)) | 148 info.append('DP=%d' % dpsp) |
| 149 info.append('DPSPS=%d,%d' % (dpspf, dpspr)) | |
| 86 | 150 |
| 87 if dpsp == 0: | 151 if dpsp == 0: |
| 88 info.append("AF=NaN") | 152 info.append('AF=NaN') |
| 89 else: | 153 else: |
| 90 af = (dp4[2] + dp4[3]) / dpsp | 154 af = (dp4[2] + dp4[3]) / dpsp |
| 91 info.append("AF=%.6f" % (af)) | 155 info.append('AF=%.6f' % af) |
| 92 if dpspf == 0: | 156 if dpspf == 0: |
| 93 info.append("FAF=NaN") | 157 info.append('FAF=NaN') |
| 94 else: | 158 else: |
| 95 faf = dp4[2] / dpspf | 159 faf = dp4[2] / dpspf |
| 96 info.append("FAF=%.6f" % (faf)) | 160 info.append('FAF=%.6f' % faf) |
| 97 if dpspr == 0: | 161 if dpspr == 0: |
| 98 info.append("RAF=NaN") | 162 info.append('RAF=NaN') |
| 99 else: | 163 else: |
| 100 raf = dp4[3] / dpspr | 164 raf = dp4[3] / dpspr |
| 101 info.append("RAF=%.6f" % (raf)) | 165 info.append('RAF=%.6f' % raf) |
| 102 info.append("SB=%d" % (sb)) | 166 info.append('SB=%d' % sb) |
| 103 info.append("DP4=%d,%d,%d,%d" % (dp4)) | 167 info.append('DP4=%d,%d,%d,%d' % dp4) |
| 104 info.append("AS=%d,%d,%d,%d" % (as_)) | 168 info.append('AS=%d,%d,%d,%d' % as_) |
| 105 new_info = ';'.join(info) | 169 new_info = ';'.join(info) |
| 106 fields[4] = variant_list[j] | 170 fields[4] = variant_list[j] |
| 107 fields[7] = new_info | 171 fields[7] = new_info |
| 108 out_vcf.write("%s" % ("\t".join(fields))) | 172 out_vcf.write('\t'.join(fields)) |
| 109 else: | |
| 110 print("WARNING - SR and SC are different lengths, skipping variant") | |
| 111 print(line.strip()) # Print the line for debugging purposes | |
| 112 in_vcf.close() | 173 in_vcf.close() |
| 113 out_vcf.close() | |
| 114 | 174 |
| 115 | 175 |
| 116 if __name__ == "__main__": | 176 if __name__ == "__main__": |
| 117 annotateVCF(sys.argv[1], sys.argv[2]) | 177 annotateVCF(sys.argv[1], sys.argv[2]) |
