Mercurial > repos > iuc > medaka_variant_pipeline
annotate convert_VCF_info_fields.py @ 12:597407d61386 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
author | iuc |
---|---|
date | Fri, 17 Sep 2021 20:22:27 +0000 |
parents | 7623e5888be9 |
children | 3fbefde449bc |
rev | line source |
---|---|
6
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
1 #!/usr/bin/env python3 |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
2 |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
3 # Takes in VCF file annotated with medaka tools annotate and converts |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
4 # |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
5 # Usage statement: |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
6 # python convert_VCF_info_fields.py in_vcf.vcf out_vcf.vcf |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
7 |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
8 # 10/21/2020 - Nathan P. Roach, natproach@gmail.com |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
9 |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
10 import re |
6
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
11 import sys |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
12 from collections import OrderedDict |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
13 from math import log10 |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
14 |
10
7623e5888be9
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 0faf0ade3f13d7c78d93869823ea9fdf25c21b13"
iuc
parents:
8
diff
changeset
|
15 import scipy.stats |
6
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
16 |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
17 |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
18 def pval_to_phredqual(pval): |
7
08e0d74aac22
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit ed5a3aadbecc0decf9a797447f3ac7700683ea9a"
iuc
parents:
6
diff
changeset
|
19 try: |
08e0d74aac22
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit ed5a3aadbecc0decf9a797447f3ac7700683ea9a"
iuc
parents:
6
diff
changeset
|
20 ret = round(-10 * log10(pval)) |
08e0d74aac22
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit ed5a3aadbecc0decf9a797447f3ac7700683ea9a"
iuc
parents:
6
diff
changeset
|
21 except ValueError: |
08e0d74aac22
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit ed5a3aadbecc0decf9a797447f3ac7700683ea9a"
iuc
parents:
6
diff
changeset
|
22 ret = 2147483647 # transform pval of 0.0 to max signed 32 bit int |
08e0d74aac22
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit ed5a3aadbecc0decf9a797447f3ac7700683ea9a"
iuc
parents:
6
diff
changeset
|
23 return ret |
6
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
24 |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
25 |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
26 def parseInfoField(info): |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
27 info_fields = info.split(';') |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
28 info_dict = OrderedDict() |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
29 for info_field in info_fields: |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
30 code, val = info_field.split('=') |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
31 info_dict[code] = val |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
32 return info_dict |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
33 |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
34 |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
35 def annotateVCF(in_vcf_filepath, out_vcf_filepath): |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
36 """Postprocess output of medaka tools annotate. |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
37 |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
38 Splits multiallelic sites into separate records. |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
39 Replaces medaka INFO fields that might represent information of the ref |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
40 and multiple alternate alleles with simple ref, alt allele counterparts. |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
41 """ |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
42 |
6
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
43 in_vcf = open(in_vcf_filepath, 'r') |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
44 # medaka INFO fields that do not make sense after splitting of |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
45 # multi-allelic records |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
46 # DP will be overwritten with the value of DPSP because medaka tools |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
47 # annotate currently only calculates the latter correctly |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
48 # (https://github.com/nanoporetech/medaka/issues/192). |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
49 # DPS, which is as unreliable as DP, gets skipped and the code |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
50 # calculates the spanning reads equivalent DPSPS instead. |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
51 to_skip = {'SC', 'SR', 'AR', 'DP', 'DPSP', 'DPS'} |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
52 struct_meta_pat = re.compile('##(.+)=<ID=([^,]+)(,.+)?>') |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
53 header_lines = [] |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
54 contig_ids = set() |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
55 contig_ids_simple = set() |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
56 # parse the metadata lines of the input VCF and drop: |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
57 # - duplicate lines |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
58 # - INFO lines declaring keys we are not going to write |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
59 # - redundant contig information |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
60 while True: |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
61 line = in_vcf.readline() |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
62 if line[:2] != '##': |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
63 assert line.startswith('#CHROM') |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
64 break |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
65 if line in header_lines: |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
66 # the annotate tool may generate lines already written by |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
67 # medaka variant again (example: medaka version line) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
68 continue |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
69 match = struct_meta_pat.match(line) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
70 if match: |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
71 match_type, match_id, match_misc = match.groups() |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
72 if match_type == 'INFO': |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
73 if match_id == 'DPSP': |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
74 line = line.replace('DPSP', 'DP') |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
75 elif match_id in to_skip: |
6
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
76 continue |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
77 elif match_type == 'contig': |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
78 contig_ids.add(match_id) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
79 if not match_misc: |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
80 # the annotate tools writes its own contig info, |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
81 # which is redundant with contig info generated by |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
82 # medaka variant, but lacks a length value. |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
83 # We don't need the incomplete line. |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
84 contig_ids_simple.add(match_id) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
85 continue |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
86 header_lines.append(line) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
87 # Lets check the above assumption about each ID-only contig line |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
88 # having a more complete counterpart. |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
89 assert not (contig_ids_simple - contig_ids) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
90 header_lines.insert(1, '##convert_VCF_info_fields=0.2\n') |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
91 header_lines += [ |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
92 '##INFO=<ID=DPSPS,Number=2,Type=Integer,Description="Depth of spanning reads by strand">\n', |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
93 '##INFO=<ID=AF,Number=1,Type=Float,Description="Spanning Reads Allele Frequency">\n', |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
94 '##INFO=<ID=FAF,Number=1,Type=Float,Description="Forward Spanning Reads Allele Frequency">\n', |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
95 '##INFO=<ID=RAF,Number=1,Type=Float,Description="Reverse Spanning Reads Allele Frequency">\n', |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
96 '##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias of spanning reads at this position">\n', |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
97 '##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases in spanning reads">\n', |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
98 '##INFO=<ID=AS,Number=4,Type=Integer,Description="Total alignment score to ref and alt allele of spanning reads by strand (ref fwd, ref rev, alt fwd, alt rev) aligned with parasail match 5, mismatch -4, open 5, extend 3">\n', |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
99 line |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
100 ] |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
101 |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
102 with open(out_vcf_filepath, 'w') as out_vcf: |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
103 out_vcf.writelines(header_lines) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
104 for line in in_vcf: |
6
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
105 fields = line.split('\t') |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
106 info_dict = parseInfoField(fields[7]) |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
107 sr_list = [int(x) for x in info_dict["SR"].split(',')] |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
108 sc_list = [int(x) for x in info_dict["SC"].split(',')] |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
109 if len(sr_list) != len(sc_list): |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
110 print( |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
111 'WARNING - SR and SC are different lengths, ' |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
112 'skipping variant' |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
113 ) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
114 print(line.strip()) # Print the line for debugging purposes |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
115 continue |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
116 variant_list = fields[4].split(',') |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
117 dpsp = int(info_dict['DPSP']) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
118 ref_fwd, ref_rev = 0, 1 |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
119 dpspf, dpspr = (int(x) for x in info_dict['AR'].split(',')) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
120 for i in range(0, len(sr_list), 2): |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
121 dpspf += sr_list[i] |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
122 dpspr += sr_list[i + 1] |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
123 for j, i in enumerate(range(2, len(sr_list), 2)): |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
124 dp4 = ( |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
125 sr_list[ref_fwd], |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
126 sr_list[ref_rev], |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
127 sr_list[i], |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
128 sr_list[i + 1] |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
129 ) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
130 dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]] |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
131 _, p_val = scipy.stats.fisher_exact(dp2x2) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
132 sb = pval_to_phredqual(p_val) |
6
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
133 |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
134 as_ = ( |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
135 sc_list[ref_fwd], |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
136 sc_list[ref_rev], |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
137 sc_list[i], |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
138 sc_list[i + 1] |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
139 ) |
6
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
140 |
12
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
141 info = [] |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
142 for code in info_dict: |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
143 if code in to_skip: |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
144 continue |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
145 val = info_dict[code] |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
146 info.append("%s=%s" % (code, val)) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
147 |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
148 info.append('DP=%d' % dpsp) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
149 info.append('DPSPS=%d,%d' % (dpspf, dpspr)) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
150 |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
151 if dpsp == 0: |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
152 info.append('AF=NaN') |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
153 else: |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
154 af = (dp4[2] + dp4[3]) / dpsp |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
155 info.append('AF=%.6f' % af) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
156 if dpspf == 0: |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
157 info.append('FAF=NaN') |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
158 else: |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
159 faf = dp4[2] / dpspf |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
160 info.append('FAF=%.6f' % faf) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
161 if dpspr == 0: |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
162 info.append('RAF=NaN') |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
163 else: |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
164 raf = dp4[3] / dpspr |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
165 info.append('RAF=%.6f' % raf) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
166 info.append('SB=%d' % sb) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
167 info.append('DP4=%d,%d,%d,%d' % dp4) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
168 info.append('AS=%d,%d,%d,%d' % as_) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
169 new_info = ';'.join(info) |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
170 fields[4] = variant_list[j] |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
171 fields[7] = new_info |
597407d61386
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents:
10
diff
changeset
|
172 out_vcf.write('\t'.join(fields)) |
6
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
173 in_vcf.close() |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
174 |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
175 |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
176 if __name__ == "__main__": |
ea1833858055
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff
changeset
|
177 annotateVCF(sys.argv[1], sys.argv[2]) |