annotate convert_lineage_defs.py @ 1:99494998688a draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 6a6a37f2574954dae65f9ec407fe38594ed37659
author iuc
date Sun, 25 Feb 2024 09:49:20 +0000
parents 6ddf5a9ce4a5
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
1 # Try to convert constellations files into the format expected by lineagespot.
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
2 # Constellations files can define parent lineages, in which case the script
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
3 # parses parent mutations recursively and adds them to the signature of the
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
4 # child.
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
5
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
6 # CURRENT AND GENERAL LIMITATIONS
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
7 # Important to understand, please read carefully
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
8 # 1. Constellations sometimes uses base instead of amino acid positions for
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
9 # defining mutations. These can take two forms like in these examples:
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
10 # "nuc:C8986T", i.e. a SNV given in base coordinates
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
11 # "del:22029:6", i.e. a deletion of 6 bases given in base coordinates
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
12 # The current version of the script makes no attempt to convert such lines to
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
13 # amino acid coordinates, but simply drops them.
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
14 # 2. In other cases, constellations lists deletions in amino acid positions like
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
15 # this:
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
16 # "s:HV69-"
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
17 # While this notation could be parsed, such lines are currently *also* dropped
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
18 # because it's not entirely clear how lineagespot describes deletions.
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
19 # 3. In some cases, constellation also provides mutations in mature peptide
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
20 # coordinates, like "nsp15:K259R". Lines like this are currently dropped, too.
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
21 # 4. The constellations data provided by
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
22 # https://github.com/cov-lineages/constellations
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
23 # lists mostly lineage-defining mutations that can be used to *distinguish*
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
24 # between lineages, but makes no attempt to provide complete lists of mutations
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
25 # (even through parent lineage definitions) for any lineage.
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
26
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
27 import argparse
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
28 import json
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
29 import os
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
30 import re
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
31 import sys
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
32
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
33
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
# Maps gene names as they may be spelled in constellations files (keys, all
# lowercase) to the canonical gene names stored in the parsed definitions.
genes_names_translation = {
    "orf1a": "ORF1a",
    "orf1ab": "ORF1ab",
    "1ab": "ORF1ab",
    "orf1b": "ORF1b",
    "s": "S",
    "spike": "S",
    "orf3a": "ORF3a",
    "e": "E",
    "m": "M",
    "n": "N",
    "orf6": "ORF6",
    "orf7a": "ORF7a",
    "orf7b": "ORF7b",
    "orf8": "ORF8",
    "8": "ORF8",
    # NOTE: in constellations, mutations are sometimes, but not always, given
    # in nsp coordinates instead of ORF1a/b ones. Currently, we drop these,
    # while we should convert instead!!!
    "nsp2": "NSP2",
    "nsp3": "NSP3",
    "nsp4": "NSP4",
    "nsp5": "NSP5",
    "nsp6": "NSP6",
    "nsp7": "NSP7",
    "nsp8": "NSP8",
    "nsp9": "NSP9",
    "nsp10": "NSP10",
    "nsp12": "NSP12",
    "nsp13": "NSP13",
    "nsp14": "NSP14",
    "nsp15": "NSP15",
    "nsp16": "NSP16",
}


# One entry per gene name, each initialised to None.
# NOTE(review): presumably the set of genes lineagespot expects - confirm
# against the code that consumes this template.
lineagespot_template = dict.fromkeys(["ORF1a", "ORF1b", "S", "ORF3a", "M", "ORF6", "ORF7a", "ORF7b", "ORF8", "E", "N"])
# Filled by read_lineage_variants():
# lineage name -> {gene: {position: (ref, alt)}}
definitions = {}

# Matches "<gene>:<ref><pos><alt>", e.g. "spike:N501Y"; "*" is accepted in alt.
pat = re.compile(r'(?P<gene>.+):(?P<ref>[A-Z]+)(?P<pos>\d+)(?P<alt>[A-Z*]+)')
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
75
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
76
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
def read_lineage_variants(x, lineage_name):
    """Parse one constellations definition and register its mutation signature.

    The mutations listed under ``"sites"`` are parsed with the module-level
    ``pat`` pattern and grouped per gene. If the definition names a
    ``"parent_lineage"``, that parent is parsed first (recursively, from
    ``c<parent>.json`` in the same directory as *x*) unless it is already
    known, and its mutations are merged into the child's signature. The
    result is stored in the module-level ``definitions`` dict under
    *lineage_name* as ``{gene: {position: (ref, alt)}}``.

    Args:
        x: Open text file object holding the JSON definition. Its ``name``
            attribute is used to locate parent definition files.
        lineage_name: Key under which the parsed signature is stored.

    Returns:
        None. The result is written to ``definitions``.

    Raises:
        FileNotFoundError: If a parent lineage is declared but its definition
            file does not exist next to *x*.
        KeyError: If the JSON document lacks the "sites" or "variant" key.
    """
    data = json.load(x)

    sites = {}
    for mut in data["sites"]:
        match = pat.match(mut)
        if match is None:
            # Not of the form "<gene>:<ref><pos><alt>", e.g. "del:22029:6" or
            # "s:HV69-". Such sites are skipped. Note that "nuc:C8986T" does
            # match the pattern and ends up under the gene name "nuc".
            continue
        raw_gene = match.group('gene')
        # The translation table is keyed by lowercase names, so look the gene
        # up case-insensitively ("Spike", "ORF1AB", ... get normalized too).
        # Names without a translation are passed through unchanged.
        gene = genes_names_translation.get(raw_gene.lower(), raw_gene)
        pos = int(match.group('pos'))
        if gene == 'ORF1ab':
            # constellations isn't very consistent in representing ORF1ab
            # mutations. They may be provided in ORF1a or ORF1b coordinates,
            # but could also just be given as ORF1ab.
            if pos <= 4401:
                gene = 'ORF1a'
            else:
                gene = 'ORF1b'
                # 4715 == 314 in constellations
                pos = pos - 4401
        sites.setdefault(gene, {})[pos] = (match.group('ref'), match.group('alt'))

    # recursively parse parent lineages and
    # add their mutations to the global definitions
    if "parent_lineage" in data["variant"]:
        x_parent = data["variant"]["parent_lineage"]
        if x_parent not in definitions:
            parent_filename = f"c{x_parent}.json"
            lineage_def_dir = os.path.dirname(x.name)
            parent_file = os.path.join(lineage_def_dir, parent_filename)
            if not os.path.isfile(parent_file):
                raise FileNotFoundError(
                    f"{x_parent} is defined as a parent of {lineage_name}, but "
                    f"definitions file {parent_filename} not found in "
                    f"{lineage_def_dir}."
                )
            with open(parent_file) as parent_in:
                read_lineage_variants(parent_in, x_parent)

        # Merge the parent's mutations into this lineage's signature. Each
        # lineage gets its own per-gene dict (never the parent's object), so
        # entries of `definitions` do not share mutable state.
        for gene, muts in definitions[x_parent].items():
            own_muts = sites.setdefault(gene, {})
            for pos, ref_alt in muts.items():
                # Exotic case of a parent site being affected in the child
                # lineage again: keep the child site unaltered.
                own_muts.setdefault(pos, ref_alt)
    # done with this lineage and all of its parents
    definitions[lineage_name] = sites
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
137
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
138
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
# Command-line interface: an input folder and an output folder, both
# mandatory.
parser = argparse.ArgumentParser()
parser.add_argument(
    "-i", "--input",
    required=True,
    help="Name of the input folder",
)
parser.add_argument(
    "-o", "--output",
    required=True,
    help="Name of the output folder",
)

# When the script is started without any argument at all, exit with a short
# hint instead of letting argparse complain about the missing options.
if not sys.argv[1:]:
    sys.exit('Please run with -h / --help for help.')

args = parser.parse_args()
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
152
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
# Walk over everything in the input folder and parse each definition file
# that has not been handled yet.
for definitions_file in os.listdir(args.input):
    # In constellations, the only reliable way to get the lineage name is
    # from the file name: strip the .json suffix and drop the leading 'c'
    # (e.g. cBA.5.json holds the definition for lineage BA.5).
    # Files that do not follow this naming scheme are skipped.
    if not (
        definitions_file.startswith('c')
        and definitions_file.endswith('.json')
    ):
        continue
    lineage_name_from_file = definitions_file[1:-5]
    # A lineage that is already present in definitions has been parsed
    # before as the parent of another lineage; only parse unseen ones.
    if lineage_name_from_file not in definitions:
        with open(os.path.join(args.input, definitions_file)) as data_in:
            read_lineage_variants(data_in, lineage_name_from_file)
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
165
6ddf5a9ce4a5 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/lineagespot commit 0bc6ed15054577af1089d55ef9aa1071d122eb6b
iuc
parents:
diff changeset
# Make sure the output folder exists before any per-lineage file is written;
# otherwise the first open() below fails with FileNotFoundError.
# An empty path means "current directory" for os.path.join, and
# os.makedirs('') would raise, so only create non-empty paths.
if args.output:
    os.makedirs(args.output, exist_ok=True)

# Write one tab-separated <lineage>.txt file per parsed lineage.
for lineage, sites in definitions.items():
    with open(os.path.join(args.output, lineage) + '.txt', "w") as data_out:
        # header line
        data_out.write('gene\tamino acid\n')
        for gene, muts in sites.items():
            # genes that are not listed in lineagespot_template are
            # left out of the output
            if gene in lineagespot_template:
                for pos, ref_alt in muts.items():
                    # one line per mutation: gene, then ref_alt[0], the
                    # position and ref_alt[1] joined without separators
                    data_out.write(f'{gene}\t{ref_alt[0]}{pos}{ref_alt[1]}\n')