Mercurial > repos > cstrittmatter > test_eurl_vtec_wgs_pt
annotate scripts/ReMatCh/modules/checkMLST.py @ 3:0cbed1c0a762 draft default tip
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author | cstrittmatter |
---|---|
date | Tue, 28 Jan 2020 10:42:31 -0500 |
parents | 965517909457 |
children |
rev | line source |
---|---|
0
965517909457
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
1 import sys |
965517909457
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
2 import os |
3
0cbed1c0a762
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
0
diff
changeset
|
3 import urllib.request |
0
965517909457
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
4 import csv |
965517909457
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
5 from glob import glob |
965517909457
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
6 import re |
965517909457
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
7 import functools |
965517909457
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
8 try: |
965517909457
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
9 import xml.etree.cElementTree as ET |
965517909457
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
10 except ImportError: |
965517909457
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
11 import xml.etree.ElementTree as ET |
3
0cbed1c0a762
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
0
diff
changeset
|
12 from . import utils |
0cbed1c0a762
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
0
diff
changeset
|
13 from . import rematch_module |
0
965517909457
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
14 |
965517909457
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
15 |
965517909457
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
def determine_species(species):
    """Normalise a species name into a short list of lowercase tokens.

    The name is lowercased and split on whitespace, and at most the first two
    tokens (typically genus and species epithet) are kept.  When the second
    token is a generic qualifier ('spp', 'spp.' or 'complex') only the first
    token is returned.

    Parameters
    ----------
    species : str
        Free-text species name, e.g. ``'Escherichia coli'``.

    Returns
    -------
    list of str
        Zero, one or two lowercase tokens.
    """
    # split() with no argument collapses runs of whitespace and ignores
    # leading/trailing blanks, so 'Escherichia  coli ' does not produce empty
    # tokens the way split(' ') does.
    tokens = species.lower().split()[:2]

    if len(tokens) == 2 and tokens[1] in ('spp', 'spp.', 'complex'):
        return tokens[:1]

    return tokens
965517909457
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
25 |
965517909457
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
26 |
965517909457
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
def check_existing_schema(species, schema_number, script_path):
    """Look for a bundled MLST schema FASTA file matching a species.

    Files are searched in ``<dirname(script_path)>/modules/mlst_schemas``.  A
    file matches when its name starts with the normalised species tokens
    joined by ``'_'`` (followed by ``'#<schema_number>'`` when a schema number
    is given) and ends with ``'.fasta'``.

    Parameters
    ----------
    species : str
        Species name; normalised with ``determine_species``.
    schema_number : int or str or None
        Schema number to look for.  ``None`` means "not specified"; if several
        files match in that case, the one whose stem ends with ``'#1'`` is
        preferred.
    script_path : str
        Path of the main script; its directory is the search root.

    Returns
    -------
    str or None
        Path to the selected FASTA file, or ``None`` when nothing matches.
    """
    species = determine_species(species)
    suffix = '' if schema_number is None else '#' + str(schema_number)
    prefix = '_'.join(species) + suffix

    mlst_schemas_folder = os.path.join(os.path.dirname(script_path), 'modules', 'mlst_schemas', '')

    # Collect every candidate.  The previous code assigned the path string to
    # the accumulator instead of appending to it, so the disambiguation below
    # iterated over the characters of a single path and never worked.
    # Sorting makes the outcome independent of os.listdir() ordering.
    candidates = sorted(
        os.path.join(mlst_schemas_folder, file_found)
        for file_found in os.listdir(mlst_schemas_folder)
        if not file_found.startswith('.')
        and file_found.startswith(prefix)
        and file_found.endswith('.fasta')
        and os.path.isfile(os.path.join(mlst_schemas_folder, file_found))
    )

    if not candidates:
        return None

    if len(candidates) > 1:
        # Several schemas share the prefix: pick the one whose file stem ends
        # with the requested number, defaulting to schema #1.
        wanted = suffix if suffix != '' else '#1'
        for scheme in candidates:
            if os.path.splitext(scheme)[0].endswith(wanted):
                return scheme

    # Single match, or no candidate carries the wanted suffix: fall back to
    # the first one so callers always get one path rather than a list.
    return candidates[0]
965517909457
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
55 |
965517909457
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
56 |
965517909457
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
def write_mlst_reference(species, mlst_sequences, outdir, time_str):
    """Dump MLST sequences to a FASTA file and return the file path.

    The file is created in *outdir* and named
    ``<species>.<time_str>.fasta``, where '/' and ' ' in *species* are
    replaced by '_'.  Each item of *mlst_sequences* (header -> sequence)
    becomes one FASTA record whose sequence is split with
    ``rematch_module.chunkstring(sequence, 80)``, one piece per line.
    """
    print('Writing MLST alleles as reference_sequences' + '\n')

    safe_species = species.replace('/', '_').replace(' ', '_')
    fasta_name = str(safe_species + '.' + time_str + '.fasta')
    reference_file = os.path.join(outdir, fasta_name)

    with open(reference_file, 'wt') as fasta_handle:
        for record_name in list(mlst_sequences):
            fasta_handle.write('>' + record_name + '\n')
            for piece in rematch_module.chunkstring(mlst_sequences[record_name], 80):
                fasta_handle.write(piece + '\n')

    return reference_file
965517909457
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
67 |
965517909457
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
68 |
3
0cbed1c0a762
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
0
diff
changeset
|
def get_st(mlst_dicts, dict_sequences):
    """Determine the allelic profile and sequence type of a sample.

    Parameters
    ----------
    mlst_dicts : sequence
        ``[SequenceDict, STdict, lociOrder]`` where ``SequenceDict`` maps
        locus -> {allele sequence: allele number}, ``STdict`` maps a
        comma-joined allelic profile to its ST, and ``lociOrder`` is the list
        of loci giving the position of each locus in the profile.
    dict_sequences : dict
        Values are dicts with at least the keys ``'header'`` (locus name) and
        ``'sequence'`` (sequence obtained for that locus).

    Returns
    -------
    tuple of (str, str)
        ``(st, alleles_profile)``.  Loci without an identified allele are
        reported as ``'-'`` in the profile; ``st`` is ``'-'`` when the profile
        is not present in ``STdict``.
    """
    allele_lookup = mlst_dicts[0]
    st_by_profile = mlst_dicts[1]
    loci_order = mlst_dicts[2]

    alleles_profile = ['-'] * len(loci_order)
    for sequence_data in dict_sequences.values():
        locus = sequence_data['header']
        if locus not in allele_lookup:
            print(locus + ' not found between consensus sequences!')
            # Processing stops at the first unknown locus (existing behaviour).
            break

        locus_alleles = allele_lookup[locus]
        sequence = sequence_data['sequence']

        # Direct dict membership: avoids materialising and linearly scanning
        # a list of every allele sequence for each locus.
        if sequence in locus_alleles:
            alleles_profile[loci_order.index(locus)] = locus_alleles[sequence]
        else:
            # No exact match: accept any known allele contained in the
            # sequence.  If several alleles are contained, the last one in
            # dict order wins.
            for known_sequence, allele_number in locus_alleles.items():
                if known_sequence in sequence:
                    alleles_profile[loci_order.index(locus)] = allele_number

    alleles_profile = ','.join(alleles_profile)
    st = st_by_profile.get(alleles_profile, '-') if alleles_profile in st_by_profile else '-'

    return st, alleles_profile
965517909457
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
93 |
965517909457
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
94 |
965517909457
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
cstrittmatter
parents:
diff
changeset
|
# Decorator applying utils.timer with a fixed name for this module's download step.
downloadPubMLST = functools.partial(utils.timer, name='Download PubMLST module')


def _allele_number_from_header(fasta_header, locus_name):
    """Return the digits of an allele FASTA header as a string.

    When the header starts with the locus name, that prefix is dropped first
    so digits that belong to the locus name itself (e.g. 'AAT1a_5') are not
    folded into the allele number.  If nothing is left after dropping the
    prefix, the digits of the whole header are used.
    """
    if locus_name and fasta_header.startswith(locus_name):
        digits = re.sub(r"\D", "", fasta_header[len(locus_name):])
        if digits:
            return digits
    return re.sub(r"\D", "", fasta_header)


@downloadPubMLST
def download_pub_mlst_xml(originalSpecies, schema_number, outdir):
    """Fetch (or load from a local cache) the PubMLST scheme of a species.

    The PubMLST database index (XML) is downloaded and searched for a
    ``<species>`` entry whose normalised name (see ``determine_species``)
    equals that of *originalSpecies* and whose ``#<number>`` suffix equals
    *schema_number*.  Profiles and allele FASTA files of the selected scheme
    are downloaded into ``<outdir>/pubmlst/<species>_<retrieved date>/`` and
    the parsed result is stored there via ``utils.save_variable_to_pickle``.
    If that pickle already exists it is loaded instead of downloading again.
    Directories of the same species with another retrieval date are removed.

    Parameters
    ----------
    originalSpecies : str
        Species name as given by the user.
    schema_number : int or str or None
        Scheme number; ``None`` selects scheme 1.  Entries without a ``#``
        suffix count as scheme 1.
    outdir : str
        Directory under which the ``pubmlst`` cache folder is created.

    Returns
    -------
    tuple
        ``(mlst_dicts, mlst_sequences)`` where ``mlst_dicts`` is
        ``[SequenceDict, STdict, lociOrder]`` and ``mlst_sequences`` maps each
        locus to one of its allele sequences.

    Raises
    ------
    SystemExit
        When no matching scheme is found or nothing could be retrieved for it.
    Exception
        Any error raised while fetching or parsing the index is re-raised
        after printing a hint.
    """
    print('Searching MLST database for ' + originalSpecies)

    xmlURL = 'http://pubmlst.org/data/dbases.xml'
    try:
        # Context manager guarantees the HTTP response is closed.
        with urllib.request.urlopen(xmlURL) as content:
            tree = ET.fromstring(content.read())
    except Exception:
        print("Ooops! There might be a problem with the PubMLST service, try later or check if the xml is well formated"
              " at " + xmlURL)
        raise

    xmlData = {}

    if schema_number is None:
        schema_number = 1
    # Compare scheme numbers as strings: the number parsed from the XML text
    # is a str while the requested one is usually an int, so a direct
    # comparison could never match a 'Species#N' entry.
    wanted_number = str(schema_number).strip()
    wanted_species = determine_species(originalSpecies)

    success = 0
    for scheme in tree.findall('species'):
        species_scheme = scheme.text.rstrip('\r\n').rsplit('#', 1)
        number_scheme = species_scheme[1].strip() if len(species_scheme) == 2 else '1'
        if determine_species(species_scheme[0]) != wanted_species:
            continue
        if number_scheme != wanted_number:
            continue

        success += 1
        scheme_name = scheme.text.strip()
        scheme_entry = xmlData[scheme_name] = {}
        for info in scheme:  # mlst
            for database in info:  # database
                retrieved = None
                for retrievedDate in database.findall('retrieved'):
                    retrieved = retrievedDate.text
                    scheme_entry[retrieved] = []
                if retrieved is None:
                    # Without a retrieval date there is no key to file the
                    # URLs under; skip instead of reusing a stale/unbound name.
                    continue
                for profile in database.findall('profiles'):
                    scheme_entry[retrieved].append(profile.find('url').text)
                for lociScheme in database.findall('loci'):
                    loci = {}
                    for locus in lociScheme:
                        locusID = locus.text
                        for locusInfo in locus:
                            loci[locusID.strip()] = locusInfo.text
                    scheme_entry[retrieved].append(loci)

    if success == 0:
        sys.exit("\tError. No schema found for %s. Please refer to https://pubmlst.org/databases/" % (originalSpecies))
    elif success > 1:
        keys = sorted(xmlData)
        print("\tWarning. More than one schema found for %s. only keeping the first"
              " one... %s" % (originalSpecies, keys[0]))
        for key in keys[1:]:
            del xmlData[key]

    pubmlst_dir = os.path.join(outdir, 'pubmlst', '')
    if not os.path.isdir(pubmlst_dir):
        os.makedirs(pubmlst_dir)

    mlst_dicts = None
    mlst_sequences = {}

    for SchemaName, info in xmlData.items():
        STdict = {}
        SequenceDict = {}
        mlst_sequences = {}

        species_name = '_'.join(determine_species(SchemaName)).replace('/', '_')

        for RetrievalDate, URL in info.items():
            # URL is [profiles url, {locus name: alleles url}] as built above.
            schema_date = species_name + '_' + RetrievalDate
            outDit = os.path.join(pubmlst_dir, schema_date)

            if os.path.isdir(outDit):
                pickle_file = os.path.join(outDit, schema_date + '.pkl')
                if os.path.isfile(pickle_file):
                    print("\tschema files already exist for %s" % (SchemaName))
                    # NOTE(review): the cache is a pickle read from outdir;
                    # only use output directories whose content is trusted.
                    mlst_dicts = utils.extract_variable_from_pickle(pickle_file)
                    SequenceDict = mlst_dicts[0]
                    # Keep the first stored allele of each locus.
                    for lociName, alleleSequences in SequenceDict.items():
                        for sequence in alleleSequences:
                            mlst_sequences.setdefault(lociName, sequence)
                            break
                    return mlst_dicts, mlst_sequences
            elif any(species_name in x for x in os.listdir(pubmlst_dir)):
                print("Older version of %s's scheme found! Deleting..." % (SchemaName))
                for directory in glob(pubmlst_dir + species_name + '_*'):
                    utils.remove_directory(directory)
                os.makedirs(outDit)
            else:
                os.makedirs(outDit)

            with urllib.request.urlopen(URL[0]) as contentProfile:
                header = next(contentProfile).decode("utf8").strip().split('\t')
                # Loci columns sit between the ST column and the clonal
                # complex column; if the latter is absent all remaining
                # columns are treated as loci.
                if 'clonal_complex' in header:
                    indexCC = header.index('clonal_complex')
                elif 'CC' in header:
                    indexCC = header.index('CC')
                else:
                    indexCC = len(header)
                lociOrder = header[1:indexCC]
                for raw_row in contentProfile:
                    line = raw_row.decode("utf8").strip()
                    if not line:
                        continue  # ignore blank lines in the profiles table
                    row = line.split('\t')
                    STdict[','.join(row[1:indexCC])] = row[0]

            for lociName, lociURL in URL[1].items():
                SequenceDict.setdefault(lociName, {})
                url_file = os.path.join(outDit, lociURL.rsplit('/', 1)[1])
                urllib.request.urlretrieve(lociURL, url_file)
                try:
                    sequences, _, _ = rematch_module.get_sequence_information(url_file, 0)
                    for key in sequences:
                        allele_number = _allele_number_from_header(sequences[key]['header'], lociName)
                        sequence = sequences[key]['sequence'].upper()
                        SequenceDict[lociName][sequence] = allele_number
                        mlst_sequences.setdefault(lociName, sequence)
                finally:
                    # The downloaded FASTA is only a temporary file; remove it
                    # even when parsing fails.
                    os.remove(url_file)

            mlst_dicts = [SequenceDict, STdict, lociOrder]
            utils.save_variable_to_pickle(mlst_dicts, outDit, schema_date)

    if mlst_dicts is None:
        # The scheme entry carried no database information to download.
        sys.exit("\tError. No schema data could be retrieved for %s." % (originalSpecies))
    return mlst_dicts, mlst_sequences