Mercurial > repos > cstrittmatter > test_eurl_vtec_wgs_pt
File: scripts/ReMatCh/modules/checkMLST.py — revision 0:965517909457 (draft)
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author: cstrittmatter
date: Wed, 22 Jan 2020 08:41:44 -0500
parents: (none; initial revision -1:000000000000 -> 0:965517909457)
children: 0cbed1c0a762
1 import sys | |
2 import os | |
3 import urllib2 | |
4 import urllib | |
5 import csv | |
6 from glob import glob | |
7 import re | |
8 import functools | |
9 try: | |
10 import xml.etree.cElementTree as ET | |
11 except ImportError: | |
12 import xml.etree.ElementTree as ET | |
13 import utils | |
14 import rematch_module | |
15 | |
16 | |
def determine_species(species):
    """Normalise a species name into a lowercase [genus, epithet] token list.

    Keeps at most the first two whitespace-separated words; when the second
    word is a generic qualifier ('spp', 'spp.' or 'complex') only the genus
    is kept.
    """
    tokens = species.lower().split(' ')
    if len(tokens) < 2:
        return tokens
    genus, epithet = tokens[0], tokens[1]
    return [genus] if epithet in ('spp', 'spp.', 'complex') else [genus, epithet]
26 | |
27 | |
def check_existing_schema(species, schema_number, script_path):
    """Look for an already-downloaded MLST schema fasta for *species*.

    Parameters
    ----------
    species : str
        Species name, e.g. 'Escherichia coli'.
    schema_number : int, str or None
        PubMLST scheme number; None means "any" (scheme '#1' is preferred
        when several schemas are present).
    script_path : str
        Path of the main script; schemas are expected next to it under
        modules/mlst_schemas/.

    Returns
    -------
    str or None
        Path of the matching '.fasta' schema file, or None when none exists.
    """
    species = determine_species(species)

    if schema_number is None:
        schema_number = ''
    else:
        schema_number = '#' + str(schema_number)

    mlst_schemas_folder = os.path.join(os.path.dirname(script_path), 'modules', 'mlst_schemas', '')
    references = []
    files = [f for f in os.listdir(mlst_schemas_folder)
             if not f.startswith('.') and os.path.isfile(os.path.join(mlst_schemas_folder, f))]
    for file_found in files:
        file_path = os.path.join(mlst_schemas_folder, file_found)
        if file_found.startswith('_'.join(species) + schema_number) and file_found.endswith('.fasta'):
            # BUG FIX: the original did `reference = file_path`, replacing the
            # list with a string, which broke the multi-match handling below
            # (len() counted characters and the loop iterated characters).
            references.append(file_path)

    if len(references) > 1:
        # Several schemas match and no explicit number was given: prefer '#1'.
        if schema_number == '':
            schema_number = '#1'
        for scheme in references:
            if os.path.splitext(scheme)[0].endswith(schema_number):
                references = [scheme]
                break
    if len(references) == 0:
        return None
    return references[0]
56 | |
57 | |
def write_mlst_reference(species, mlst_sequences, outdir, time_str):
    """Write the MLST allele sequences as a timestamped fasta reference file.

    Parameters
    ----------
    species : str
        Species name; '/' and spaces are replaced by '_' in the file name.
    mlst_sequences : dict
        Maps fasta header (locus name) -> nucleotide sequence.
    outdir : str
        Output directory.
    time_str : str
        Timestamp embedded in the file name.

    Returns
    -------
    str
        Path of the fasta file written.
    """
    # print() call form is valid on both Python 2 and 3 (the original print
    # statement is a syntax error under Python 3).
    print('Writing MLST alleles as reference_sequences' + '\n')
    reference_file = os.path.join(outdir, str(species.replace('/', '_').replace(' ', '_') + '.' + time_str + '.fasta'))
    with open(reference_file, 'wt') as writer:
        for header, sequence in mlst_sequences.items():
            writer.write('>' + header + '\n')
            # Wrap sequences at 80 characters per fasta convention.
            fasta_sequence_lines = rematch_module.chunkstring(sequence, 80)
            for line in fasta_sequence_lines:
                writer.write(line + '\n')
    return reference_file
68 | |
69 | |
def getST(mlst_dicts, dict_sequences):
    """Determine the sequence type (ST) for a set of consensus sequences.

    Parameters
    ----------
    mlst_dicts : list
        [SequenceDict, STdict, lociOrder] where SequenceDict maps
        locus -> {allele_sequence: allele_number}, STdict maps a
        comma-joined allele profile -> ST, and lociOrder fixes the column
        order of loci in the profile.
    dict_sequences : dict
        Maps an arbitrary key -> {'header': locus_name, 'sequence': str}.

    Returns
    -------
    tuple (str, str)
        (ST, alleles_profile); '-' marks an unknown ST or allele.
    """
    SequenceDict, STdict, lociOrder = mlst_dicts

    alleles_profile = ['-'] * len(lociOrder)
    for sequence_data in dict_sequences.values():
        locus = sequence_data['header']
        sequence = sequence_data['sequence']
        if locus not in SequenceDict:
            # print() form is valid on both Python 2 and 3.
            print(locus + ' not found between consensus sequences!')
            # NOTE(review): aborts matching of the remaining loci as the
            # original did — possibly intended to be `continue`; confirm.
            break
        if sequence in SequenceDict[locus]:
            allele_number = SequenceDict[locus][sequence]
            alleles_profile[lociOrder.index(locus)] = allele_number
        else:
            # Exact allele not found: fall back to a substring search (the
            # consensus sequence may include flanking bases).
            for sequence_st, allele_number in SequenceDict[locus].items():
                if sequence_st in sequence:
                    alleles_profile[lociOrder.index(locus)] = allele_number

    alleles_profile = ','.join(alleles_profile)
    # Profiles not present in the table yield ST '-'.
    st = STdict.get(alleles_profile, '-')
    return st, alleles_profile
94 | |
95 | |
# Timing decorator: utils.timer wraps the decorated function and reports the
# elapsed time under the label 'Download PubMLST module'.
downloadPubMLST = functools.partial(utils.timer, name='Download PubMLST module')
97 | |
98 | |
@downloadPubMLST
def downloadPubMLSTxml(originalSpecies, schema_number, outdir):
    """Download and cache the PubMLST scheme (profiles + alleles) for a species.

    Fetches pubmlst.org's dbases.xml index, finds the scheme matching
    *originalSpecies* (and *schema_number*, defaulting to 1), downloads the
    ST profile table and every locus fasta, and builds:
      SequenceDict: locus -> {allele_sequence: allele_number}
      STdict:       comma-joined allele profile -> ST
      lociOrder:    column order of loci in the profile table
    The dicts are pickled under <outdir>/pubmlst/<species>_<date>/ and reused
    on later runs with the same retrieval date (older cached dates are removed).

    Returns
    -------
    tuple (list, dict)
        ([SequenceDict, STdict, lociOrder], mlst_sequences) where
        mlst_sequences maps each locus to one representative allele sequence.

    Raises
    ------
    Re-raises any failure while fetching/parsing dbases.xml; exits via
    sys.exit when no matching scheme is found.
    """
    print('Searching MLST database for ' + originalSpecies)

    xmlURL = 'http://pubmlst.org/data/dbases.xml'
    try:
        content = urllib2.urlopen(xmlURL)
        xml = content.read()
        tree = ET.fromstring(xml)
    except Exception:
        print("Ooops! There might be a problem with the PubMLST service, try later or check if the xml is well formated at " + xmlURL)
        raise

    xmlData = {}

    if schema_number is None:
        schema_number = 1
    # BUG FIX: the scheme number parsed from the XML ('Species#2') is a
    # string while schema_number may be an int, so the former equality test
    # could never match alternative schemes. Compare both as strings.
    schema_number = str(schema_number)

    success = 0
    for scheme in tree.findall('species'):
        species_scheme = scheme.text.splitlines()[0].rsplit('#', 1)
        number_scheme = species_scheme[1] if len(species_scheme) == 2 else '1'
        species_scheme = species_scheme[0]
        if determine_species(species_scheme) == determine_species(originalSpecies):
            if schema_number == number_scheme:
                success += 1
                xmlData[scheme.text.strip()] = {}
                for info in scheme:  # <mlst>
                    for database in info:  # <database>
                        for retrievedDate in database.findall('retrieved'):
                            retrieved = retrievedDate.text
                            xmlData[scheme.text.strip()][retrieved] = []
                        for profile in database.findall('profiles'):
                            profileURl = profile.find('url').text
                            xmlData[scheme.text.strip()][retrieved].append(profileURl)
                        for lociScheme in database.findall('loci'):
                            loci = {}
                            for locus in lociScheme:
                                locusID = locus.text
                                for locusInfo in locus:
                                    locusUrl = locusInfo.text
                                    loci[locusID.strip()] = locusUrl
                            xmlData[scheme.text.strip()][retrieved].append(loci)
    if success == 0:
        sys.exit("\tError. No schema found for %s. Please refer to https://pubmlst.org/databases/" % (originalSpecies))
    elif success > 1:
        keys = sorted(xmlData.keys())
        print("\tWarning. More than one schema found for %s. only keeping the first one... %s" % (originalSpecies, keys[0]))
        for key in keys[1:]:
            del xmlData[key]

    pubmlst_dir = os.path.join(outdir, 'pubmlst', '')
    if not os.path.isdir(pubmlst_dir):
        os.makedirs(pubmlst_dir)

    for SchemaName, info in xmlData.items():
        STdict = {}
        SequenceDict = {}
        mlst_sequences = {}

        species_name = '_'.join(determine_species(SchemaName)).replace('/', '_')

        for RetrievalDate, URL in info.items():
            schema_date = species_name + '_' + RetrievalDate
            outDit = os.path.join(pubmlst_dir, schema_date)

            if os.path.isdir(outDit):
                # Cache directory for this retrieval date already exists:
                # reuse the pickled dicts instead of re-downloading.
                pickle = os.path.join(outDit, str(schema_date + '.pkl'))
                if os.path.isfile(pickle):
                    print("\tschema files already exist for %s" % (SchemaName))
                    mlst_dicts = utils.extractVariableFromPickle(pickle)
                    SequenceDict = mlst_dicts[0]
                    for lociName, alleleSequences in SequenceDict.items():
                        for sequence in alleleSequences:
                            # Keep one representative sequence per locus.
                            if lociName not in mlst_sequences.keys():
                                mlst_sequences[lociName] = sequence
                            else:
                                break
                    return mlst_dicts, mlst_sequences
            elif any(species_name in x for x in os.listdir(pubmlst_dir)):
                # A cache with an older retrieval date exists: delete and refresh.
                print("Older version of %s's scheme found! Deleting..." % (SchemaName))
                for directory in glob(str(pubmlst_dir + str(species_name + '_*'))):
                    utils.removeDirectory(directory)
                os.makedirs(outDit)
            else:
                os.makedirs(outDit)

            # URL[0] is the ST-profile TSV; URL[1] maps locus -> fasta URL.
            contentProfile = urllib2.urlopen(URL[0])
            profileFile = csv.reader(contentProfile, delimiter='\t')
            # next() works on Python 2 and 3 (.next() is Python-2-only).
            header = next(profileFile)  # skip header
            try:
                # Allele columns stop at the clonal-complex column when present.
                indexCC = header.index('clonal_complex') if 'clonal_complex' in header else header.index('CC')
            except ValueError:
                indexCC = len(header)
            lociOrder = header[1:indexCC]
            for row in profileFile:
                ST = row[0]
                alleles = ','.join(row[1:indexCC])
                STdict[alleles] = ST
            for lociName, lociURL in URL[1].items():
                if lociName not in SequenceDict.keys():
                    SequenceDict[lociName] = {}
                url_file = os.path.join(outDit, lociURL.rsplit('/', 1)[1])
                urllib.urlretrieve(lociURL, url_file)
                sequences, ignore, ignore = rematch_module.get_sequence_information(url_file, 0)
                for key in sequences.keys():
                    # Allele number = the digits of the fasta header (e.g. adk_12 -> 12).
                    header = re.sub(r"\D", "", sequences[key]['header'])
                    sequence = sequences[key]['sequence'].upper()
                    SequenceDict[lociName][sequence] = header
                    if lociName not in mlst_sequences.keys():
                        mlst_sequences[lociName] = sequence
                os.remove(url_file)
            mlst_dicts = [SequenceDict, STdict, lociOrder]
            utils.saveVariableToPickle(mlst_dicts, outDit, schema_date)
    return mlst_dicts, mlst_sequences