comparison scripts/ReMatCh/modules/checkMLST.py @ 0:965517909457 draft

planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author cstrittmatter
date Wed, 22 Jan 2020 08:41:44 -0500
parents
children 0cbed1c0a762
comparison
equal deleted inserted replaced
-1:000000000000 0:965517909457
1 import sys
2 import os
3 import urllib2
4 import urllib
5 import csv
6 from glob import glob
7 import re
8 import functools
9 try:
10 import xml.etree.cElementTree as ET
11 except ImportError:
12 import xml.etree.ElementTree as ET
13 import utils
14 import rematch_module
15
16
17 def determine_species(species):
18 species = species.lower().split(' ')
19
20 if len(species) >= 2:
21 species = species[:2]
22 if species[1] in ('spp', 'spp.', 'complex'):
23 species = [species[0]]
24
25 return species
26
27
28 def check_existing_schema(species, schema_number, script_path):
29 species = determine_species(species)
30
31 if schema_number is None:
32 schema_number = ''
33 else:
34 schema_number = '#' + str(schema_number)
35
36 mlst_schemas_folder = os.path.join(os.path.dirname(script_path), 'modules', 'mlst_schemas', '')
37 reference = []
38 files = [f for f in os.listdir(mlst_schemas_folder) if not f.startswith('.') and os.path.isfile(os.path.join(mlst_schemas_folder, f))]
39 for file_found in files:
40 file_path = os.path.join(mlst_schemas_folder, file_found)
41 if file_found.startswith('_'.join(species) + schema_number) and file_found.endswith('.fasta'):
42 reference = file_path
43
44 if len(reference) > 1:
45 if schema_number == '':
46 schema_number = '#1'
47 for scheme in reference:
48 if os.path.splitext(scheme)[0].endswith(schema_number):
49 reference = [scheme]
50 break
51 if len(reference) == 0:
52 reference = None
53 elif len(reference) == 1:
54 reference = reference[0]
55 return reference
56
57
58 def write_mlst_reference(species, mlst_sequences, outdir, time_str):
59 print 'Writing MLST alleles as reference_sequences' + '\n'
60 reference_file = os.path.join(outdir, str(species.replace('/', '_').replace(' ', '_') + '.' + time_str + '.fasta'))
61 with open(reference_file, 'wt') as writer:
62 for header, sequence in mlst_sequences.items():
63 writer.write('>' + header + '\n')
64 fasta_sequence_lines = rematch_module.chunkstring(sequence, 80)
65 for line in fasta_sequence_lines:
66 writer.write(line + '\n')
67 return reference_file
68
69
70 def getST(mlst_dicts, dict_sequences):
71 SequenceDict = mlst_dicts[0]
72 STdict = mlst_dicts[1]
73 lociOrder = mlst_dicts[2]
74
75 alleles_profile = ['-'] * len(lociOrder)
76 for x, sequence_data in dict_sequences.items():
77 if sequence_data['header'] not in SequenceDict:
78 print sequence_data['header'] + ' not found between consensus sequences!'
79 break
80 if sequence_data['sequence'] in SequenceDict[sequence_data['header']].keys():
81 allele_number = SequenceDict[sequence_data['header']][sequence_data['sequence']]
82 alleles_profile[lociOrder.index(sequence_data['header'])] = allele_number
83 else:
84 for sequence_st, allele_number in SequenceDict[sequence_data['header']].items():
85 if sequence_st in sequence_data['sequence']:
86 alleles_profile[lociOrder.index(sequence_data['header'])] = allele_number
87
88 alleles_profile = ','.join(alleles_profile)
89 st = '-'
90 if alleles_profile in STdict:
91 st = STdict[alleles_profile]
92
93 return st, alleles_profile
94
95
96 downloadPubMLST = functools.partial(utils.timer, name='Download PubMLST module')
97
98
99 @downloadPubMLST
100 def downloadPubMLSTxml(originalSpecies, schema_number, outdir):
101 print 'Searching MLST database for ' + originalSpecies
102
103 xmlURL = 'http://pubmlst.org/data/dbases.xml'
104 try:
105 content = urllib2.urlopen(xmlURL)
106 xml = content.read()
107 tree = ET.fromstring(xml)
108 except:
109 print "Ooops! There might be a problem with the PubMLST service, try later or check if the xml is well formated at " + xmlURL
110 raise
111
112 xmlData = {}
113
114 if schema_number is None:
115 schema_number = 1
116
117 success = 0
118 for scheme in tree.findall('species'):
119 species_scheme = scheme.text.splitlines()[0].rsplit('#', 1)
120 number_scheme = species_scheme[1] if len(species_scheme) == 2 else 1
121 species_scheme = species_scheme[0]
122 if determine_species(species_scheme) == determine_species(originalSpecies):
123 if schema_number == number_scheme:
124 success += 1
125 xmlData[scheme.text.strip()] = {}
126 for info in scheme: # mlst
127 for database in info: # database
128 for retrievedDate in database.findall('retrieved'):
129 retrieved = retrievedDate.text
130 xmlData[scheme.text.strip()][retrieved] = []
131 for profile in database.findall('profiles'):
132 profileURl = profile.find('url').text
133 xmlData[scheme.text.strip()][retrieved].append(profileURl)
134 for lociScheme in database.findall('loci'):
135 loci = {}
136 for locus in lociScheme:
137 locusID = locus.text
138 for locusInfo in locus:
139 locusUrl = locusInfo.text
140 loci[locusID.strip()] = locusUrl
141 xmlData[scheme.text.strip()][retrieved].append(loci)
142 if success == 0:
143 sys.exit("\tError. No schema found for %s. Please refer to https://pubmlst.org/databases/" % (originalSpecies))
144 elif success > 1:
145 keys = xmlData.keys()
146 keys = sorted(keys)
147 print "\tWarning. More than one schema found for %s. only keeping the first one... %s" % (originalSpecies, keys[0])
148 for key in keys[1:]:
149 del xmlData[key]
150
151 pubmlst_dir = os.path.join(outdir, 'pubmlst', '')
152 if not os.path.isdir(pubmlst_dir):
153 os.makedirs(pubmlst_dir)
154
155 for SchemaName, info in xmlData.items():
156 STdict = {}
157 SequenceDict = {}
158 mlst_sequences = {}
159
160 species_name = '_'.join(determine_species(SchemaName)).replace('/', '_')
161
162 for RetrievalDate, URL in info.items():
163 schema_date = species_name + '_' + RetrievalDate
164 outDit = os.path.join(pubmlst_dir, schema_date) # compatible with windows? See if it already exists, if so, break
165
166 if os.path.isdir(outDit):
167 pickle = os.path.join(outDit, str(schema_date + '.pkl'))
168 if os.path.isfile(pickle):
169 print "\tschema files already exist for %s" % (SchemaName)
170 mlst_dicts = utils.extractVariableFromPickle(pickle)
171 SequenceDict = mlst_dicts[0]
172 for lociName, alleleSequences in SequenceDict.items():
173 for sequence in alleleSequences:
174 if lociName not in mlst_sequences.keys():
175 mlst_sequences[lociName] = sequence
176 else:
177 break
178 return mlst_dicts, mlst_sequences
179
180 elif any(species_name in x for x in os.listdir(pubmlst_dir)):
181 print "Older version of %s's scheme found! Deleting..." % (SchemaName)
182 for directory in glob(str(pubmlst_dir + str(species_name + '_*'))):
183 utils.removeDirectory(directory)
184 os.makedirs(outDit)
185 else:
186 os.makedirs(outDit)
187
188 contentProfile = urllib2.urlopen(URL[0])
189 profileFile = csv.reader(contentProfile, delimiter='\t')
190 header = profileFile.next() # skip header
191 try:
192 indexCC = header.index('clonal_complex') if 'clonal_complex' in header else header.index('CC')
193 except:
194 indexCC = len(header)
195 lociOrder = header[1:indexCC]
196 for row in profileFile:
197 ST = row[0]
198 alleles = ','.join(row[1:indexCC])
199 STdict[alleles] = ST
200 for lociName, lociURL in URL[1].items():
201 if lociName not in SequenceDict.keys():
202 SequenceDict[lociName] = {}
203 url_file = os.path.join(outDit, lociURL.rsplit('/', 1)[1])
204 urllib.urlretrieve(lociURL, url_file)
205 sequences, ignore, ignore = rematch_module.get_sequence_information(url_file, 0)
206 for key in sequences.keys():
207 header = re.sub("\D", "", sequences[key]['header'])
208 sequence = sequences[key]['sequence'].upper()
209 SequenceDict[lociName][sequence] = header
210 if lociName not in mlst_sequences.keys():
211 mlst_sequences[lociName] = sequence
212 os.remove(url_file)
213 mlst_dicts = [SequenceDict, STdict, lociOrder]
214 utils.saveVariableToPickle(mlst_dicts, outDit, schema_date)
215 return mlst_dicts, mlst_sequences