comparison scripts/ReMatCh/modules/checkMLST.py @ 3:0cbed1c0a762 draft default tip

planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author cstrittmatter
date Tue, 28 Jan 2020 10:42:31 -0500
parents 965517909457
children
comparison
equal deleted inserted replaced
2:6837f733b4aa 3:0cbed1c0a762
1 import sys 1 import sys
2 import os 2 import os
3 import urllib2 3 import urllib.request
4 import urllib
5 import csv 4 import csv
6 from glob import glob 5 from glob import glob
7 import re 6 import re
8 import functools 7 import functools
9 try: 8 try:
10 import xml.etree.cElementTree as ET 9 import xml.etree.cElementTree as ET
11 except ImportError: 10 except ImportError:
12 import xml.etree.ElementTree as ET 11 import xml.etree.ElementTree as ET
13 import utils 12 from . import utils
14 import rematch_module 13 from . import rematch_module
15 14
16 15
17 def determine_species(species): 16 def determine_species(species):
18 species = species.lower().split(' ') 17 species = species.lower().split(' ')
19 18
54 reference = reference[0] 53 reference = reference[0]
55 return reference 54 return reference
56 55
57 56
58 def write_mlst_reference(species, mlst_sequences, outdir, time_str): 57 def write_mlst_reference(species, mlst_sequences, outdir, time_str):
59 print 'Writing MLST alleles as reference_sequences' + '\n' 58 print('Writing MLST alleles as reference_sequences' + '\n')
60 reference_file = os.path.join(outdir, str(species.replace('/', '_').replace(' ', '_') + '.' + time_str + '.fasta')) 59 reference_file = os.path.join(outdir, str(species.replace('/', '_').replace(' ', '_') + '.' + time_str + '.fasta'))
61 with open(reference_file, 'wt') as writer: 60 with open(reference_file, 'wt') as writer:
62 for header, sequence in mlst_sequences.items(): 61 for header, sequence in list(mlst_sequences.items()):
63 writer.write('>' + header + '\n') 62 writer.write('>' + header + '\n')
64 fasta_sequence_lines = rematch_module.chunkstring(sequence, 80) 63 fasta_sequence_lines = rematch_module.chunkstring(sequence, 80)
65 for line in fasta_sequence_lines: 64 for line in fasta_sequence_lines:
66 writer.write(line + '\n') 65 writer.write(line + '\n')
67 return reference_file 66 return reference_file
68 67
69 68
70 def getST(mlst_dicts, dict_sequences): 69 def get_st(mlst_dicts, dict_sequences):
71 SequenceDict = mlst_dicts[0] 70 SequenceDict = mlst_dicts[0]
72 STdict = mlst_dicts[1] 71 STdict = mlst_dicts[1]
73 lociOrder = mlst_dicts[2] 72 lociOrder = mlst_dicts[2]
74 73
75 alleles_profile = ['-'] * len(lociOrder) 74 alleles_profile = ['-'] * len(lociOrder)
76 for x, sequence_data in dict_sequences.items(): 75 for x, sequence_data in list(dict_sequences.items()):
77 if sequence_data['header'] not in SequenceDict: 76 if sequence_data['header'] not in SequenceDict:
78 print sequence_data['header'] + ' not found between consensus sequences!' 77 print(sequence_data['header'] + ' not found between consensus sequences!')
79 break 78 break
80 if sequence_data['sequence'] in SequenceDict[sequence_data['header']].keys(): 79 if sequence_data['sequence'] in list(SequenceDict[sequence_data['header']].keys()):
81 allele_number = SequenceDict[sequence_data['header']][sequence_data['sequence']] 80 allele_number = SequenceDict[sequence_data['header']][sequence_data['sequence']]
82 alleles_profile[lociOrder.index(sequence_data['header'])] = allele_number 81 alleles_profile[lociOrder.index(sequence_data['header'])] = allele_number
83 else: 82 else:
84 for sequence_st, allele_number in SequenceDict[sequence_data['header']].items(): 83 for sequence_st, allele_number in list(SequenceDict[sequence_data['header']].items()):
85 if sequence_st in sequence_data['sequence']: 84 if sequence_st in sequence_data['sequence']:
86 alleles_profile[lociOrder.index(sequence_data['header'])] = allele_number 85 alleles_profile[lociOrder.index(sequence_data['header'])] = allele_number
87 86
88 alleles_profile = ','.join(alleles_profile) 87 alleles_profile = ','.join(alleles_profile)
89 st = '-' 88 st = '-'
95 94
96 downloadPubMLST = functools.partial(utils.timer, name='Download PubMLST module') 95 downloadPubMLST = functools.partial(utils.timer, name='Download PubMLST module')
97 96
98 97
99 @downloadPubMLST 98 @downloadPubMLST
100 def downloadPubMLSTxml(originalSpecies, schema_number, outdir): 99 def download_pub_mlst_xml(originalSpecies, schema_number, outdir):
101 print 'Searching MLST database for ' + originalSpecies 100 print('Searching MLST database for ' + originalSpecies)
102 101
103 xmlURL = 'http://pubmlst.org/data/dbases.xml' 102 xmlURL = 'http://pubmlst.org/data/dbases.xml'
104 try: 103 try:
105 content = urllib2.urlopen(xmlURL) 104 content = urllib.request.urlopen(xmlURL)
106 xml = content.read() 105 xml = content.read()
107 tree = ET.fromstring(xml) 106 tree = ET.fromstring(xml)
108 except: 107 except:
109 print "Ooops! There might be a problem with the PubMLST service, try later or check if the xml is well formated at " + xmlURL 108 print("Ooops! There might be a problem with the PubMLST service, try later or check if the xml is well formated"
109 " at " + xmlURL)
110 raise 110 raise
111 111
112 xmlData = {} 112 xmlData = {}
113 113
114 if schema_number is None: 114 if schema_number is None:
115 schema_number = 1 115 schema_number = 1
116 116
117 success = 0 117 success = 0
118 for scheme in tree.findall('species'): 118 for scheme in tree.findall('species'):
119 species_scheme = scheme.text.splitlines()[0].rsplit('#', 1) 119 species_scheme = scheme.text.rstrip('\r\n').rsplit('#', 1)
120 number_scheme = species_scheme[1] if len(species_scheme) == 2 else 1 120 number_scheme = species_scheme[1] if len(species_scheme) == 2 else 1
121 species_scheme = species_scheme[0] 121 species_scheme = species_scheme[0]
122 if determine_species(species_scheme) == determine_species(originalSpecies): 122 if determine_species(species_scheme) == determine_species(originalSpecies):
123 if schema_number == number_scheme: 123 if schema_number == number_scheme:
124 success += 1 124 success += 1
140 loci[locusID.strip()] = locusUrl 140 loci[locusID.strip()] = locusUrl
141 xmlData[scheme.text.strip()][retrieved].append(loci) 141 xmlData[scheme.text.strip()][retrieved].append(loci)
142 if success == 0: 142 if success == 0:
143 sys.exit("\tError. No schema found for %s. Please refer to https://pubmlst.org/databases/" % (originalSpecies)) 143 sys.exit("\tError. No schema found for %s. Please refer to https://pubmlst.org/databases/" % (originalSpecies))
144 elif success > 1: 144 elif success > 1:
145 keys = xmlData.keys() 145 keys = list(xmlData.keys())
146 keys = sorted(keys) 146 keys = sorted(keys)
147 print "\tWarning. More than one schema found for %s. only keeping the first one... %s" % (originalSpecies, keys[0]) 147 print("\tWarning. More than one schema found for %s. only keeping the first"
148 " one... %s" % (originalSpecies, keys[0]))
148 for key in keys[1:]: 149 for key in keys[1:]:
149 del xmlData[key] 150 del xmlData[key]
150 151
151 pubmlst_dir = os.path.join(outdir, 'pubmlst', '') 152 pubmlst_dir = os.path.join(outdir, 'pubmlst', '')
152 if not os.path.isdir(pubmlst_dir): 153 if not os.path.isdir(pubmlst_dir):
153 os.makedirs(pubmlst_dir) 154 os.makedirs(pubmlst_dir)
154 155
155 for SchemaName, info in xmlData.items(): 156 for SchemaName, info in list(xmlData.items()):
156 STdict = {} 157 STdict = {}
157 SequenceDict = {} 158 SequenceDict = {}
158 mlst_sequences = {} 159 mlst_sequences = {}
159 160
160 species_name = '_'.join(determine_species(SchemaName)).replace('/', '_') 161 species_name = '_'.join(determine_species(SchemaName)).replace('/', '_')
161 162
162 for RetrievalDate, URL in info.items(): 163 for RetrievalDate, URL in list(info.items()):
163 schema_date = species_name + '_' + RetrievalDate 164 schema_date = species_name + '_' + RetrievalDate
164 outDit = os.path.join(pubmlst_dir, schema_date) # compatible with windows? See if it already exists, if so, break 165 outDit = os.path.join(pubmlst_dir, schema_date) # compatible with windows? See if it already exists, if so, break
165 166
166 if os.path.isdir(outDit): 167 if os.path.isdir(outDit):
167 pickle = os.path.join(outDit, str(schema_date + '.pkl')) 168 pickle = os.path.join(outDit, str(schema_date + '.pkl'))
168 if os.path.isfile(pickle): 169 if os.path.isfile(pickle):
169 print "\tschema files already exist for %s" % (SchemaName) 170 print("\tschema files already exist for %s" % (SchemaName))
170 mlst_dicts = utils.extractVariableFromPickle(pickle) 171 mlst_dicts = utils.extract_variable_from_pickle(pickle)
171 SequenceDict = mlst_dicts[0] 172 SequenceDict = mlst_dicts[0]
172 for lociName, alleleSequences in SequenceDict.items(): 173 for lociName, alleleSequences in list(SequenceDict.items()):
173 for sequence in alleleSequences: 174 for sequence in alleleSequences:
174 if lociName not in mlst_sequences.keys(): 175 if lociName not in list(mlst_sequences.keys()):
175 mlst_sequences[lociName] = sequence 176 mlst_sequences[lociName] = sequence
176 else: 177 else:
177 break 178 break
178 return mlst_dicts, mlst_sequences 179 return mlst_dicts, mlst_sequences
179 180
180 elif any(species_name in x for x in os.listdir(pubmlst_dir)): 181 elif any(species_name in x for x in os.listdir(pubmlst_dir)):
181 print "Older version of %s's scheme found! Deleting..." % (SchemaName) 182 print("Older version of %s's scheme found! Deleting..." % (SchemaName))
182 for directory in glob(str(pubmlst_dir + str(species_name + '_*'))): 183 for directory in glob(str(pubmlst_dir + str(species_name + '_*'))):
183 utils.removeDirectory(directory) 184 utils.remove_directory(directory)
184 os.makedirs(outDit) 185 os.makedirs(outDit)
185 else: 186 else:
186 os.makedirs(outDit) 187 os.makedirs(outDit)
187 188
188 contentProfile = urllib2.urlopen(URL[0]) 189 contentProfile = urllib.request.urlopen(URL[0])
189 profileFile = csv.reader(contentProfile, delimiter='\t') 190 header = next(contentProfile).decode("utf8").strip().split('\t') # skip header
190 header = profileFile.next() # skip header
191 try: 191 try:
192 indexCC = header.index('clonal_complex') if 'clonal_complex' in header else header.index('CC') 192 indexCC = header.index('clonal_complex') if 'clonal_complex' in header else header.index('CC')
193 except: 193 except:
194 indexCC = len(header) 194 indexCC = len(header)
195 lociOrder = header[1:indexCC] 195 lociOrder = header[1:indexCC]
196 for row in profileFile: 196 for row in contentProfile:
197 row = row.decode("utf8").strip().split('\t')
197 ST = row[0] 198 ST = row[0]
198 alleles = ','.join(row[1:indexCC]) 199 alleles = ','.join(row[1:indexCC])
199 STdict[alleles] = ST 200 STdict[alleles] = ST
200 for lociName, lociURL in URL[1].items(): 201 for lociName, lociURL in list(URL[1].items()):
201 if lociName not in SequenceDict.keys(): 202 if lociName not in list(SequenceDict.keys()):
202 SequenceDict[lociName] = {} 203 SequenceDict[lociName] = {}
203 url_file = os.path.join(outDit, lociURL.rsplit('/', 1)[1]) 204 url_file = os.path.join(outDit, lociURL.rsplit('/', 1)[1])
204 urllib.urlretrieve(lociURL, url_file) 205 urllib.request.urlretrieve(lociURL, url_file)
205 sequences, ignore, ignore = rematch_module.get_sequence_information(url_file, 0) 206 sequences, ignore, ignore = rematch_module.get_sequence_information(url_file, 0)
206 for key in sequences.keys(): 207 for key in list(sequences.keys()):
207 header = re.sub("\D", "", sequences[key]['header']) 208 header = re.sub("\D", "", sequences[key]['header'])
208 sequence = sequences[key]['sequence'].upper() 209 sequence = sequences[key]['sequence'].upper()
209 SequenceDict[lociName][sequence] = header 210 SequenceDict[lociName][sequence] = header
210 if lociName not in mlst_sequences.keys(): 211 if lociName not in list(mlst_sequences.keys()):
211 mlst_sequences[lociName] = sequence 212 mlst_sequences[lociName] = sequence
212 os.remove(url_file) 213 os.remove(url_file)
213 mlst_dicts = [SequenceDict, STdict, lociOrder] 214 mlst_dicts = [SequenceDict, STdict, lociOrder]
214 utils.saveVariableToPickle(mlst_dicts, outDit, schema_date) 215 utils.save_variable_to_pickle(mlst_dicts, outDit, schema_date)
215 return mlst_dicts, mlst_sequences 216 return mlst_dicts, mlst_sequences