Mercurial > repos > cstrittmatter > test_eurl_vtec_wgs_pt
comparison: scripts/ReMatCh/modules/checkMLST.py @ 3:0cbed1c0a762 (draft, default, tip)
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author: cstrittmatter
date: Tue, 28 Jan 2020 10:42:31 -0500
parents: 965517909457
children: (none)
comparison of changesets 2:6837f733b4aa and 3:0cbed1c0a762
(lines below are shown side by side: old revision | new revision; equal, deleted, inserted and replaced lines)
1 import sys | 1 import sys |
2 import os | 2 import os |
3 import urllib2 | 3 import urllib.request |
4 import urllib | |
5 import csv | 4 import csv |
6 from glob import glob | 5 from glob import glob |
7 import re | 6 import re |
8 import functools | 7 import functools |
9 try: | 8 try: |
10 import xml.etree.cElementTree as ET | 9 import xml.etree.cElementTree as ET |
11 except ImportError: | 10 except ImportError: |
12 import xml.etree.ElementTree as ET | 11 import xml.etree.ElementTree as ET |
13 import utils | 12 from . import utils |
14 import rematch_module | 13 from . import rematch_module |
15 | 14 |
16 | 15 |
17 def determine_species(species): | 16 def determine_species(species): |
18 species = species.lower().split(' ') | 17 species = species.lower().split(' ') |
19 | 18 |
54 reference = reference[0] | 53 reference = reference[0] |
55 return reference | 54 return reference |
56 | 55 |
57 | 56 |
def write_mlst_reference(species, mlst_sequences, outdir, time_str):
    """Write the MLST allele sequences as a multi-FASTA reference file.

    Parameters
    ----------
    species : str
        Species name; '/' and spaces are replaced by '_' in the file name.
    mlst_sequences : dict
        Maps a locus header (str) to its allele sequence (str).
    outdir : str
        Directory in which the FASTA file is created.
    time_str : str
        Timestamp string embedded in the output file name.

    Returns
    -------
    str
        Path of the FASTA file that was written.
    """
    print('Writing MLST alleles as reference_sequences' + '\n')
    # The filename expression already yields a str; the original redundant
    # str() wrapper and the list() around .items() were dropped.
    reference_file = os.path.join(
        outdir, species.replace('/', '_').replace(' ', '_') + '.' + time_str + '.fasta')
    with open(reference_file, 'wt') as writer:
        for header, sequence in mlst_sequences.items():
            writer.write('>' + header + '\n')
            # chunkstring splits the sequence into 80-character FASTA lines
            # (presumably; confirm against rematch_module.chunkstring)
            for line in rematch_module.chunkstring(sequence, 80):
                writer.write(line + '\n')
    return reference_file
68 | 67 |
69 | 68 |
70 def getST(mlst_dicts, dict_sequences): | 69 def get_st(mlst_dicts, dict_sequences): |
71 SequenceDict = mlst_dicts[0] | 70 SequenceDict = mlst_dicts[0] |
72 STdict = mlst_dicts[1] | 71 STdict = mlst_dicts[1] |
73 lociOrder = mlst_dicts[2] | 72 lociOrder = mlst_dicts[2] |
74 | 73 |
75 alleles_profile = ['-'] * len(lociOrder) | 74 alleles_profile = ['-'] * len(lociOrder) |
76 for x, sequence_data in dict_sequences.items(): | 75 for x, sequence_data in list(dict_sequences.items()): |
77 if sequence_data['header'] not in SequenceDict: | 76 if sequence_data['header'] not in SequenceDict: |
78 print sequence_data['header'] + ' not found between consensus sequences!' | 77 print(sequence_data['header'] + ' not found between consensus sequences!') |
79 break | 78 break |
80 if sequence_data['sequence'] in SequenceDict[sequence_data['header']].keys(): | 79 if sequence_data['sequence'] in list(SequenceDict[sequence_data['header']].keys()): |
81 allele_number = SequenceDict[sequence_data['header']][sequence_data['sequence']] | 80 allele_number = SequenceDict[sequence_data['header']][sequence_data['sequence']] |
82 alleles_profile[lociOrder.index(sequence_data['header'])] = allele_number | 81 alleles_profile[lociOrder.index(sequence_data['header'])] = allele_number |
83 else: | 82 else: |
84 for sequence_st, allele_number in SequenceDict[sequence_data['header']].items(): | 83 for sequence_st, allele_number in list(SequenceDict[sequence_data['header']].items()): |
85 if sequence_st in sequence_data['sequence']: | 84 if sequence_st in sequence_data['sequence']: |
86 alleles_profile[lociOrder.index(sequence_data['header'])] = allele_number | 85 alleles_profile[lociOrder.index(sequence_data['header'])] = allele_number |
87 | 86 |
88 alleles_profile = ','.join(alleles_profile) | 87 alleles_profile = ','.join(alleles_profile) |
89 st = '-' | 88 st = '-' |
95 | 94 |
# Decorator applied to download_pub_mlst_xml below: binds the step name into
# utils.timer, which presumably times the wrapped call and reports it under
# 'Download PubMLST module' — confirm against the utils module.
downloadPubMLST = functools.partial(utils.timer, name='Download PubMLST module')
98 | 97 |
99 @downloadPubMLST | 98 @downloadPubMLST |
100 def downloadPubMLSTxml(originalSpecies, schema_number, outdir): | 99 def download_pub_mlst_xml(originalSpecies, schema_number, outdir): |
101 print 'Searching MLST database for ' + originalSpecies | 100 print('Searching MLST database for ' + originalSpecies) |
102 | 101 |
103 xmlURL = 'http://pubmlst.org/data/dbases.xml' | 102 xmlURL = 'http://pubmlst.org/data/dbases.xml' |
104 try: | 103 try: |
105 content = urllib2.urlopen(xmlURL) | 104 content = urllib.request.urlopen(xmlURL) |
106 xml = content.read() | 105 xml = content.read() |
107 tree = ET.fromstring(xml) | 106 tree = ET.fromstring(xml) |
108 except: | 107 except: |
109 print "Ooops! There might be a problem with the PubMLST service, try later or check if the xml is well formated at " + xmlURL | 108 print("Ooops! There might be a problem with the PubMLST service, try later or check if the xml is well formated" |
109 " at " + xmlURL) | |
110 raise | 110 raise |
111 | 111 |
112 xmlData = {} | 112 xmlData = {} |
113 | 113 |
114 if schema_number is None: | 114 if schema_number is None: |
115 schema_number = 1 | 115 schema_number = 1 |
116 | 116 |
117 success = 0 | 117 success = 0 |
118 for scheme in tree.findall('species'): | 118 for scheme in tree.findall('species'): |
119 species_scheme = scheme.text.splitlines()[0].rsplit('#', 1) | 119 species_scheme = scheme.text.rstrip('\r\n').rsplit('#', 1) |
120 number_scheme = species_scheme[1] if len(species_scheme) == 2 else 1 | 120 number_scheme = species_scheme[1] if len(species_scheme) == 2 else 1 |
121 species_scheme = species_scheme[0] | 121 species_scheme = species_scheme[0] |
122 if determine_species(species_scheme) == determine_species(originalSpecies): | 122 if determine_species(species_scheme) == determine_species(originalSpecies): |
123 if schema_number == number_scheme: | 123 if schema_number == number_scheme: |
124 success += 1 | 124 success += 1 |
140 loci[locusID.strip()] = locusUrl | 140 loci[locusID.strip()] = locusUrl |
141 xmlData[scheme.text.strip()][retrieved].append(loci) | 141 xmlData[scheme.text.strip()][retrieved].append(loci) |
142 if success == 0: | 142 if success == 0: |
143 sys.exit("\tError. No schema found for %s. Please refer to https://pubmlst.org/databases/" % (originalSpecies)) | 143 sys.exit("\tError. No schema found for %s. Please refer to https://pubmlst.org/databases/" % (originalSpecies)) |
144 elif success > 1: | 144 elif success > 1: |
145 keys = xmlData.keys() | 145 keys = list(xmlData.keys()) |
146 keys = sorted(keys) | 146 keys = sorted(keys) |
147 print "\tWarning. More than one schema found for %s. only keeping the first one... %s" % (originalSpecies, keys[0]) | 147 print("\tWarning. More than one schema found for %s. only keeping the first" |
148 " one... %s" % (originalSpecies, keys[0])) | |
148 for key in keys[1:]: | 149 for key in keys[1:]: |
149 del xmlData[key] | 150 del xmlData[key] |
150 | 151 |
151 pubmlst_dir = os.path.join(outdir, 'pubmlst', '') | 152 pubmlst_dir = os.path.join(outdir, 'pubmlst', '') |
152 if not os.path.isdir(pubmlst_dir): | 153 if not os.path.isdir(pubmlst_dir): |
153 os.makedirs(pubmlst_dir) | 154 os.makedirs(pubmlst_dir) |
154 | 155 |
155 for SchemaName, info in xmlData.items(): | 156 for SchemaName, info in list(xmlData.items()): |
156 STdict = {} | 157 STdict = {} |
157 SequenceDict = {} | 158 SequenceDict = {} |
158 mlst_sequences = {} | 159 mlst_sequences = {} |
159 | 160 |
160 species_name = '_'.join(determine_species(SchemaName)).replace('/', '_') | 161 species_name = '_'.join(determine_species(SchemaName)).replace('/', '_') |
161 | 162 |
162 for RetrievalDate, URL in info.items(): | 163 for RetrievalDate, URL in list(info.items()): |
163 schema_date = species_name + '_' + RetrievalDate | 164 schema_date = species_name + '_' + RetrievalDate |
164 outDit = os.path.join(pubmlst_dir, schema_date) # compatible with windows? See if it already exists, if so, break | 165 outDit = os.path.join(pubmlst_dir, schema_date) # compatible with windows? See if it already exists, if so, break |
165 | 166 |
166 if os.path.isdir(outDit): | 167 if os.path.isdir(outDit): |
167 pickle = os.path.join(outDit, str(schema_date + '.pkl')) | 168 pickle = os.path.join(outDit, str(schema_date + '.pkl')) |
168 if os.path.isfile(pickle): | 169 if os.path.isfile(pickle): |
169 print "\tschema files already exist for %s" % (SchemaName) | 170 print("\tschema files already exist for %s" % (SchemaName)) |
170 mlst_dicts = utils.extractVariableFromPickle(pickle) | 171 mlst_dicts = utils.extract_variable_from_pickle(pickle) |
171 SequenceDict = mlst_dicts[0] | 172 SequenceDict = mlst_dicts[0] |
172 for lociName, alleleSequences in SequenceDict.items(): | 173 for lociName, alleleSequences in list(SequenceDict.items()): |
173 for sequence in alleleSequences: | 174 for sequence in alleleSequences: |
174 if lociName not in mlst_sequences.keys(): | 175 if lociName not in list(mlst_sequences.keys()): |
175 mlst_sequences[lociName] = sequence | 176 mlst_sequences[lociName] = sequence |
176 else: | 177 else: |
177 break | 178 break |
178 return mlst_dicts, mlst_sequences | 179 return mlst_dicts, mlst_sequences |
179 | 180 |
180 elif any(species_name in x for x in os.listdir(pubmlst_dir)): | 181 elif any(species_name in x for x in os.listdir(pubmlst_dir)): |
181 print "Older version of %s's scheme found! Deleting..." % (SchemaName) | 182 print("Older version of %s's scheme found! Deleting..." % (SchemaName)) |
182 for directory in glob(str(pubmlst_dir + str(species_name + '_*'))): | 183 for directory in glob(str(pubmlst_dir + str(species_name + '_*'))): |
183 utils.removeDirectory(directory) | 184 utils.remove_directory(directory) |
184 os.makedirs(outDit) | 185 os.makedirs(outDit) |
185 else: | 186 else: |
186 os.makedirs(outDit) | 187 os.makedirs(outDit) |
187 | 188 |
188 contentProfile = urllib2.urlopen(URL[0]) | 189 contentProfile = urllib.request.urlopen(URL[0]) |
189 profileFile = csv.reader(contentProfile, delimiter='\t') | 190 header = next(contentProfile).decode("utf8").strip().split('\t') # skip header |
190 header = profileFile.next() # skip header | |
191 try: | 191 try: |
192 indexCC = header.index('clonal_complex') if 'clonal_complex' in header else header.index('CC') | 192 indexCC = header.index('clonal_complex') if 'clonal_complex' in header else header.index('CC') |
193 except: | 193 except: |
194 indexCC = len(header) | 194 indexCC = len(header) |
195 lociOrder = header[1:indexCC] | 195 lociOrder = header[1:indexCC] |
196 for row in profileFile: | 196 for row in contentProfile: |
197 row = row.decode("utf8").strip().split('\t') | |
197 ST = row[0] | 198 ST = row[0] |
198 alleles = ','.join(row[1:indexCC]) | 199 alleles = ','.join(row[1:indexCC]) |
199 STdict[alleles] = ST | 200 STdict[alleles] = ST |
200 for lociName, lociURL in URL[1].items(): | 201 for lociName, lociURL in list(URL[1].items()): |
201 if lociName not in SequenceDict.keys(): | 202 if lociName not in list(SequenceDict.keys()): |
202 SequenceDict[lociName] = {} | 203 SequenceDict[lociName] = {} |
203 url_file = os.path.join(outDit, lociURL.rsplit('/', 1)[1]) | 204 url_file = os.path.join(outDit, lociURL.rsplit('/', 1)[1]) |
204 urllib.urlretrieve(lociURL, url_file) | 205 urllib.request.urlretrieve(lociURL, url_file) |
205 sequences, ignore, ignore = rematch_module.get_sequence_information(url_file, 0) | 206 sequences, ignore, ignore = rematch_module.get_sequence_information(url_file, 0) |
206 for key in sequences.keys(): | 207 for key in list(sequences.keys()): |
207 header = re.sub("\D", "", sequences[key]['header']) | 208 header = re.sub("\D", "", sequences[key]['header']) |
208 sequence = sequences[key]['sequence'].upper() | 209 sequence = sequences[key]['sequence'].upper() |
209 SequenceDict[lociName][sequence] = header | 210 SequenceDict[lociName][sequence] = header |
210 if lociName not in mlst_sequences.keys(): | 211 if lociName not in list(mlst_sequences.keys()): |
211 mlst_sequences[lociName] = sequence | 212 mlst_sequences[lociName] = sequence |
212 os.remove(url_file) | 213 os.remove(url_file) |
213 mlst_dicts = [SequenceDict, STdict, lociOrder] | 214 mlst_dicts = [SequenceDict, STdict, lociOrder] |
214 utils.saveVariableToPickle(mlst_dicts, outDit, schema_date) | 215 utils.save_variable_to_pickle(mlst_dicts, outDit, schema_date) |
215 return mlst_dicts, mlst_sequences | 216 return mlst_dicts, mlst_sequences |