Mercurial > repos > cstrittmatter > test_eurl_vtec_wgs_pt
comparison scripts/ReMatCh/modules/checkMLST.py @ 3:0cbed1c0a762 draft default tip
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
| author | cstrittmatter |
|---|---|
| date | Tue, 28 Jan 2020 10:42:31 -0500 |
| parents | 965517909457 |
| children |
comparison legend: equal · deleted · inserted · replaced
| 2:6837f733b4aa | 3:0cbed1c0a762 |
|---|---|
| 1 import sys | 1 import sys |
| 2 import os | 2 import os |
| 3 import urllib2 | 3 import urllib.request |
| 4 import urllib | |
| 5 import csv | 4 import csv |
| 6 from glob import glob | 5 from glob import glob |
| 7 import re | 6 import re |
| 8 import functools | 7 import functools |
| 9 try: | 8 try: |
| 10 import xml.etree.cElementTree as ET | 9 import xml.etree.cElementTree as ET |
| 11 except ImportError: | 10 except ImportError: |
| 12 import xml.etree.ElementTree as ET | 11 import xml.etree.ElementTree as ET |
| 13 import utils | 12 from . import utils |
| 14 import rematch_module | 13 from . import rematch_module |
| 15 | 14 |
| 16 | 15 |
| 17 def determine_species(species): | 16 def determine_species(species): |
| 18 species = species.lower().split(' ') | 17 species = species.lower().split(' ') |
| 19 | 18 |
| 54 reference = reference[0] | 53 reference = reference[0] |
| 55 return reference | 54 return reference |
| 56 | 55 |
| 57 | 56 |
| 58 def write_mlst_reference(species, mlst_sequences, outdir, time_str): | 57 def write_mlst_reference(species, mlst_sequences, outdir, time_str): |
| 59 print 'Writing MLST alleles as reference_sequences' + '\n' | 58 print('Writing MLST alleles as reference_sequences' + '\n') |
| 60 reference_file = os.path.join(outdir, str(species.replace('/', '_').replace(' ', '_') + '.' + time_str + '.fasta')) | 59 reference_file = os.path.join(outdir, str(species.replace('/', '_').replace(' ', '_') + '.' + time_str + '.fasta')) |
| 61 with open(reference_file, 'wt') as writer: | 60 with open(reference_file, 'wt') as writer: |
| 62 for header, sequence in mlst_sequences.items(): | 61 for header, sequence in list(mlst_sequences.items()): |
| 63 writer.write('>' + header + '\n') | 62 writer.write('>' + header + '\n') |
| 64 fasta_sequence_lines = rematch_module.chunkstring(sequence, 80) | 63 fasta_sequence_lines = rematch_module.chunkstring(sequence, 80) |
| 65 for line in fasta_sequence_lines: | 64 for line in fasta_sequence_lines: |
| 66 writer.write(line + '\n') | 65 writer.write(line + '\n') |
| 67 return reference_file | 66 return reference_file |
| 68 | 67 |
| 69 | 68 |
| 70 def getST(mlst_dicts, dict_sequences): | 69 def get_st(mlst_dicts, dict_sequences): |
| 71 SequenceDict = mlst_dicts[0] | 70 SequenceDict = mlst_dicts[0] |
| 72 STdict = mlst_dicts[1] | 71 STdict = mlst_dicts[1] |
| 73 lociOrder = mlst_dicts[2] | 72 lociOrder = mlst_dicts[2] |
| 74 | 73 |
| 75 alleles_profile = ['-'] * len(lociOrder) | 74 alleles_profile = ['-'] * len(lociOrder) |
| 76 for x, sequence_data in dict_sequences.items(): | 75 for x, sequence_data in list(dict_sequences.items()): |
| 77 if sequence_data['header'] not in SequenceDict: | 76 if sequence_data['header'] not in SequenceDict: |
| 78 print sequence_data['header'] + ' not found between consensus sequences!' | 77 print(sequence_data['header'] + ' not found between consensus sequences!') |
| 79 break | 78 break |
| 80 if sequence_data['sequence'] in SequenceDict[sequence_data['header']].keys(): | 79 if sequence_data['sequence'] in list(SequenceDict[sequence_data['header']].keys()): |
| 81 allele_number = SequenceDict[sequence_data['header']][sequence_data['sequence']] | 80 allele_number = SequenceDict[sequence_data['header']][sequence_data['sequence']] |
| 82 alleles_profile[lociOrder.index(sequence_data['header'])] = allele_number | 81 alleles_profile[lociOrder.index(sequence_data['header'])] = allele_number |
| 83 else: | 82 else: |
| 84 for sequence_st, allele_number in SequenceDict[sequence_data['header']].items(): | 83 for sequence_st, allele_number in list(SequenceDict[sequence_data['header']].items()): |
| 85 if sequence_st in sequence_data['sequence']: | 84 if sequence_st in sequence_data['sequence']: |
| 86 alleles_profile[lociOrder.index(sequence_data['header'])] = allele_number | 85 alleles_profile[lociOrder.index(sequence_data['header'])] = allele_number |
| 87 | 86 |
| 88 alleles_profile = ','.join(alleles_profile) | 87 alleles_profile = ','.join(alleles_profile) |
| 89 st = '-' | 88 st = '-' |
| 95 | 94 |
| 96 downloadPubMLST = functools.partial(utils.timer, name='Download PubMLST module') | 95 downloadPubMLST = functools.partial(utils.timer, name='Download PubMLST module') |
| 97 | 96 |
| 98 | 97 |
| 99 @downloadPubMLST | 98 @downloadPubMLST |
| 100 def downloadPubMLSTxml(originalSpecies, schema_number, outdir): | 99 def download_pub_mlst_xml(originalSpecies, schema_number, outdir): |
| 101 print 'Searching MLST database for ' + originalSpecies | 100 print('Searching MLST database for ' + originalSpecies) |
| 102 | 101 |
| 103 xmlURL = 'http://pubmlst.org/data/dbases.xml' | 102 xmlURL = 'http://pubmlst.org/data/dbases.xml' |
| 104 try: | 103 try: |
| 105 content = urllib2.urlopen(xmlURL) | 104 content = urllib.request.urlopen(xmlURL) |
| 106 xml = content.read() | 105 xml = content.read() |
| 107 tree = ET.fromstring(xml) | 106 tree = ET.fromstring(xml) |
| 108 except: | 107 except: |
| 109 print "Ooops! There might be a problem with the PubMLST service, try later or check if the xml is well formated at " + xmlURL | 108 print("Ooops! There might be a problem with the PubMLST service, try later or check if the xml is well formated" |
| 109 " at " + xmlURL) | |
| 110 raise | 110 raise |
| 111 | 111 |
| 112 xmlData = {} | 112 xmlData = {} |
| 113 | 113 |
| 114 if schema_number is None: | 114 if schema_number is None: |
| 115 schema_number = 1 | 115 schema_number = 1 |
| 116 | 116 |
| 117 success = 0 | 117 success = 0 |
| 118 for scheme in tree.findall('species'): | 118 for scheme in tree.findall('species'): |
| 119 species_scheme = scheme.text.splitlines()[0].rsplit('#', 1) | 119 species_scheme = scheme.text.rstrip('\r\n').rsplit('#', 1) |
| 120 number_scheme = species_scheme[1] if len(species_scheme) == 2 else 1 | 120 number_scheme = species_scheme[1] if len(species_scheme) == 2 else 1 |
| 121 species_scheme = species_scheme[0] | 121 species_scheme = species_scheme[0] |
| 122 if determine_species(species_scheme) == determine_species(originalSpecies): | 122 if determine_species(species_scheme) == determine_species(originalSpecies): |
| 123 if schema_number == number_scheme: | 123 if schema_number == number_scheme: |
| 124 success += 1 | 124 success += 1 |
| 140 loci[locusID.strip()] = locusUrl | 140 loci[locusID.strip()] = locusUrl |
| 141 xmlData[scheme.text.strip()][retrieved].append(loci) | 141 xmlData[scheme.text.strip()][retrieved].append(loci) |
| 142 if success == 0: | 142 if success == 0: |
| 143 sys.exit("\tError. No schema found for %s. Please refer to https://pubmlst.org/databases/" % (originalSpecies)) | 143 sys.exit("\tError. No schema found for %s. Please refer to https://pubmlst.org/databases/" % (originalSpecies)) |
| 144 elif success > 1: | 144 elif success > 1: |
| 145 keys = xmlData.keys() | 145 keys = list(xmlData.keys()) |
| 146 keys = sorted(keys) | 146 keys = sorted(keys) |
| 147 print "\tWarning. More than one schema found for %s. only keeping the first one... %s" % (originalSpecies, keys[0]) | 147 print("\tWarning. More than one schema found for %s. only keeping the first" |
| 148 " one... %s" % (originalSpecies, keys[0])) | |
| 148 for key in keys[1:]: | 149 for key in keys[1:]: |
| 149 del xmlData[key] | 150 del xmlData[key] |
| 150 | 151 |
| 151 pubmlst_dir = os.path.join(outdir, 'pubmlst', '') | 152 pubmlst_dir = os.path.join(outdir, 'pubmlst', '') |
| 152 if not os.path.isdir(pubmlst_dir): | 153 if not os.path.isdir(pubmlst_dir): |
| 153 os.makedirs(pubmlst_dir) | 154 os.makedirs(pubmlst_dir) |
| 154 | 155 |
| 155 for SchemaName, info in xmlData.items(): | 156 for SchemaName, info in list(xmlData.items()): |
| 156 STdict = {} | 157 STdict = {} |
| 157 SequenceDict = {} | 158 SequenceDict = {} |
| 158 mlst_sequences = {} | 159 mlst_sequences = {} |
| 159 | 160 |
| 160 species_name = '_'.join(determine_species(SchemaName)).replace('/', '_') | 161 species_name = '_'.join(determine_species(SchemaName)).replace('/', '_') |
| 161 | 162 |
| 162 for RetrievalDate, URL in info.items(): | 163 for RetrievalDate, URL in list(info.items()): |
| 163 schema_date = species_name + '_' + RetrievalDate | 164 schema_date = species_name + '_' + RetrievalDate |
| 164 outDit = os.path.join(pubmlst_dir, schema_date) # compatible with windows? See if it already exists, if so, break | 165 outDit = os.path.join(pubmlst_dir, schema_date) # compatible with windows? See if it already exists, if so, break |
| 165 | 166 |
| 166 if os.path.isdir(outDit): | 167 if os.path.isdir(outDit): |
| 167 pickle = os.path.join(outDit, str(schema_date + '.pkl')) | 168 pickle = os.path.join(outDit, str(schema_date + '.pkl')) |
| 168 if os.path.isfile(pickle): | 169 if os.path.isfile(pickle): |
| 169 print "\tschema files already exist for %s" % (SchemaName) | 170 print("\tschema files already exist for %s" % (SchemaName)) |
| 170 mlst_dicts = utils.extractVariableFromPickle(pickle) | 171 mlst_dicts = utils.extract_variable_from_pickle(pickle) |
| 171 SequenceDict = mlst_dicts[0] | 172 SequenceDict = mlst_dicts[0] |
| 172 for lociName, alleleSequences in SequenceDict.items(): | 173 for lociName, alleleSequences in list(SequenceDict.items()): |
| 173 for sequence in alleleSequences: | 174 for sequence in alleleSequences: |
| 174 if lociName not in mlst_sequences.keys(): | 175 if lociName not in list(mlst_sequences.keys()): |
| 175 mlst_sequences[lociName] = sequence | 176 mlst_sequences[lociName] = sequence |
| 176 else: | 177 else: |
| 177 break | 178 break |
| 178 return mlst_dicts, mlst_sequences | 179 return mlst_dicts, mlst_sequences |
| 179 | 180 |
| 180 elif any(species_name in x for x in os.listdir(pubmlst_dir)): | 181 elif any(species_name in x for x in os.listdir(pubmlst_dir)): |
| 181 print "Older version of %s's scheme found! Deleting..." % (SchemaName) | 182 print("Older version of %s's scheme found! Deleting..." % (SchemaName)) |
| 182 for directory in glob(str(pubmlst_dir + str(species_name + '_*'))): | 183 for directory in glob(str(pubmlst_dir + str(species_name + '_*'))): |
| 183 utils.removeDirectory(directory) | 184 utils.remove_directory(directory) |
| 184 os.makedirs(outDit) | 185 os.makedirs(outDit) |
| 185 else: | 186 else: |
| 186 os.makedirs(outDit) | 187 os.makedirs(outDit) |
| 187 | 188 |
| 188 contentProfile = urllib2.urlopen(URL[0]) | 189 contentProfile = urllib.request.urlopen(URL[0]) |
| 189 profileFile = csv.reader(contentProfile, delimiter='\t') | 190 header = next(contentProfile).decode("utf8").strip().split('\t') # skip header |
| 190 header = profileFile.next() # skip header | |
| 191 try: | 191 try: |
| 192 indexCC = header.index('clonal_complex') if 'clonal_complex' in header else header.index('CC') | 192 indexCC = header.index('clonal_complex') if 'clonal_complex' in header else header.index('CC') |
| 193 except: | 193 except: |
| 194 indexCC = len(header) | 194 indexCC = len(header) |
| 195 lociOrder = header[1:indexCC] | 195 lociOrder = header[1:indexCC] |
| 196 for row in profileFile: | 196 for row in contentProfile: |
| 197 row = row.decode("utf8").strip().split('\t') | |
| 197 ST = row[0] | 198 ST = row[0] |
| 198 alleles = ','.join(row[1:indexCC]) | 199 alleles = ','.join(row[1:indexCC]) |
| 199 STdict[alleles] = ST | 200 STdict[alleles] = ST |
| 200 for lociName, lociURL in URL[1].items(): | 201 for lociName, lociURL in list(URL[1].items()): |
| 201 if lociName not in SequenceDict.keys(): | 202 if lociName not in list(SequenceDict.keys()): |
| 202 SequenceDict[lociName] = {} | 203 SequenceDict[lociName] = {} |
| 203 url_file = os.path.join(outDit, lociURL.rsplit('/', 1)[1]) | 204 url_file = os.path.join(outDit, lociURL.rsplit('/', 1)[1]) |
| 204 urllib.urlretrieve(lociURL, url_file) | 205 urllib.request.urlretrieve(lociURL, url_file) |
| 205 sequences, ignore, ignore = rematch_module.get_sequence_information(url_file, 0) | 206 sequences, ignore, ignore = rematch_module.get_sequence_information(url_file, 0) |
| 206 for key in sequences.keys(): | 207 for key in list(sequences.keys()): |
| 207 header = re.sub("\D", "", sequences[key]['header']) | 208 header = re.sub("\D", "", sequences[key]['header']) |
| 208 sequence = sequences[key]['sequence'].upper() | 209 sequence = sequences[key]['sequence'].upper() |
| 209 SequenceDict[lociName][sequence] = header | 210 SequenceDict[lociName][sequence] = header |
| 210 if lociName not in mlst_sequences.keys(): | 211 if lociName not in list(mlst_sequences.keys()): |
| 211 mlst_sequences[lociName] = sequence | 212 mlst_sequences[lociName] = sequence |
| 212 os.remove(url_file) | 213 os.remove(url_file) |
| 213 mlst_dicts = [SequenceDict, STdict, lociOrder] | 214 mlst_dicts = [SequenceDict, STdict, lociOrder] |
| 214 utils.saveVariableToPickle(mlst_dicts, outDit, schema_date) | 215 utils.save_variable_to_pickle(mlst_dicts, outDit, schema_date) |
| 215 return mlst_dicts, mlst_sequences | 216 return mlst_dicts, mlst_sequences |
