Mercurial > repos > cstrittmatter > test_eurl_vtec_wgs_pt
view scripts/ReMatCh/modules/checkMLST.py @ 0:965517909457 draft
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author | cstrittmatter |
---|---|
date | Wed, 22 Jan 2020 08:41:44 -0500 |
parents | |
children | 0cbed1c0a762 |
line wrap: on
line source
import sys import os import urllib2 import urllib import csv from glob import glob import re import functools try: import xml.etree.cElementTree as ET except ImportError: import xml.etree.ElementTree as ET import utils import rematch_module def determine_species(species): species = species.lower().split(' ') if len(species) >= 2: species = species[:2] if species[1] in ('spp', 'spp.', 'complex'): species = [species[0]] return species def check_existing_schema(species, schema_number, script_path): species = determine_species(species) if schema_number is None: schema_number = '' else: schema_number = '#' + str(schema_number) mlst_schemas_folder = os.path.join(os.path.dirname(script_path), 'modules', 'mlst_schemas', '') reference = [] files = [f for f in os.listdir(mlst_schemas_folder) if not f.startswith('.') and os.path.isfile(os.path.join(mlst_schemas_folder, f))] for file_found in files: file_path = os.path.join(mlst_schemas_folder, file_found) if file_found.startswith('_'.join(species) + schema_number) and file_found.endswith('.fasta'): reference = file_path if len(reference) > 1: if schema_number == '': schema_number = '#1' for scheme in reference: if os.path.splitext(scheme)[0].endswith(schema_number): reference = [scheme] break if len(reference) == 0: reference = None elif len(reference) == 1: reference = reference[0] return reference def write_mlst_reference(species, mlst_sequences, outdir, time_str): print 'Writing MLST alleles as reference_sequences' + '\n' reference_file = os.path.join(outdir, str(species.replace('/', '_').replace(' ', '_') + '.' + time_str + '.fasta')) with open(reference_file, 'wt') as writer: for header, sequence in mlst_sequences.items(): writer.write('>' + header + '\n') fasta_sequence_lines = rematch_module.chunkstring(sequence, 80) for line in fasta_sequence_lines: writer.write(line + '\n') return reference_file def getST(mlst_dicts, dict_sequences): SequenceDict = mlst_dicts[0] STdict = mlst_dicts[1] lociOrder = mlst_dicts[2] alleles_profile = ['-'] * len(lociOrder) for x, sequence_data in dict_sequences.items(): if sequence_data['header'] not in SequenceDict: print sequence_data['header'] + ' not found between consensus sequences!' break if sequence_data['sequence'] in SequenceDict[sequence_data['header']].keys(): allele_number = SequenceDict[sequence_data['header']][sequence_data['sequence']] alleles_profile[lociOrder.index(sequence_data['header'])] = allele_number else: for sequence_st, allele_number in SequenceDict[sequence_data['header']].items(): if sequence_st in sequence_data['sequence']: alleles_profile[lociOrder.index(sequence_data['header'])] = allele_number alleles_profile = ','.join(alleles_profile) st = '-' if alleles_profile in STdict: st = STdict[alleles_profile] return st, alleles_profile downloadPubMLST = functools.partial(utils.timer, name='Download PubMLST module') @downloadPubMLST def downloadPubMLSTxml(originalSpecies, schema_number, outdir): print 'Searching MLST database for ' + originalSpecies xmlURL = 'http://pubmlst.org/data/dbases.xml' try: content = urllib2.urlopen(xmlURL) xml = content.read() tree = ET.fromstring(xml) except: print "Ooops! There might be a problem with the PubMLST service, try later or check if the xml is well formated at " + xmlURL raise xmlData = {} if schema_number is None: schema_number = 1 success = 0 for scheme in tree.findall('species'): species_scheme = scheme.text.splitlines()[0].rsplit('#', 1) number_scheme = species_scheme[1] if len(species_scheme) == 2 else 1 species_scheme = species_scheme[0] if determine_species(species_scheme) == determine_species(originalSpecies): if schema_number == number_scheme: success += 1 xmlData[scheme.text.strip()] = {} for info in scheme: # mlst for database in info: # database for retrievedDate in database.findall('retrieved'): retrieved = retrievedDate.text xmlData[scheme.text.strip()][retrieved] = [] for profile in database.findall('profiles'): profileURl = profile.find('url').text xmlData[scheme.text.strip()][retrieved].append(profileURl) for lociScheme in database.findall('loci'): loci = {} for locus in lociScheme: locusID = locus.text for locusInfo in locus: locusUrl = locusInfo.text loci[locusID.strip()] = locusUrl xmlData[scheme.text.strip()][retrieved].append(loci) if success == 0: sys.exit("\tError. No schema found for %s. Please refer to https://pubmlst.org/databases/" % (originalSpecies)) elif success > 1: keys = xmlData.keys() keys = sorted(keys) print "\tWarning. More than one schema found for %s. only keeping the first one... %s" % (originalSpecies, keys[0]) for key in keys[1:]: del xmlData[key] pubmlst_dir = os.path.join(outdir, 'pubmlst', '') if not os.path.isdir(pubmlst_dir): os.makedirs(pubmlst_dir) for SchemaName, info in xmlData.items(): STdict = {} SequenceDict = {} mlst_sequences = {} species_name = '_'.join(determine_species(SchemaName)).replace('/', '_') for RetrievalDate, URL in info.items(): schema_date = species_name + '_' + RetrievalDate outDit = os.path.join(pubmlst_dir, schema_date) # compatible with windows? See if it already exists, if so, break if os.path.isdir(outDit): pickle = os.path.join(outDit, str(schema_date + '.pkl')) if os.path.isfile(pickle): print "\tschema files already exist for %s" % (SchemaName) mlst_dicts = utils.extractVariableFromPickle(pickle) SequenceDict = mlst_dicts[0] for lociName, alleleSequences in SequenceDict.items(): for sequence in alleleSequences: if lociName not in mlst_sequences.keys(): mlst_sequences[lociName] = sequence else: break return mlst_dicts, mlst_sequences elif any(species_name in x for x in os.listdir(pubmlst_dir)): print "Older version of %s's scheme found! Deleting..." % (SchemaName) for directory in glob(str(pubmlst_dir + str(species_name + '_*'))): utils.removeDirectory(directory) os.makedirs(outDit) else: os.makedirs(outDit) contentProfile = urllib2.urlopen(URL[0]) profileFile = csv.reader(contentProfile, delimiter='\t') header = profileFile.next() # skip header try: indexCC = header.index('clonal_complex') if 'clonal_complex' in header else header.index('CC') except: indexCC = len(header) lociOrder = header[1:indexCC] for row in profileFile: ST = row[0] alleles = ','.join(row[1:indexCC]) STdict[alleles] = ST for lociName, lociURL in URL[1].items(): if lociName not in SequenceDict.keys(): SequenceDict[lociName] = {} url_file = os.path.join(outDit, lociURL.rsplit('/', 1)[1]) urllib.urlretrieve(lociURL, url_file) sequences, ignore, ignore = rematch_module.get_sequence_information(url_file, 0) for key in sequences.keys(): header = re.sub("\D", "", sequences[key]['header']) sequence = sequences[key]['sequence'].upper() SequenceDict[lociName][sequence] = header if lociName not in mlst_sequences.keys(): mlst_sequences[lociName] = sequence os.remove(url_file) mlst_dicts = [SequenceDict, STdict, lociOrder] utils.saveVariableToPickle(mlst_dicts, outDit, schema_date) return mlst_dicts, mlst_sequences