Mercurial > repos > ulfschaefer > data_manager_phemost
view data_manager/fetch_mlst_data.py @ 0:25d4d9f313a0 draft default tip
author:  ulfschaefer
date:    Wed, 13 Jul 2016 05:50:48 -0400
summary: Uploaded
#!/usr/bin/env python
'''
Download MLST datasets from this site: http://pubmlst.org/data/
by parsing an xml file (http://pubmlst.org/data/dbases.xml).

Data is downloaded for a species determined by the user:
- profiles (maps STs to allele numbers)
- numbered sequences for each locus in the scheme

In addition, the alleles are concatenated together for use with SRST2.

A log file is also generated in the working directory, detailing the time,
date and location of all files downloaded, as well as the <retrieved> tag,
which tells us when the XML entry was last updated.

If the species name input by the user matches multiple <species> in the xml
file, the script simply reports the possible matches so the user can try again.
'''

"""
- Remove the empty line at the end of the profiles.txt file.
- Ensure the allele names in the profiles.txt file don't contain "_".
"""

from argparse import ArgumentParser
import xml.dom.minidom as xml
import urllib2 as url
import re
import os
import sys
import glob
import csv
import shutil
from urlparse import urlparse
import time
import subprocess
from json import dumps
from json import loads

# --------------------------------------------------------------------------------------------------

def parse_args():
    parser = ArgumentParser(description='Download MLST datasets by species '
                                        'from pubmlst.org.')

    parser.add_argument('--repository_url',
                        metavar='URL',
                        default='http://pubmlst.org/data/dbases.xml',
                        help='URL for MLST repository XML index')

    parser.add_argument('--species',
                        metavar='NAME',
                        required=True,
                        help='The name of the species that you want to download (e.g. "Escherichia coli")')

    parser.add_argument('--outfile',
                        metavar='FILE',
                        required=True,
                        help='The name of the Json file to write that galaxy stuff to.')

    parser.add_argument('--reference',
                        metavar='ACCESSION',
                        required=True,
                        help='NCBI accession number of the reference genome to use for flanking regions.')

    return parser.parse_args()

# --------------------------------------------------------------------------------------------------

def main():
    """
    Example of the XML structure this script parses:

    <species>
      Achromobacter spp.
      <mlst>
        <database>
          <url>http://pubmlst.org/achromobacter</url>
          <retrieved>2015-08-11</retrieved>
          <profiles>
            <count>272</count>
            <url>http://pubmlst.org/data/profiles/achromobacter.txt</url>
          </profiles>
          <loci>
            <locus>
              nusA
              <url>
                http://pubmlst.org/data/alleles/achromobacter/nusA.tfa
              </url>
            </locus>
            <locus>
              rpoB
              <url>
                http://pubmlst.org/data/alleles/achromobacter/rpoB.tfa
              </url>
            </locus>
    """
    args = parse_args()

    docFile = url.urlopen(args.repository_url)  # by default http://pubmlst.org/data/dbases.xml

    doc = xml.parse(docFile)
    root = doc.childNodes[0]

    found_species = []

    # pubMLST hosts more than one scheme for these species; default to scheme #1
    if args.species == "Escherichia coli":
        args.species = "Escherichia coli#1"
    elif args.species == "Acinetobacter baumannii":
        args.species = "Acinetobacter baumannii#1"
    elif args.species == "Pasteurella multocida":
        args.species = "Pasteurella multocida#1"
    else:
        pass

    for species_node in root.getElementsByTagName('species'):
        info = getSpeciesInfo(species_node, args.species)
        if info is not None:
            found_species.append(info)

    if len(found_species) == 0:
        sys.stderr.write("No species matched your query.\n")
        exit(1)

    if len(found_species) > 1:
        sys.stderr.write("The following %i species match your query, please be more specific:\n"
                         % (len(found_species)))
        for info in found_species:
            sys.stderr.write(info.name + '\n')
        exit(2)

    # output information for the single matching species
    assert len(found_species) == 1
    species_info = found_species[0]
    species_name_underscores = species_info.name.replace(' ', '_')

    timestamp = time.strftime("%Y%m%d%H%M%S")

    # Galaxy provides a JSON parameter file; the data goes into that dataset's extra_files_path
    params = loads(open(args.outfile).read())
    folder = os.path.join(params['output_data'][0]['extra_files_path'],
                          species_name_underscores,
                          timestamp)
    if not os.path.isdir(folder):
        os.makedirs(folder)

    # download the ST profiles, keeping only the first eight columns (ST plus the allele numbers)
    profile_doc = url.urlopen(species_info.profiles_url)
    with open(os.path.join(folder, 'profiles.txt'), 'w') as f:
        sys.stdout.write("Writing to %s\n" % (os.path.join(folder, 'profiles.txt')))
        for line in profile_doc.readlines():
            cols = line.split("\t")
            f.write("%s\n" % ('\t'.join(cols[0:8])))
    profile_doc.close()

    # download the allele sequences for each locus; underscores in the FASTA
    # content are replaced with "-" (see the note at the top of this file)
    for locus in species_info.loci:
        locus_path = urlparse(locus.url).path
        locus_filename = locus_path.split('/')[-1]
        locus_filename = locus_filename.replace("_.tfa", ".fas")
        locus_filename = locus_filename.replace("tfa", "fas")
        locus_doc = url.urlopen(locus.url)
        with open(os.path.join(folder, locus_filename), 'w') as locus_file:
            locus_fasta_content = locus_doc.read()
            locus_fasta_content = locus_fasta_content.replace("_", "-").replace("--", "-")
            sys.stdout.write("Writing to %s\n" % (os.path.join(folder, locus_filename)))
            locus_file.write(locus_fasta_content)
        locus_doc.close()

    get_reference(folder, args.reference)

    # do Galaxy stuff
    data_manager_dict = {}
    data_manager_dict['data_tables'] = {}
    name = "%s-%s" % (species_info.name, timestamp)
    data_manager_dict['data_tables']['mlst_data'] = [dict(value=species_name_underscores,
                                                          dbkey=species_name_underscores,
                                                          name=name,
                                                          time_stamp=timestamp,
                                                          file_path=folder)]

    # save info to json file
    with open(args.outfile, 'wb') as fjson:
        fjson.write(dumps(data_manager_dict))

# end of main ---------------------------------------------------------------------------------------

def get_reference(folder, acc):

    # We're getting this file from Japan!
    # It seems to work pretty well until they take down or change their website.
    # See: http://www.ncbi.nlm.nih.gov/pubmed/20472643
    refurl = 'http://togows.dbcls.jp/entry/ncbi-nucleotide/%s.fasta' % (acc)
    remote_ref = url.urlopen(refurl)
    ref_filename = os.path.join(folder, 'reference.seq')
    with open(ref_filename, 'wb') as fRef:
        fRef.write(remote_ref.read())
    remote_ref.close()

    # index the downloaded reference for BLAST
    cmd = "makeblastdb -in %s -dbtype nucl -out %s" \
          % (ref_filename, ref_filename.replace("reference.seq", "reference"))
    p = subprocess.Popen(cmd,
                         shell=True,
                         stdin=None,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         close_fds=True)
    p.wait()

    return

# --------------------------------------------------------------------------------------------------
# test if a node is an Element and that it has a specific tag name

def testElementTag(node, name):
    return node.nodeType == node.ELEMENT_NODE and node.localName == name

# --------------------------------------------------------------------------------------------------
# Get the text from an element node with a text node child

def getText(element):
    result = ''
    for node in element.childNodes:
        if node.nodeType == node.TEXT_NODE:
            result += node.data
    return normaliseText(result)

# --------------------------------------------------------------------------------------------------
# remove unwanted whitespace including linebreaks etc.

def normaliseText(text):
    return ' '.join(text.split())

# --------------------------------------------------------------------------------------------------
# A collection of interesting information about a taxon

class SpeciesInfo(object):
    def __init__(self):
        self.name = None  # String name of species
        self.database_url = None  # URL as string
        self.retrieved = None  # date as string
        self.profiles_url = None  # URL as string
        self.profiles_count = None  # positive integer
        self.loci = []  # list of loci

    def __str__(self):
        s = "Name: %s\n" % self.name
        s += "Database URL: %s\n" % self.database_url
        s += "Retrieved: %s\n" % self.retrieved
        s += "Profiles URL: %s\n" % self.profiles_url
        s += "Profiles count: %s\n" % self.profiles_count
        s += "Loci: %s\n" % (','.join([str(x) for x in self.loci]))
        return s

# --------------------------------------------------------------------------------------------------

class LocusInfo(object):
    def __init__(self):
        self.url = None
        self.name = None

    def __str__(self):
        return "Locus: name:%s,url:%s" % (self.name, self.url)

# --------------------------------------------------------------------------------------------------
# retrieve the interesting information for a given species element

def getSpeciesInfo(species_node, species):
    this_name = getText(species_node)
    print this_name
    if this_name.startswith(species):
        info = SpeciesInfo()
        info.name = this_name
        for mlst_node in species_node.getElementsByTagName('mlst'):
            for database_node in mlst_node.getElementsByTagName('database'):
                for database_child_node in database_node.childNodes:
                    if testElementTag(database_child_node, 'url'):
                        info.database_url = getText(database_child_node)
                    elif testElementTag(database_child_node, 'retrieved'):
                        info.retrieved = getText(database_child_node)
                    elif testElementTag(database_child_node, 'profiles'):
                        for profile_count in database_child_node.getElementsByTagName('count'):
                            info.profiles_count = getText(profile_count)
                        for profile_url in database_child_node.getElementsByTagName('url'):
                            info.profiles_url = getText(profile_url)
                    elif testElementTag(database_child_node, 'loci'):
                        for locus_node in database_child_node.getElementsByTagName('locus'):
                            locus_info = LocusInfo()
                            locus_info.name = getText(locus_node)
                            for locus_url in locus_node.getElementsByTagName('url'):
                                locus_info.url = getText(locus_url)
                            info.loci.append(locus_info)
        return info
    else:
        return None

# --------------------------------------------------------------------------------------------------

if __name__ == '__main__':
    main()
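
For reference, below is a minimal sketch of how this Galaxy data manager is driven. Galaxy normally writes the JSON parameter file and invokes the script itself; the species name, the accession number and the file names here are placeholder values chosen for illustration, and the sketch assumes network access and makeblastdb on the PATH.

    import json
    import subprocess
    import tempfile

    work_dir = tempfile.mkdtemp()

    # Galaxy would normally write this parameter file; the script only reads
    # output_data[0]['extra_files_path'] from it to decide where to put the data.
    params = {'output_data': [{'extra_files_path': work_dir}]}
    with open('mlst_params.json', 'w') as fp:
        json.dump(params, fp)

    # Run the fetcher (species and accession are placeholder values).
    subprocess.check_call(['python', 'fetch_mlst_data.py',
                           '--species', 'Achromobacter',
                           '--reference', 'NC_000000',
                           '--outfile', 'mlst_params.json'])

    # The script overwrites the JSON file with the entry it adds to the
    # 'mlst_data' data table (value, dbkey, name, time_stamp, file_path).
    with open('mlst_params.json') as fp:
        print(json.load(fp)['data_tables']['mlst_data'])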